1package uniseg
2
3import "unicode/utf8"
4
// The states of the line break parser. They are numbered consecutively from
// zero and all stay below lbZWJBit, so the flag bits declared further down can
// be OR-ed into a state and stripped off again without losing the state.
const (
	lbAny = iota // Also serves as the "any state" wildcard in lbTransitions.
	lbBK
	lbCR
	lbLF
	lbNL
	lbSP
	lbZW
	lbWJ
	lbGL
	lbBA
	lbHY
	lbCL
	lbCP
	lbEX
	lbIS
	lbSY
	lbOP
	lbQU
	lbQUSP // QU followed by SP.
	lbNS
	lbCLCPSP // CL or CP (also after a number) followed by SP.
	lbB2
	lbB2SP // B2 followed by SP.
	lbCB
	lbBB
	lbLB21a // HL followed by HY or BA.
	lbHL
	lbAL
	lbNU
	lbPR
	lbEB
	lbIDEM // ID or EM.
	lbNUNU // Inside a number: NU reached from a numeric state.
	lbNUSY // Inside a number: SY reached from a numeric state.
	lbNUIS // Inside a number: IS reached from a numeric state.
	lbNUCL // CL directly after a numeric state.
	lbNUCP // CP directly after a numeric state.
	lbPO
	lbJL
	lbJV
	lbJT
	lbH2
	lbH3
	lbOddRI
	lbEvenRI
	lbExtPicCn
)

// Flag bits which transitionLineBreakState combines with one of the states
// above when it returns, and removes again on the next call.
const (
	// lbZWJBit (64) is set when the code point just processed was a ZWJ; the
	// following transition is then forced to LineDontBreak (LB8a).
	lbZWJBit = 1 << 6

	// lbCPeaFWHBit (128) is set when the new state is lbCP or lbNUCP and the
	// East Asian width of the code point is not F, W, or H (LB30).
	lbCPeaFWHBit = 1 << 7
)
56
// These constants define whether a given text may be broken into the next line.
// If the break is optional (LineCanBreak), you may choose to break or not based
// on your own criteria, for example, if the text has reached the available
// width.
const (
	// LineDontBreak means you may not break the line here.
	LineDontBreak = iota

	// LineCanBreak means you may or may not break the line here.
	LineCanBreak

	// LineMustBreak means you must break the line here.
	LineMustBreak
)
66
67// lbTransitions implements the line break parser's state transitions. It's
68// anologous to [grTransitions], see comments there for details.
69//
70// Unicode version 15.0.0.
71func lbTransitions(state, prop int) (newState, lineBreak, rule int) {
72 switch uint64(state) | uint64(prop)<<32 {
73 // LB4.
74 case lbBK | prAny<<32:
75 return lbAny, LineMustBreak, 40
76
77 // LB5.
78 case lbCR | prLF<<32:
79 return lbLF, LineDontBreak, 50
80 case lbCR | prAny<<32:
81 return lbAny, LineMustBreak, 50
82 case lbLF | prAny<<32:
83 return lbAny, LineMustBreak, 50
84 case lbNL | prAny<<32:
85 return lbAny, LineMustBreak, 50
86
87 // LB6.
88 case lbAny | prBK<<32:
89 return lbBK, LineDontBreak, 60
90 case lbAny | prCR<<32:
91 return lbCR, LineDontBreak, 60
92 case lbAny | prLF<<32:
93 return lbLF, LineDontBreak, 60
94 case lbAny | prNL<<32:
95 return lbNL, LineDontBreak, 60
96
97 // LB7.
98 case lbAny | prSP<<32:
99 return lbSP, LineDontBreak, 70
100 case lbAny | prZW<<32:
101 return lbZW, LineDontBreak, 70
102
103 // LB8.
104 case lbZW | prSP<<32:
105 return lbZW, LineDontBreak, 70
106 case lbZW | prAny<<32:
107 return lbAny, LineCanBreak, 80
108
109 // LB11.
110 case lbAny | prWJ<<32:
111 return lbWJ, LineDontBreak, 110
112 case lbWJ | prAny<<32:
113 return lbAny, LineDontBreak, 110
114
115 // LB12.
116 case lbAny | prGL<<32:
117 return lbGL, LineCanBreak, 310
118 case lbGL | prAny<<32:
119 return lbAny, LineDontBreak, 120
120
121 // LB13 (simple transitions).
122 case lbAny | prCL<<32:
123 return lbCL, LineCanBreak, 310
124 case lbAny | prCP<<32:
125 return lbCP, LineCanBreak, 310
126 case lbAny | prEX<<32:
127 return lbEX, LineDontBreak, 130
128 case lbAny | prIS<<32:
129 return lbIS, LineCanBreak, 310
130 case lbAny | prSY<<32:
131 return lbSY, LineCanBreak, 310
132
133 // LB14.
134 case lbAny | prOP<<32:
135 return lbOP, LineCanBreak, 310
136 case lbOP | prSP<<32:
137 return lbOP, LineDontBreak, 70
138 case lbOP | prAny<<32:
139 return lbAny, LineDontBreak, 140
140
141 // LB15.
142 case lbQU | prSP<<32:
143 return lbQUSP, LineDontBreak, 70
144 case lbQU | prOP<<32:
145 return lbOP, LineDontBreak, 150
146 case lbQUSP | prOP<<32:
147 return lbOP, LineDontBreak, 150
148
149 // LB16.
150 case lbCL | prSP<<32:
151 return lbCLCPSP, LineDontBreak, 70
152 case lbNUCL | prSP<<32:
153 return lbCLCPSP, LineDontBreak, 70
154 case lbCP | prSP<<32:
155 return lbCLCPSP, LineDontBreak, 70
156 case lbNUCP | prSP<<32:
157 return lbCLCPSP, LineDontBreak, 70
158 case lbCL | prNS<<32:
159 return lbNS, LineDontBreak, 160
160 case lbNUCL | prNS<<32:
161 return lbNS, LineDontBreak, 160
162 case lbCP | prNS<<32:
163 return lbNS, LineDontBreak, 160
164 case lbNUCP | prNS<<32:
165 return lbNS, LineDontBreak, 160
166 case lbCLCPSP | prNS<<32:
167 return lbNS, LineDontBreak, 160
168
169 // LB17.
170 case lbAny | prB2<<32:
171 return lbB2, LineCanBreak, 310
172 case lbB2 | prSP<<32:
173 return lbB2SP, LineDontBreak, 70
174 case lbB2 | prB2<<32:
175 return lbB2, LineDontBreak, 170
176 case lbB2SP | prB2<<32:
177 return lbB2, LineDontBreak, 170
178
179 // LB18.
180 case lbSP | prAny<<32:
181 return lbAny, LineCanBreak, 180
182 case lbQUSP | prAny<<32:
183 return lbAny, LineCanBreak, 180
184 case lbCLCPSP | prAny<<32:
185 return lbAny, LineCanBreak, 180
186 case lbB2SP | prAny<<32:
187 return lbAny, LineCanBreak, 180
188
189 // LB19.
190 case lbAny | prQU<<32:
191 return lbQU, LineDontBreak, 190
192 case lbQU | prAny<<32:
193 return lbAny, LineDontBreak, 190
194
195 // LB20.
196 case lbAny | prCB<<32:
197 return lbCB, LineCanBreak, 200
198 case lbCB | prAny<<32:
199 return lbAny, LineCanBreak, 200
200
201 // LB21.
202 case lbAny | prBA<<32:
203 return lbBA, LineDontBreak, 210
204 case lbAny | prHY<<32:
205 return lbHY, LineDontBreak, 210
206 case lbAny | prNS<<32:
207 return lbNS, LineDontBreak, 210
208 case lbAny | prBB<<32:
209 return lbBB, LineCanBreak, 310
210 case lbBB | prAny<<32:
211 return lbAny, LineDontBreak, 210
212
213 // LB21a.
214 case lbAny | prHL<<32:
215 return lbHL, LineCanBreak, 310
216 case lbHL | prHY<<32:
217 return lbLB21a, LineDontBreak, 210
218 case lbHL | prBA<<32:
219 return lbLB21a, LineDontBreak, 210
220 case lbLB21a | prAny<<32:
221 return lbAny, LineDontBreak, 211
222
223 // LB21b.
224 case lbSY | prHL<<32:
225 return lbHL, LineDontBreak, 212
226 case lbNUSY | prHL<<32:
227 return lbHL, LineDontBreak, 212
228
229 // LB22.
230 case lbAny | prIN<<32:
231 return lbAny, LineDontBreak, 220
232
233 // LB23.
234 case lbAny | prAL<<32:
235 return lbAL, LineCanBreak, 310
236 case lbAny | prNU<<32:
237 return lbNU, LineCanBreak, 310
238 case lbAL | prNU<<32:
239 return lbNU, LineDontBreak, 230
240 case lbHL | prNU<<32:
241 return lbNU, LineDontBreak, 230
242 case lbNU | prAL<<32:
243 return lbAL, LineDontBreak, 230
244 case lbNU | prHL<<32:
245 return lbHL, LineDontBreak, 230
246 case lbNUNU | prAL<<32:
247 return lbAL, LineDontBreak, 230
248 case lbNUNU | prHL<<32:
249 return lbHL, LineDontBreak, 230
250
251 // LB23a.
252 case lbAny | prPR<<32:
253 return lbPR, LineCanBreak, 310
254 case lbAny | prID<<32:
255 return lbIDEM, LineCanBreak, 310
256 case lbAny | prEB<<32:
257 return lbEB, LineCanBreak, 310
258 case lbAny | prEM<<32:
259 return lbIDEM, LineCanBreak, 310
260 case lbPR | prID<<32:
261 return lbIDEM, LineDontBreak, 231
262 case lbPR | prEB<<32:
263 return lbEB, LineDontBreak, 231
264 case lbPR | prEM<<32:
265 return lbIDEM, LineDontBreak, 231
266 case lbIDEM | prPO<<32:
267 return lbPO, LineDontBreak, 231
268 case lbEB | prPO<<32:
269 return lbPO, LineDontBreak, 231
270
271 // LB24.
272 case lbAny | prPO<<32:
273 return lbPO, LineCanBreak, 310
274 case lbPR | prAL<<32:
275 return lbAL, LineDontBreak, 240
276 case lbPR | prHL<<32:
277 return lbHL, LineDontBreak, 240
278 case lbPO | prAL<<32:
279 return lbAL, LineDontBreak, 240
280 case lbPO | prHL<<32:
281 return lbHL, LineDontBreak, 240
282 case lbAL | prPR<<32:
283 return lbPR, LineDontBreak, 240
284 case lbAL | prPO<<32:
285 return lbPO, LineDontBreak, 240
286 case lbHL | prPR<<32:
287 return lbPR, LineDontBreak, 240
288 case lbHL | prPO<<32:
289 return lbPO, LineDontBreak, 240
290
291 // LB25 (simple transitions).
292 case lbPR | prNU<<32:
293 return lbNU, LineDontBreak, 250
294 case lbPO | prNU<<32:
295 return lbNU, LineDontBreak, 250
296 case lbOP | prNU<<32:
297 return lbNU, LineDontBreak, 250
298 case lbHY | prNU<<32:
299 return lbNU, LineDontBreak, 250
300 case lbNU | prNU<<32:
301 return lbNUNU, LineDontBreak, 250
302 case lbNU | prSY<<32:
303 return lbNUSY, LineDontBreak, 250
304 case lbNU | prIS<<32:
305 return lbNUIS, LineDontBreak, 250
306 case lbNUNU | prNU<<32:
307 return lbNUNU, LineDontBreak, 250
308 case lbNUNU | prSY<<32:
309 return lbNUSY, LineDontBreak, 250
310 case lbNUNU | prIS<<32:
311 return lbNUIS, LineDontBreak, 250
312 case lbNUSY | prNU<<32:
313 return lbNUNU, LineDontBreak, 250
314 case lbNUSY | prSY<<32:
315 return lbNUSY, LineDontBreak, 250
316 case lbNUSY | prIS<<32:
317 return lbNUIS, LineDontBreak, 250
318 case lbNUIS | prNU<<32:
319 return lbNUNU, LineDontBreak, 250
320 case lbNUIS | prSY<<32:
321 return lbNUSY, LineDontBreak, 250
322 case lbNUIS | prIS<<32:
323 return lbNUIS, LineDontBreak, 250
324 case lbNU | prCL<<32:
325 return lbNUCL, LineDontBreak, 250
326 case lbNU | prCP<<32:
327 return lbNUCP, LineDontBreak, 250
328 case lbNUNU | prCL<<32:
329 return lbNUCL, LineDontBreak, 250
330 case lbNUNU | prCP<<32:
331 return lbNUCP, LineDontBreak, 250
332 case lbNUSY | prCL<<32:
333 return lbNUCL, LineDontBreak, 250
334 case lbNUSY | prCP<<32:
335 return lbNUCP, LineDontBreak, 250
336 case lbNUIS | prCL<<32:
337 return lbNUCL, LineDontBreak, 250
338 case lbNUIS | prCP<<32:
339 return lbNUCP, LineDontBreak, 250
340 case lbNU | prPO<<32:
341 return lbPO, LineDontBreak, 250
342 case lbNUNU | prPO<<32:
343 return lbPO, LineDontBreak, 250
344 case lbNUSY | prPO<<32:
345 return lbPO, LineDontBreak, 250
346 case lbNUIS | prPO<<32:
347 return lbPO, LineDontBreak, 250
348 case lbNUCL | prPO<<32:
349 return lbPO, LineDontBreak, 250
350 case lbNUCP | prPO<<32:
351 return lbPO, LineDontBreak, 250
352 case lbNU | prPR<<32:
353 return lbPR, LineDontBreak, 250
354 case lbNUNU | prPR<<32:
355 return lbPR, LineDontBreak, 250
356 case lbNUSY | prPR<<32:
357 return lbPR, LineDontBreak, 250
358 case lbNUIS | prPR<<32:
359 return lbPR, LineDontBreak, 250
360 case lbNUCL | prPR<<32:
361 return lbPR, LineDontBreak, 250
362 case lbNUCP | prPR<<32:
363 return lbPR, LineDontBreak, 250
364
365 // LB26.
366 case lbAny | prJL<<32:
367 return lbJL, LineCanBreak, 310
368 case lbAny | prJV<<32:
369 return lbJV, LineCanBreak, 310
370 case lbAny | prJT<<32:
371 return lbJT, LineCanBreak, 310
372 case lbAny | prH2<<32:
373 return lbH2, LineCanBreak, 310
374 case lbAny | prH3<<32:
375 return lbH3, LineCanBreak, 310
376 case lbJL | prJL<<32:
377 return lbJL, LineDontBreak, 260
378 case lbJL | prJV<<32:
379 return lbJV, LineDontBreak, 260
380 case lbJL | prH2<<32:
381 return lbH2, LineDontBreak, 260
382 case lbJL | prH3<<32:
383 return lbH3, LineDontBreak, 260
384 case lbJV | prJV<<32:
385 return lbJV, LineDontBreak, 260
386 case lbJV | prJT<<32:
387 return lbJT, LineDontBreak, 260
388 case lbH2 | prJV<<32:
389 return lbJV, LineDontBreak, 260
390 case lbH2 | prJT<<32:
391 return lbJT, LineDontBreak, 260
392 case lbJT | prJT<<32:
393 return lbJT, LineDontBreak, 260
394 case lbH3 | prJT<<32:
395 return lbJT, LineDontBreak, 260
396
397 // LB27.
398 case lbJL | prPO<<32:
399 return lbPO, LineDontBreak, 270
400 case lbJV | prPO<<32:
401 return lbPO, LineDontBreak, 270
402 case lbJT | prPO<<32:
403 return lbPO, LineDontBreak, 270
404 case lbH2 | prPO<<32:
405 return lbPO, LineDontBreak, 270
406 case lbH3 | prPO<<32:
407 return lbPO, LineDontBreak, 270
408 case lbPR | prJL<<32:
409 return lbJL, LineDontBreak, 270
410 case lbPR | prJV<<32:
411 return lbJV, LineDontBreak, 270
412 case lbPR | prJT<<32:
413 return lbJT, LineDontBreak, 270
414 case lbPR | prH2<<32:
415 return lbH2, LineDontBreak, 270
416 case lbPR | prH3<<32:
417 return lbH3, LineDontBreak, 270
418
419 // LB28.
420 case lbAL | prAL<<32:
421 return lbAL, LineDontBreak, 280
422 case lbAL | prHL<<32:
423 return lbHL, LineDontBreak, 280
424 case lbHL | prAL<<32:
425 return lbAL, LineDontBreak, 280
426 case lbHL | prHL<<32:
427 return lbHL, LineDontBreak, 280
428
429 // LB29.
430 case lbIS | prAL<<32:
431 return lbAL, LineDontBreak, 290
432 case lbIS | prHL<<32:
433 return lbHL, LineDontBreak, 290
434 case lbNUIS | prAL<<32:
435 return lbAL, LineDontBreak, 290
436 case lbNUIS | prHL<<32:
437 return lbHL, LineDontBreak, 290
438
439 default:
440 return -1, -1, -1
441 }
442}
443
444// transitionLineBreakState determines the new state of the line break parser
445// given the current state and the next code point. It also returns the type of
446// line break: LineDontBreak, LineCanBreak, or LineMustBreak. If more than one
447// code point is needed to determine the new state, the byte slice or the string
448// starting after rune "r" can be used (whichever is not nil or empty) for
449// further lookups.
450func transitionLineBreakState(state int, r rune, b []byte, str string) (newState int, lineBreak int) {
451 // Determine the property of the next character.
452 nextProperty, generalCategory := propertyLineBreak(r)
453
454 // Prepare.
455 var forceNoBreak, isCPeaFWH bool
456 if state >= 0 && state&lbCPeaFWHBit != 0 {
457 isCPeaFWH = true // LB30: CP but ea is not F, W, or H.
458 state = state &^ lbCPeaFWHBit
459 }
460 if state >= 0 && state&lbZWJBit != 0 {
461 state = state &^ lbZWJBit // Extract zero-width joiner bit.
462 forceNoBreak = true // LB8a.
463 }
464
465 defer func() {
466 // Transition into LB30.
467 if newState == lbCP || newState == lbNUCP {
468 ea := propertyEastAsianWidth(r)
469 if ea != prF && ea != prW && ea != prH {
470 newState |= lbCPeaFWHBit
471 }
472 }
473
474 // Override break.
475 if forceNoBreak {
476 lineBreak = LineDontBreak
477 }
478 }()
479
480 // LB1.
481 if nextProperty == prAI || nextProperty == prSG || nextProperty == prXX {
482 nextProperty = prAL
483 } else if nextProperty == prSA {
484 if generalCategory == gcMn || generalCategory == gcMc {
485 nextProperty = prCM
486 } else {
487 nextProperty = prAL
488 }
489 } else if nextProperty == prCJ {
490 nextProperty = prNS
491 }
492
493 // Combining marks.
494 if nextProperty == prZWJ || nextProperty == prCM {
495 var bit int
496 if nextProperty == prZWJ {
497 bit = lbZWJBit
498 }
499 mustBreakState := state < 0 || state == lbBK || state == lbCR || state == lbLF || state == lbNL
500 if !mustBreakState && state != lbSP && state != lbZW && state != lbQUSP && state != lbCLCPSP && state != lbB2SP {
501 // LB9.
502 return state | bit, LineDontBreak
503 } else {
504 // LB10.
505 if mustBreakState {
506 return lbAL | bit, LineMustBreak
507 }
508 return lbAL | bit, LineCanBreak
509 }
510 }
511
512 // Find the applicable transition in the table.
513 var rule int
514 newState, lineBreak, rule = lbTransitions(state, nextProperty)
515 if newState < 0 {
516 // No specific transition found. Try the less specific ones.
517 anyPropProp, anyPropLineBreak, anyPropRule := lbTransitions(state, prAny)
518 anyStateProp, anyStateLineBreak, anyStateRule := lbTransitions(lbAny, nextProperty)
519 if anyPropProp >= 0 && anyStateProp >= 0 {
520 // Both apply. We'll use a mix (see comments for grTransitions).
521 newState, lineBreak, rule = anyStateProp, anyStateLineBreak, anyStateRule
522 if anyPropRule < anyStateRule {
523 lineBreak, rule = anyPropLineBreak, anyPropRule
524 }
525 } else if anyPropProp >= 0 {
526 // We only have a specific state.
527 newState, lineBreak, rule = anyPropProp, anyPropLineBreak, anyPropRule
528 // This branch will probably never be reached because okAnyState will
529 // always be true given the current transition map. But we keep it here
530 // for future modifications to the transition map where this may not be
531 // true anymore.
532 } else if anyStateProp >= 0 {
533 // We only have a specific property.
534 newState, lineBreak, rule = anyStateProp, anyStateLineBreak, anyStateRule
535 } else {
536 // No known transition. LB31: ALL รท ALL.
537 newState, lineBreak, rule = lbAny, LineCanBreak, 310
538 }
539 }
540
541 // LB12a.
542 if rule > 121 &&
543 nextProperty == prGL &&
544 (state != lbSP && state != lbBA && state != lbHY && state != lbLB21a && state != lbQUSP && state != lbCLCPSP && state != lbB2SP) {
545 return lbGL, LineDontBreak
546 }
547
548 // LB13.
549 if rule > 130 && state != lbNU && state != lbNUNU {
550 switch nextProperty {
551 case prCL:
552 return lbCL, LineDontBreak
553 case prCP:
554 return lbCP, LineDontBreak
555 case prIS:
556 return lbIS, LineDontBreak
557 case prSY:
558 return lbSY, LineDontBreak
559 }
560 }
561
562 // LB25 (look ahead).
563 if rule > 250 &&
564 (state == lbPR || state == lbPO) &&
565 nextProperty == prOP || nextProperty == prHY {
566 var r rune
567 if b != nil { // Byte slice version.
568 r, _ = utf8.DecodeRune(b)
569 } else { // String version.
570 r, _ = utf8.DecodeRuneInString(str)
571 }
572 if r != utf8.RuneError {
573 pr, _ := propertyLineBreak(r)
574 if pr == prNU {
575 return lbNU, LineDontBreak
576 }
577 }
578 }
579
580 // LB30 (part one).
581 if rule > 300 {
582 if (state == lbAL || state == lbHL || state == lbNU || state == lbNUNU) && nextProperty == prOP {
583 ea := propertyEastAsianWidth(r)
584 if ea != prF && ea != prW && ea != prH {
585 return lbOP, LineDontBreak
586 }
587 } else if isCPeaFWH {
588 switch nextProperty {
589 case prAL:
590 return lbAL, LineDontBreak
591 case prHL:
592 return lbHL, LineDontBreak
593 case prNU:
594 return lbNU, LineDontBreak
595 }
596 }
597 }
598
599 // LB30a.
600 if newState == lbAny && nextProperty == prRI {
601 if state != lbOddRI && state != lbEvenRI { // Includes state == -1.
602 // Transition into the first RI.
603 return lbOddRI, lineBreak
604 }
605 if state == lbOddRI {
606 // Don't break pairs of Regional Indicators.
607 return lbEvenRI, LineDontBreak
608 }
609 return lbOddRI, lineBreak
610 }
611
612 // LB30b.
613 if rule > 302 {
614 if nextProperty == prEM {
615 if state == lbEB || state == lbExtPicCn {
616 return prAny, LineDontBreak
617 }
618 }
619 graphemeProperty := propertyGraphemes(r)
620 if graphemeProperty == prExtendedPictographic && generalCategory == gcCn {
621 return lbExtPicCn, LineCanBreak
622 }
623 }
624
625 return
626}