linerules.go

  1package uniseg
  2
  3import "unicode/utf8"
  4
  5// The states of the line break parser.
  6const (
  7	lbAny = iota
  8	lbBK
  9	lbCR
 10	lbLF
 11	lbNL
 12	lbSP
 13	lbZW
 14	lbWJ
 15	lbGL
 16	lbBA
 17	lbHY
 18	lbCL
 19	lbCP
 20	lbEX
 21	lbIS
 22	lbSY
 23	lbOP
 24	lbQU
 25	lbQUSP
 26	lbNS
 27	lbCLCPSP
 28	lbB2
 29	lbB2SP
 30	lbCB
 31	lbBB
 32	lbLB21a
 33	lbHL
 34	lbAL
 35	lbNU
 36	lbPR
 37	lbEB
 38	lbIDEM
 39	lbNUNU
 40	lbNUSY
 41	lbNUIS
 42	lbNUCL
 43	lbNUCP
 44	lbPO
 45	lbJL
 46	lbJV
 47	lbJT
 48	lbH2
 49	lbH3
 50	lbOddRI
 51	lbEvenRI
 52	lbExtPicCn
 53	lbZWJBit     = 64
 54	lbCPeaFWHBit = 128
 55)
 56
 57// These constants define whether a given text may be broken into the next line.
 58// If the break is optional (LineCanBreak), you may choose to break or not based
 59// on your own criteria, for example, if the text has reached the available
 60// width.
 61const (
 62	LineDontBreak = iota // You may not break the line here.
 63	LineCanBreak         // You may or may not break the line here.
 64	LineMustBreak        // You must break the line here.
 65)
 66
 67// lbTransitions implements the line break parser's state transitions. It's
 68// anologous to [grTransitions], see comments there for details.
 69//
 70// Unicode version 15.0.0.
 71func lbTransitions(state, prop int) (newState, lineBreak, rule int) {
 72	switch uint64(state) | uint64(prop)<<32 {
 73	// LB4.
 74	case lbBK | prAny<<32:
 75		return lbAny, LineMustBreak, 40
 76
 77	// LB5.
 78	case lbCR | prLF<<32:
 79		return lbLF, LineDontBreak, 50
 80	case lbCR | prAny<<32:
 81		return lbAny, LineMustBreak, 50
 82	case lbLF | prAny<<32:
 83		return lbAny, LineMustBreak, 50
 84	case lbNL | prAny<<32:
 85		return lbAny, LineMustBreak, 50
 86
 87	// LB6.
 88	case lbAny | prBK<<32:
 89		return lbBK, LineDontBreak, 60
 90	case lbAny | prCR<<32:
 91		return lbCR, LineDontBreak, 60
 92	case lbAny | prLF<<32:
 93		return lbLF, LineDontBreak, 60
 94	case lbAny | prNL<<32:
 95		return lbNL, LineDontBreak, 60
 96
 97	// LB7.
 98	case lbAny | prSP<<32:
 99		return lbSP, LineDontBreak, 70
100	case lbAny | prZW<<32:
101		return lbZW, LineDontBreak, 70
102
103	// LB8.
104	case lbZW | prSP<<32:
105		return lbZW, LineDontBreak, 70
106	case lbZW | prAny<<32:
107		return lbAny, LineCanBreak, 80
108
109	// LB11.
110	case lbAny | prWJ<<32:
111		return lbWJ, LineDontBreak, 110
112	case lbWJ | prAny<<32:
113		return lbAny, LineDontBreak, 110
114
115	// LB12.
116	case lbAny | prGL<<32:
117		return lbGL, LineCanBreak, 310
118	case lbGL | prAny<<32:
119		return lbAny, LineDontBreak, 120
120
121	// LB13 (simple transitions).
122	case lbAny | prCL<<32:
123		return lbCL, LineCanBreak, 310
124	case lbAny | prCP<<32:
125		return lbCP, LineCanBreak, 310
126	case lbAny | prEX<<32:
127		return lbEX, LineDontBreak, 130
128	case lbAny | prIS<<32:
129		return lbIS, LineCanBreak, 310
130	case lbAny | prSY<<32:
131		return lbSY, LineCanBreak, 310
132
133	// LB14.
134	case lbAny | prOP<<32:
135		return lbOP, LineCanBreak, 310
136	case lbOP | prSP<<32:
137		return lbOP, LineDontBreak, 70
138	case lbOP | prAny<<32:
139		return lbAny, LineDontBreak, 140
140
141	// LB15.
142	case lbQU | prSP<<32:
143		return lbQUSP, LineDontBreak, 70
144	case lbQU | prOP<<32:
145		return lbOP, LineDontBreak, 150
146	case lbQUSP | prOP<<32:
147		return lbOP, LineDontBreak, 150
148
149	// LB16.
150	case lbCL | prSP<<32:
151		return lbCLCPSP, LineDontBreak, 70
152	case lbNUCL | prSP<<32:
153		return lbCLCPSP, LineDontBreak, 70
154	case lbCP | prSP<<32:
155		return lbCLCPSP, LineDontBreak, 70
156	case lbNUCP | prSP<<32:
157		return lbCLCPSP, LineDontBreak, 70
158	case lbCL | prNS<<32:
159		return lbNS, LineDontBreak, 160
160	case lbNUCL | prNS<<32:
161		return lbNS, LineDontBreak, 160
162	case lbCP | prNS<<32:
163		return lbNS, LineDontBreak, 160
164	case lbNUCP | prNS<<32:
165		return lbNS, LineDontBreak, 160
166	case lbCLCPSP | prNS<<32:
167		return lbNS, LineDontBreak, 160
168
169	// LB17.
170	case lbAny | prB2<<32:
171		return lbB2, LineCanBreak, 310
172	case lbB2 | prSP<<32:
173		return lbB2SP, LineDontBreak, 70
174	case lbB2 | prB2<<32:
175		return lbB2, LineDontBreak, 170
176	case lbB2SP | prB2<<32:
177		return lbB2, LineDontBreak, 170
178
179	// LB18.
180	case lbSP | prAny<<32:
181		return lbAny, LineCanBreak, 180
182	case lbQUSP | prAny<<32:
183		return lbAny, LineCanBreak, 180
184	case lbCLCPSP | prAny<<32:
185		return lbAny, LineCanBreak, 180
186	case lbB2SP | prAny<<32:
187		return lbAny, LineCanBreak, 180
188
189	// LB19.
190	case lbAny | prQU<<32:
191		return lbQU, LineDontBreak, 190
192	case lbQU | prAny<<32:
193		return lbAny, LineDontBreak, 190
194
195	// LB20.
196	case lbAny | prCB<<32:
197		return lbCB, LineCanBreak, 200
198	case lbCB | prAny<<32:
199		return lbAny, LineCanBreak, 200
200
201	// LB21.
202	case lbAny | prBA<<32:
203		return lbBA, LineDontBreak, 210
204	case lbAny | prHY<<32:
205		return lbHY, LineDontBreak, 210
206	case lbAny | prNS<<32:
207		return lbNS, LineDontBreak, 210
208	case lbAny | prBB<<32:
209		return lbBB, LineCanBreak, 310
210	case lbBB | prAny<<32:
211		return lbAny, LineDontBreak, 210
212
213	// LB21a.
214	case lbAny | prHL<<32:
215		return lbHL, LineCanBreak, 310
216	case lbHL | prHY<<32:
217		return lbLB21a, LineDontBreak, 210
218	case lbHL | prBA<<32:
219		return lbLB21a, LineDontBreak, 210
220	case lbLB21a | prAny<<32:
221		return lbAny, LineDontBreak, 211
222
223	// LB21b.
224	case lbSY | prHL<<32:
225		return lbHL, LineDontBreak, 212
226	case lbNUSY | prHL<<32:
227		return lbHL, LineDontBreak, 212
228
229	// LB22.
230	case lbAny | prIN<<32:
231		return lbAny, LineDontBreak, 220
232
233	// LB23.
234	case lbAny | prAL<<32:
235		return lbAL, LineCanBreak, 310
236	case lbAny | prNU<<32:
237		return lbNU, LineCanBreak, 310
238	case lbAL | prNU<<32:
239		return lbNU, LineDontBreak, 230
240	case lbHL | prNU<<32:
241		return lbNU, LineDontBreak, 230
242	case lbNU | prAL<<32:
243		return lbAL, LineDontBreak, 230
244	case lbNU | prHL<<32:
245		return lbHL, LineDontBreak, 230
246	case lbNUNU | prAL<<32:
247		return lbAL, LineDontBreak, 230
248	case lbNUNU | prHL<<32:
249		return lbHL, LineDontBreak, 230
250
251	// LB23a.
252	case lbAny | prPR<<32:
253		return lbPR, LineCanBreak, 310
254	case lbAny | prID<<32:
255		return lbIDEM, LineCanBreak, 310
256	case lbAny | prEB<<32:
257		return lbEB, LineCanBreak, 310
258	case lbAny | prEM<<32:
259		return lbIDEM, LineCanBreak, 310
260	case lbPR | prID<<32:
261		return lbIDEM, LineDontBreak, 231
262	case lbPR | prEB<<32:
263		return lbEB, LineDontBreak, 231
264	case lbPR | prEM<<32:
265		return lbIDEM, LineDontBreak, 231
266	case lbIDEM | prPO<<32:
267		return lbPO, LineDontBreak, 231
268	case lbEB | prPO<<32:
269		return lbPO, LineDontBreak, 231
270
271	// LB24.
272	case lbAny | prPO<<32:
273		return lbPO, LineCanBreak, 310
274	case lbPR | prAL<<32:
275		return lbAL, LineDontBreak, 240
276	case lbPR | prHL<<32:
277		return lbHL, LineDontBreak, 240
278	case lbPO | prAL<<32:
279		return lbAL, LineDontBreak, 240
280	case lbPO | prHL<<32:
281		return lbHL, LineDontBreak, 240
282	case lbAL | prPR<<32:
283		return lbPR, LineDontBreak, 240
284	case lbAL | prPO<<32:
285		return lbPO, LineDontBreak, 240
286	case lbHL | prPR<<32:
287		return lbPR, LineDontBreak, 240
288	case lbHL | prPO<<32:
289		return lbPO, LineDontBreak, 240
290
291	// LB25 (simple transitions).
292	case lbPR | prNU<<32:
293		return lbNU, LineDontBreak, 250
294	case lbPO | prNU<<32:
295		return lbNU, LineDontBreak, 250
296	case lbOP | prNU<<32:
297		return lbNU, LineDontBreak, 250
298	case lbHY | prNU<<32:
299		return lbNU, LineDontBreak, 250
300	case lbNU | prNU<<32:
301		return lbNUNU, LineDontBreak, 250
302	case lbNU | prSY<<32:
303		return lbNUSY, LineDontBreak, 250
304	case lbNU | prIS<<32:
305		return lbNUIS, LineDontBreak, 250
306	case lbNUNU | prNU<<32:
307		return lbNUNU, LineDontBreak, 250
308	case lbNUNU | prSY<<32:
309		return lbNUSY, LineDontBreak, 250
310	case lbNUNU | prIS<<32:
311		return lbNUIS, LineDontBreak, 250
312	case lbNUSY | prNU<<32:
313		return lbNUNU, LineDontBreak, 250
314	case lbNUSY | prSY<<32:
315		return lbNUSY, LineDontBreak, 250
316	case lbNUSY | prIS<<32:
317		return lbNUIS, LineDontBreak, 250
318	case lbNUIS | prNU<<32:
319		return lbNUNU, LineDontBreak, 250
320	case lbNUIS | prSY<<32:
321		return lbNUSY, LineDontBreak, 250
322	case lbNUIS | prIS<<32:
323		return lbNUIS, LineDontBreak, 250
324	case lbNU | prCL<<32:
325		return lbNUCL, LineDontBreak, 250
326	case lbNU | prCP<<32:
327		return lbNUCP, LineDontBreak, 250
328	case lbNUNU | prCL<<32:
329		return lbNUCL, LineDontBreak, 250
330	case lbNUNU | prCP<<32:
331		return lbNUCP, LineDontBreak, 250
332	case lbNUSY | prCL<<32:
333		return lbNUCL, LineDontBreak, 250
334	case lbNUSY | prCP<<32:
335		return lbNUCP, LineDontBreak, 250
336	case lbNUIS | prCL<<32:
337		return lbNUCL, LineDontBreak, 250
338	case lbNUIS | prCP<<32:
339		return lbNUCP, LineDontBreak, 250
340	case lbNU | prPO<<32:
341		return lbPO, LineDontBreak, 250
342	case lbNUNU | prPO<<32:
343		return lbPO, LineDontBreak, 250
344	case lbNUSY | prPO<<32:
345		return lbPO, LineDontBreak, 250
346	case lbNUIS | prPO<<32:
347		return lbPO, LineDontBreak, 250
348	case lbNUCL | prPO<<32:
349		return lbPO, LineDontBreak, 250
350	case lbNUCP | prPO<<32:
351		return lbPO, LineDontBreak, 250
352	case lbNU | prPR<<32:
353		return lbPR, LineDontBreak, 250
354	case lbNUNU | prPR<<32:
355		return lbPR, LineDontBreak, 250
356	case lbNUSY | prPR<<32:
357		return lbPR, LineDontBreak, 250
358	case lbNUIS | prPR<<32:
359		return lbPR, LineDontBreak, 250
360	case lbNUCL | prPR<<32:
361		return lbPR, LineDontBreak, 250
362	case lbNUCP | prPR<<32:
363		return lbPR, LineDontBreak, 250
364
365	// LB26.
366	case lbAny | prJL<<32:
367		return lbJL, LineCanBreak, 310
368	case lbAny | prJV<<32:
369		return lbJV, LineCanBreak, 310
370	case lbAny | prJT<<32:
371		return lbJT, LineCanBreak, 310
372	case lbAny | prH2<<32:
373		return lbH2, LineCanBreak, 310
374	case lbAny | prH3<<32:
375		return lbH3, LineCanBreak, 310
376	case lbJL | prJL<<32:
377		return lbJL, LineDontBreak, 260
378	case lbJL | prJV<<32:
379		return lbJV, LineDontBreak, 260
380	case lbJL | prH2<<32:
381		return lbH2, LineDontBreak, 260
382	case lbJL | prH3<<32:
383		return lbH3, LineDontBreak, 260
384	case lbJV | prJV<<32:
385		return lbJV, LineDontBreak, 260
386	case lbJV | prJT<<32:
387		return lbJT, LineDontBreak, 260
388	case lbH2 | prJV<<32:
389		return lbJV, LineDontBreak, 260
390	case lbH2 | prJT<<32:
391		return lbJT, LineDontBreak, 260
392	case lbJT | prJT<<32:
393		return lbJT, LineDontBreak, 260
394	case lbH3 | prJT<<32:
395		return lbJT, LineDontBreak, 260
396
397	// LB27.
398	case lbJL | prPO<<32:
399		return lbPO, LineDontBreak, 270
400	case lbJV | prPO<<32:
401		return lbPO, LineDontBreak, 270
402	case lbJT | prPO<<32:
403		return lbPO, LineDontBreak, 270
404	case lbH2 | prPO<<32:
405		return lbPO, LineDontBreak, 270
406	case lbH3 | prPO<<32:
407		return lbPO, LineDontBreak, 270
408	case lbPR | prJL<<32:
409		return lbJL, LineDontBreak, 270
410	case lbPR | prJV<<32:
411		return lbJV, LineDontBreak, 270
412	case lbPR | prJT<<32:
413		return lbJT, LineDontBreak, 270
414	case lbPR | prH2<<32:
415		return lbH2, LineDontBreak, 270
416	case lbPR | prH3<<32:
417		return lbH3, LineDontBreak, 270
418
419	// LB28.
420	case lbAL | prAL<<32:
421		return lbAL, LineDontBreak, 280
422	case lbAL | prHL<<32:
423		return lbHL, LineDontBreak, 280
424	case lbHL | prAL<<32:
425		return lbAL, LineDontBreak, 280
426	case lbHL | prHL<<32:
427		return lbHL, LineDontBreak, 280
428
429	// LB29.
430	case lbIS | prAL<<32:
431		return lbAL, LineDontBreak, 290
432	case lbIS | prHL<<32:
433		return lbHL, LineDontBreak, 290
434	case lbNUIS | prAL<<32:
435		return lbAL, LineDontBreak, 290
436	case lbNUIS | prHL<<32:
437		return lbHL, LineDontBreak, 290
438
439	default:
440		return -1, -1, -1
441	}
442}
443
444// transitionLineBreakState determines the new state of the line break parser
445// given the current state and the next code point. It also returns the type of
446// line break: LineDontBreak, LineCanBreak, or LineMustBreak. If more than one
447// code point is needed to determine the new state, the byte slice or the string
448// starting after rune "r" can be used (whichever is not nil or empty) for
449// further lookups.
450func transitionLineBreakState(state int, r rune, b []byte, str string) (newState int, lineBreak int) {
451	// Determine the property of the next character.
452	nextProperty, generalCategory := propertyLineBreak(r)
453
454	// Prepare.
455	var forceNoBreak, isCPeaFWH bool
456	if state >= 0 && state&lbCPeaFWHBit != 0 {
457		isCPeaFWH = true // LB30: CP but ea is not F, W, or H.
458		state = state &^ lbCPeaFWHBit
459	}
460	if state >= 0 && state&lbZWJBit != 0 {
461		state = state &^ lbZWJBit // Extract zero-width joiner bit.
462		forceNoBreak = true       // LB8a.
463	}
464
465	defer func() {
466		// Transition into LB30.
467		if newState == lbCP || newState == lbNUCP {
468			ea := propertyEastAsianWidth(r)
469			if ea != prF && ea != prW && ea != prH {
470				newState |= lbCPeaFWHBit
471			}
472		}
473
474		// Override break.
475		if forceNoBreak {
476			lineBreak = LineDontBreak
477		}
478	}()
479
480	// LB1.
481	if nextProperty == prAI || nextProperty == prSG || nextProperty == prXX {
482		nextProperty = prAL
483	} else if nextProperty == prSA {
484		if generalCategory == gcMn || generalCategory == gcMc {
485			nextProperty = prCM
486		} else {
487			nextProperty = prAL
488		}
489	} else if nextProperty == prCJ {
490		nextProperty = prNS
491	}
492
493	// Combining marks.
494	if nextProperty == prZWJ || nextProperty == prCM {
495		var bit int
496		if nextProperty == prZWJ {
497			bit = lbZWJBit
498		}
499		mustBreakState := state < 0 || state == lbBK || state == lbCR || state == lbLF || state == lbNL
500		if !mustBreakState && state != lbSP && state != lbZW && state != lbQUSP && state != lbCLCPSP && state != lbB2SP {
501			// LB9.
502			return state | bit, LineDontBreak
503		} else {
504			// LB10.
505			if mustBreakState {
506				return lbAL | bit, LineMustBreak
507			}
508			return lbAL | bit, LineCanBreak
509		}
510	}
511
512	// Find the applicable transition in the table.
513	var rule int
514	newState, lineBreak, rule = lbTransitions(state, nextProperty)
515	if newState < 0 {
516		// No specific transition found. Try the less specific ones.
517		anyPropProp, anyPropLineBreak, anyPropRule := lbTransitions(state, prAny)
518		anyStateProp, anyStateLineBreak, anyStateRule := lbTransitions(lbAny, nextProperty)
519		if anyPropProp >= 0 && anyStateProp >= 0 {
520			// Both apply. We'll use a mix (see comments for grTransitions).
521			newState, lineBreak, rule = anyStateProp, anyStateLineBreak, anyStateRule
522			if anyPropRule < anyStateRule {
523				lineBreak, rule = anyPropLineBreak, anyPropRule
524			}
525		} else if anyPropProp >= 0 {
526			// We only have a specific state.
527			newState, lineBreak, rule = anyPropProp, anyPropLineBreak, anyPropRule
528			// This branch will probably never be reached because okAnyState will
529			// always be true given the current transition map. But we keep it here
530			// for future modifications to the transition map where this may not be
531			// true anymore.
532		} else if anyStateProp >= 0 {
533			// We only have a specific property.
534			newState, lineBreak, rule = anyStateProp, anyStateLineBreak, anyStateRule
535		} else {
536			// No known transition. LB31: ALL รท ALL.
537			newState, lineBreak, rule = lbAny, LineCanBreak, 310
538		}
539	}
540
541	// LB12a.
542	if rule > 121 &&
543		nextProperty == prGL &&
544		(state != lbSP && state != lbBA && state != lbHY && state != lbLB21a && state != lbQUSP && state != lbCLCPSP && state != lbB2SP) {
545		return lbGL, LineDontBreak
546	}
547
548	// LB13.
549	if rule > 130 && state != lbNU && state != lbNUNU {
550		switch nextProperty {
551		case prCL:
552			return lbCL, LineDontBreak
553		case prCP:
554			return lbCP, LineDontBreak
555		case prIS:
556			return lbIS, LineDontBreak
557		case prSY:
558			return lbSY, LineDontBreak
559		}
560	}
561
562	// LB25 (look ahead).
563	if rule > 250 &&
564		(state == lbPR || state == lbPO) &&
565		nextProperty == prOP || nextProperty == prHY {
566		var r rune
567		if b != nil { // Byte slice version.
568			r, _ = utf8.DecodeRune(b)
569		} else { // String version.
570			r, _ = utf8.DecodeRuneInString(str)
571		}
572		if r != utf8.RuneError {
573			pr, _ := propertyLineBreak(r)
574			if pr == prNU {
575				return lbNU, LineDontBreak
576			}
577		}
578	}
579
580	// LB30 (part one).
581	if rule > 300 {
582		if (state == lbAL || state == lbHL || state == lbNU || state == lbNUNU) && nextProperty == prOP {
583			ea := propertyEastAsianWidth(r)
584			if ea != prF && ea != prW && ea != prH {
585				return lbOP, LineDontBreak
586			}
587		} else if isCPeaFWH {
588			switch nextProperty {
589			case prAL:
590				return lbAL, LineDontBreak
591			case prHL:
592				return lbHL, LineDontBreak
593			case prNU:
594				return lbNU, LineDontBreak
595			}
596		}
597	}
598
599	// LB30a.
600	if newState == lbAny && nextProperty == prRI {
601		if state != lbOddRI && state != lbEvenRI { // Includes state == -1.
602			// Transition into the first RI.
603			return lbOddRI, lineBreak
604		}
605		if state == lbOddRI {
606			// Don't break pairs of Regional Indicators.
607			return lbEvenRI, LineDontBreak
608		}
609		return lbOddRI, lineBreak
610	}
611
612	// LB30b.
613	if rule > 302 {
614		if nextProperty == prEM {
615			if state == lbEB || state == lbExtPicCn {
616				return prAny, LineDontBreak
617			}
618		}
619		graphemeProperty := propertyGraphemes(r)
620		if graphemeProperty == prExtendedPictographic && generalCategory == gcCn {
621			return lbExtPicCn, LineCanBreak
622		}
623	}
624
625	return
626}