1package uniseg
2
3import "unicode/utf8"
4
// The states of the word break parser. They are numbered consecutively
// starting at 0 (wbAny) so that they fit below wbZWJBit.
const (
	wbAny = iota
	wbCR
	wbLF
	wbNewline
	wbWSegSpace
	wbHebrewLetter
	wbALetter
	wbWB7
	wbWB7c
	wbNumeric
	wbWB11
	wbKatakana
	wbExtendNumLet
	wbOddRI
	wbEvenRI
)

// wbZWJBit is a flag which is OR-ed into a state. It is set for any states
// followed by at least one zero-width joiner (see WB4 and WB3c). Its value
// must be larger than any of the plain states above.
const wbZWJBit = 1 << 4
24
25// wbTransitions implements the word break parser's state transitions. It's
26// anologous to [grTransitions], see comments there for details.
27//
28// Unicode version 15.0.0.
29func wbTransitions(state, prop int) (newState int, wordBreak bool, rule int) {
30 switch uint64(state) | uint64(prop)<<32 {
31 // WB3b.
32 case wbAny | prNewline<<32:
33 return wbNewline, true, 32
34 case wbAny | prCR<<32:
35 return wbCR, true, 32
36 case wbAny | prLF<<32:
37 return wbLF, true, 32
38
39 // WB3a.
40 case wbNewline | prAny<<32:
41 return wbAny, true, 31
42 case wbCR | prAny<<32:
43 return wbAny, true, 31
44 case wbLF | prAny<<32:
45 return wbAny, true, 31
46
47 // WB3.
48 case wbCR | prLF<<32:
49 return wbLF, false, 30
50
51 // WB3d.
52 case wbAny | prWSegSpace<<32:
53 return wbWSegSpace, true, 9990
54 case wbWSegSpace | prWSegSpace<<32:
55 return wbWSegSpace, false, 34
56
57 // WB5.
58 case wbAny | prALetter<<32:
59 return wbALetter, true, 9990
60 case wbAny | prHebrewLetter<<32:
61 return wbHebrewLetter, true, 9990
62 case wbALetter | prALetter<<32:
63 return wbALetter, false, 50
64 case wbALetter | prHebrewLetter<<32:
65 return wbHebrewLetter, false, 50
66 case wbHebrewLetter | prALetter<<32:
67 return wbALetter, false, 50
68 case wbHebrewLetter | prHebrewLetter<<32:
69 return wbHebrewLetter, false, 50
70
71 // WB7. Transitions to wbWB7 handled by transitionWordBreakState().
72 case wbWB7 | prALetter<<32:
73 return wbALetter, false, 70
74 case wbWB7 | prHebrewLetter<<32:
75 return wbHebrewLetter, false, 70
76
77 // WB7a.
78 case wbHebrewLetter | prSingleQuote<<32:
79 return wbAny, false, 71
80
81 // WB7c. Transitions to wbWB7c handled by transitionWordBreakState().
82 case wbWB7c | prHebrewLetter<<32:
83 return wbHebrewLetter, false, 73
84
85 // WB8.
86 case wbAny | prNumeric<<32:
87 return wbNumeric, true, 9990
88 case wbNumeric | prNumeric<<32:
89 return wbNumeric, false, 80
90
91 // WB9.
92 case wbALetter | prNumeric<<32:
93 return wbNumeric, false, 90
94 case wbHebrewLetter | prNumeric<<32:
95 return wbNumeric, false, 90
96
97 // WB10.
98 case wbNumeric | prALetter<<32:
99 return wbALetter, false, 100
100 case wbNumeric | prHebrewLetter<<32:
101 return wbHebrewLetter, false, 100
102
103 // WB11. Transitions to wbWB11 handled by transitionWordBreakState().
104 case wbWB11 | prNumeric<<32:
105 return wbNumeric, false, 110
106
107 // WB13.
108 case wbAny | prKatakana<<32:
109 return wbKatakana, true, 9990
110 case wbKatakana | prKatakana<<32:
111 return wbKatakana, false, 130
112
113 // WB13a.
114 case wbAny | prExtendNumLet<<32:
115 return wbExtendNumLet, true, 9990
116 case wbALetter | prExtendNumLet<<32:
117 return wbExtendNumLet, false, 131
118 case wbHebrewLetter | prExtendNumLet<<32:
119 return wbExtendNumLet, false, 131
120 case wbNumeric | prExtendNumLet<<32:
121 return wbExtendNumLet, false, 131
122 case wbKatakana | prExtendNumLet<<32:
123 return wbExtendNumLet, false, 131
124 case wbExtendNumLet | prExtendNumLet<<32:
125 return wbExtendNumLet, false, 131
126
127 // WB13b.
128 case wbExtendNumLet | prALetter<<32:
129 return wbALetter, false, 132
130 case wbExtendNumLet | prHebrewLetter<<32:
131 return wbHebrewLetter, false, 132
132 case wbExtendNumLet | prNumeric<<32:
133 return wbNumeric, false, 132
134 case wbExtendNumLet | prKatakana<<32:
135 return wbKatakana, false, 132
136
137 default:
138 return -1, false, -1
139 }
140}
141
142// transitionWordBreakState determines the new state of the word break parser
143// given the current state and the next code point. It also returns whether a
144// word boundary was detected. If more than one code point is needed to
145// determine the new state, the byte slice or the string starting after rune "r"
146// can be used (whichever is not nil or empty) for further lookups.
147func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) {
148 // Determine the property of the next character.
149 nextProperty := property(workBreakCodePoints, r)
150
151 // "Replacing Ignore Rules".
152 if nextProperty == prZWJ {
153 // WB4 (for zero-width joiners).
154 if state == wbNewline || state == wbCR || state == wbLF {
155 return wbAny | wbZWJBit, true // Make sure we don't apply WB4 to WB3a.
156 }
157 if state < 0 {
158 return wbAny | wbZWJBit, false
159 }
160 return state | wbZWJBit, false
161 } else if nextProperty == prExtend || nextProperty == prFormat {
162 // WB4 (for Extend and Format).
163 if state == wbNewline || state == wbCR || state == wbLF {
164 return wbAny, true // Make sure we don't apply WB4 to WB3a.
165 }
166 if state == wbWSegSpace || state == wbAny|wbZWJBit {
167 return wbAny, false // We don't break but this is also not WB3d or WB3c.
168 }
169 if state < 0 {
170 return wbAny, false
171 }
172 return state, false
173 } else if nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0 {
174 // WB3c.
175 return wbAny, false
176 }
177 if state >= 0 {
178 state = state &^ wbZWJBit
179 }
180
181 // Find the applicable transition in the table.
182 var rule int
183 newState, wordBreak, rule = wbTransitions(state, nextProperty)
184 if newState < 0 {
185 // No specific transition found. Try the less specific ones.
186 anyPropState, anyPropWordBreak, anyPropRule := wbTransitions(state, prAny)
187 anyStateState, anyStateWordBreak, anyStateRule := wbTransitions(wbAny, nextProperty)
188 if anyPropState >= 0 && anyStateState >= 0 {
189 // Both apply. We'll use a mix (see comments for grTransitions).
190 newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
191 if anyPropRule < anyStateRule {
192 wordBreak, rule = anyPropWordBreak, anyPropRule
193 }
194 } else if anyPropState >= 0 {
195 // We only have a specific state.
196 newState, wordBreak, rule = anyPropState, anyPropWordBreak, anyPropRule
197 // This branch will probably never be reached because okAnyState will
198 // always be true given the current transition map. But we keep it here
199 // for future modifications to the transition map where this may not be
200 // true anymore.
201 } else if anyStateState >= 0 {
202 // We only have a specific property.
203 newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
204 } else {
205 // No known transition. WB999: Any รท Any.
206 newState, wordBreak, rule = wbAny, true, 9990
207 }
208 }
209
210 // For those rules that need to look up runes further in the string, we
211 // determine the property after nextProperty, skipping over Format, Extend,
212 // and ZWJ (according to WB4). It's -1 if not needed, if such a rune cannot
213 // be determined (because the text ends or the rune is faulty).
214 farProperty := -1
215 if rule > 60 &&
216 (state == wbALetter || state == wbHebrewLetter || state == wbNumeric) &&
217 (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote || // WB6.
218 nextProperty == prDoubleQuote || // WB7b.
219 nextProperty == prMidNum) { // WB12.
220 for {
221 var (
222 r rune
223 length int
224 )
225 if b != nil { // Byte slice version.
226 r, length = utf8.DecodeRune(b)
227 b = b[length:]
228 } else { // String version.
229 r, length = utf8.DecodeRuneInString(str)
230 str = str[length:]
231 }
232 if r == utf8.RuneError {
233 break
234 }
235 prop := property(workBreakCodePoints, r)
236 if prop == prExtend || prop == prFormat || prop == prZWJ {
237 continue
238 }
239 farProperty = prop
240 break
241 }
242 }
243
244 // WB6.
245 if rule > 60 &&
246 (state == wbALetter || state == wbHebrewLetter) &&
247 (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
248 (farProperty == prALetter || farProperty == prHebrewLetter) {
249 return wbWB7, false
250 }
251
252 // WB7b.
253 if rule > 72 &&
254 state == wbHebrewLetter &&
255 nextProperty == prDoubleQuote &&
256 farProperty == prHebrewLetter {
257 return wbWB7c, false
258 }
259
260 // WB12.
261 if rule > 120 &&
262 state == wbNumeric &&
263 (nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
264 farProperty == prNumeric {
265 return wbWB11, false
266 }
267
268 // WB15 and WB16.
269 if newState == wbAny && nextProperty == prRegionalIndicator {
270 if state != wbOddRI && state != wbEvenRI { // Includes state == -1.
271 // Transition into the first RI.
272 return wbOddRI, true
273 }
274 if state == wbOddRI {
275 // Don't break pairs of Regional Indicators.
276 return wbEvenRI, false
277 }
278 return wbOddRI, true // We can break after a pair.
279 }
280
281 return
282}