1package uniseg
2
3import "unicode/utf8"
4
// The states of the sentence break parser. Each state summarizes the part of
// the preceding text that the rules in sbTransitions still need to know about.
const (
	// sbAny is the default state: nothing rule-relevant precedes.
	sbAny = iota
	// sbCR: the previous code point was CR (an LF may still follow, SB3).
	sbCR
	// sbParaSep: a paragraph separator was just consumed (SB4 breaks next).
	sbParaSep
	// sbATerm: the previous code point was ATerm.
	sbATerm
	// sbUpper: the previous code point was Upper.
	sbUpper
	// sbLower: the previous code point was Lower.
	sbLower
	// sbSB7: (Upper | Lower) ATerm was consumed (left side of SB7).
	sbSB7
	// sbSB8Close: ATerm followed by one or more Close.
	sbSB8Close
	// sbSB8Sp: ATerm Close* followed by one or more Sp.
	sbSB8Sp
	// sbSTerm: the previous code point was STerm.
	sbSTerm
	// sbSB8aClose: STerm followed by one or more Close.
	sbSB8aClose
	// sbSB8aSp: STerm Close* followed by one or more Sp.
	sbSB8aSp
)
20
21// sbTransitions implements the sentence break parser's state transitions. It's
22// anologous to [grTransitions], see comments there for details.
23//
24// Unicode version 15.0.0.
25func sbTransitions(state, prop int) (newState int, sentenceBreak bool, rule int) {
26 switch uint64(state) | uint64(prop)<<32 {
27 // SB3.
28 case sbAny | prCR<<32:
29 return sbCR, false, 9990
30 case sbCR | prLF<<32:
31 return sbParaSep, false, 30
32
33 // SB4.
34 case sbAny | prSep<<32:
35 return sbParaSep, false, 9990
36 case sbAny | prLF<<32:
37 return sbParaSep, false, 9990
38 case sbParaSep | prAny<<32:
39 return sbAny, true, 40
40 case sbCR | prAny<<32:
41 return sbAny, true, 40
42
43 // SB6.
44 case sbAny | prATerm<<32:
45 return sbATerm, false, 9990
46 case sbATerm | prNumeric<<32:
47 return sbAny, false, 60
48 case sbSB7 | prNumeric<<32:
49 return sbAny, false, 60 // Because ATerm also appears in SB7.
50
51 // SB7.
52 case sbAny | prUpper<<32:
53 return sbUpper, false, 9990
54 case sbAny | prLower<<32:
55 return sbLower, false, 9990
56 case sbUpper | prATerm<<32:
57 return sbSB7, false, 70
58 case sbLower | prATerm<<32:
59 return sbSB7, false, 70
60 case sbSB7 | prUpper<<32:
61 return sbUpper, false, 70
62
63 // SB8a.
64 case sbAny | prSTerm<<32:
65 return sbSTerm, false, 9990
66 case sbATerm | prSContinue<<32:
67 return sbAny, false, 81
68 case sbATerm | prATerm<<32:
69 return sbATerm, false, 81
70 case sbATerm | prSTerm<<32:
71 return sbSTerm, false, 81
72 case sbSB7 | prSContinue<<32:
73 return sbAny, false, 81
74 case sbSB7 | prATerm<<32:
75 return sbATerm, false, 81
76 case sbSB7 | prSTerm<<32:
77 return sbSTerm, false, 81
78 case sbSB8Close | prSContinue<<32:
79 return sbAny, false, 81
80 case sbSB8Close | prATerm<<32:
81 return sbATerm, false, 81
82 case sbSB8Close | prSTerm<<32:
83 return sbSTerm, false, 81
84 case sbSB8Sp | prSContinue<<32:
85 return sbAny, false, 81
86 case sbSB8Sp | prATerm<<32:
87 return sbATerm, false, 81
88 case sbSB8Sp | prSTerm<<32:
89 return sbSTerm, false, 81
90 case sbSTerm | prSContinue<<32:
91 return sbAny, false, 81
92 case sbSTerm | prATerm<<32:
93 return sbATerm, false, 81
94 case sbSTerm | prSTerm<<32:
95 return sbSTerm, false, 81
96 case sbSB8aClose | prSContinue<<32:
97 return sbAny, false, 81
98 case sbSB8aClose | prATerm<<32:
99 return sbATerm, false, 81
100 case sbSB8aClose | prSTerm<<32:
101 return sbSTerm, false, 81
102 case sbSB8aSp | prSContinue<<32:
103 return sbAny, false, 81
104 case sbSB8aSp | prATerm<<32:
105 return sbATerm, false, 81
106 case sbSB8aSp | prSTerm<<32:
107 return sbSTerm, false, 81
108
109 // SB9.
110 case sbATerm | prClose<<32:
111 return sbSB8Close, false, 90
112 case sbSB7 | prClose<<32:
113 return sbSB8Close, false, 90
114 case sbSB8Close | prClose<<32:
115 return sbSB8Close, false, 90
116 case sbATerm | prSp<<32:
117 return sbSB8Sp, false, 90
118 case sbSB7 | prSp<<32:
119 return sbSB8Sp, false, 90
120 case sbSB8Close | prSp<<32:
121 return sbSB8Sp, false, 90
122 case sbSTerm | prClose<<32:
123 return sbSB8aClose, false, 90
124 case sbSB8aClose | prClose<<32:
125 return sbSB8aClose, false, 90
126 case sbSTerm | prSp<<32:
127 return sbSB8aSp, false, 90
128 case sbSB8aClose | prSp<<32:
129 return sbSB8aSp, false, 90
130 case sbATerm | prSep<<32:
131 return sbParaSep, false, 90
132 case sbATerm | prCR<<32:
133 return sbParaSep, false, 90
134 case sbATerm | prLF<<32:
135 return sbParaSep, false, 90
136 case sbSB7 | prSep<<32:
137 return sbParaSep, false, 90
138 case sbSB7 | prCR<<32:
139 return sbParaSep, false, 90
140 case sbSB7 | prLF<<32:
141 return sbParaSep, false, 90
142 case sbSB8Close | prSep<<32:
143 return sbParaSep, false, 90
144 case sbSB8Close | prCR<<32:
145 return sbParaSep, false, 90
146 case sbSB8Close | prLF<<32:
147 return sbParaSep, false, 90
148 case sbSTerm | prSep<<32:
149 return sbParaSep, false, 90
150 case sbSTerm | prCR<<32:
151 return sbParaSep, false, 90
152 case sbSTerm | prLF<<32:
153 return sbParaSep, false, 90
154 case sbSB8aClose | prSep<<32:
155 return sbParaSep, false, 90
156 case sbSB8aClose | prCR<<32:
157 return sbParaSep, false, 90
158 case sbSB8aClose | prLF<<32:
159 return sbParaSep, false, 90
160
161 // SB10.
162 case sbSB8Sp | prSp<<32:
163 return sbSB8Sp, false, 100
164 case sbSB8aSp | prSp<<32:
165 return sbSB8aSp, false, 100
166 case sbSB8Sp | prSep<<32:
167 return sbParaSep, false, 100
168 case sbSB8Sp | prCR<<32:
169 return sbParaSep, false, 100
170 case sbSB8Sp | prLF<<32:
171 return sbParaSep, false, 100
172
173 // SB11.
174 case sbATerm | prAny<<32:
175 return sbAny, true, 110
176 case sbSB7 | prAny<<32:
177 return sbAny, true, 110
178 case sbSB8Close | prAny<<32:
179 return sbAny, true, 110
180 case sbSB8Sp | prAny<<32:
181 return sbAny, true, 110
182 case sbSTerm | prAny<<32:
183 return sbAny, true, 110
184 case sbSB8aClose | prAny<<32:
185 return sbAny, true, 110
186 case sbSB8aSp | prAny<<32:
187 return sbAny, true, 110
188 // We'll always break after ParaSep due to SB4.
189
190 default:
191 return -1, false, -1
192 }
193}
194
195// transitionSentenceBreakState determines the new state of the sentence break
196// parser given the current state and the next code point. It also returns
197// whether a sentence boundary was detected. If more than one code point is
198// needed to determine the new state, the byte slice or the string starting
199// after rune "r" can be used (whichever is not nil or empty) for further
200// lookups.
201func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
202 // Determine the property of the next character.
203 nextProperty := property(sentenceBreakCodePoints, r)
204
205 // SB5 (Replacing Ignore Rules).
206 if nextProperty == prExtend || nextProperty == prFormat {
207 if state == sbParaSep || state == sbCR {
208 return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
209 }
210 if state < 0 {
211 return sbAny, true // SB1.
212 }
213 return state, false
214 }
215
216 // Find the applicable transition in the table.
217 var rule int
218 newState, sentenceBreak, rule = sbTransitions(state, nextProperty)
219 if newState < 0 {
220 // No specific transition found. Try the less specific ones.
221 anyPropState, anyPropProp, anyPropRule := sbTransitions(state, prAny)
222 anyStateState, anyStateProp, anyStateRule := sbTransitions(sbAny, nextProperty)
223 if anyPropState >= 0 && anyStateState >= 0 {
224 // Both apply. We'll use a mix (see comments for grTransitions).
225 newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
226 if anyPropRule < anyStateRule {
227 sentenceBreak, rule = anyPropProp, anyPropRule
228 }
229 } else if anyPropState >= 0 {
230 // We only have a specific state.
231 newState, sentenceBreak, rule = anyPropState, anyPropProp, anyPropRule
232 // This branch will probably never be reached because okAnyState will
233 // always be true given the current transition map. But we keep it here
234 // for future modifications to the transition map where this may not be
235 // true anymore.
236 } else if anyStateState >= 0 {
237 // We only have a specific property.
238 newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
239 } else {
240 // No known transition. SB999: Any × Any.
241 newState, sentenceBreak, rule = sbAny, false, 9990
242 }
243 }
244
245 // SB8.
246 if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
247 // Check the right side of the rule.
248 var length int
249 for nextProperty != prOLetter &&
250 nextProperty != prUpper &&
251 nextProperty != prLower &&
252 nextProperty != prSep &&
253 nextProperty != prCR &&
254 nextProperty != prLF &&
255 nextProperty != prATerm &&
256 nextProperty != prSTerm {
257 // Move on to the next rune.
258 if b != nil { // Byte slice version.
259 r, length = utf8.DecodeRune(b)
260 b = b[length:]
261 } else { // String version.
262 r, length = utf8.DecodeRuneInString(str)
263 str = str[length:]
264 }
265 if r == utf8.RuneError {
266 break
267 }
268 nextProperty = property(sentenceBreakCodePoints, r)
269 }
270 if nextProperty == prLower {
271 return sbLower, false
272 }
273 }
274
275 return
276}