sentencerules.go

  1package uniseg
  2
  3import "unicode/utf8"
  4
  5// The states of the sentence break parser.
  6const (
  7	sbAny = iota
  8	sbCR
  9	sbParaSep
 10	sbATerm
 11	sbUpper
 12	sbLower
 13	sbSB7
 14	sbSB8Close
 15	sbSB8Sp
 16	sbSTerm
 17	sbSB8aClose
 18	sbSB8aSp
 19)
 20
 21// sbTransitions implements the sentence break parser's state transitions. It's
 22// anologous to [grTransitions], see comments there for details.
 23//
 24// Unicode version 15.0.0.
 25func sbTransitions(state, prop int) (newState int, sentenceBreak bool, rule int) {
 26	switch uint64(state) | uint64(prop)<<32 {
 27	// SB3.
 28	case sbAny | prCR<<32:
 29		return sbCR, false, 9990
 30	case sbCR | prLF<<32:
 31		return sbParaSep, false, 30
 32
 33	// SB4.
 34	case sbAny | prSep<<32:
 35		return sbParaSep, false, 9990
 36	case sbAny | prLF<<32:
 37		return sbParaSep, false, 9990
 38	case sbParaSep | prAny<<32:
 39		return sbAny, true, 40
 40	case sbCR | prAny<<32:
 41		return sbAny, true, 40
 42
 43	// SB6.
 44	case sbAny | prATerm<<32:
 45		return sbATerm, false, 9990
 46	case sbATerm | prNumeric<<32:
 47		return sbAny, false, 60
 48	case sbSB7 | prNumeric<<32:
 49		return sbAny, false, 60 // Because ATerm also appears in SB7.
 50
 51	// SB7.
 52	case sbAny | prUpper<<32:
 53		return sbUpper, false, 9990
 54	case sbAny | prLower<<32:
 55		return sbLower, false, 9990
 56	case sbUpper | prATerm<<32:
 57		return sbSB7, false, 70
 58	case sbLower | prATerm<<32:
 59		return sbSB7, false, 70
 60	case sbSB7 | prUpper<<32:
 61		return sbUpper, false, 70
 62
 63	// SB8a.
 64	case sbAny | prSTerm<<32:
 65		return sbSTerm, false, 9990
 66	case sbATerm | prSContinue<<32:
 67		return sbAny, false, 81
 68	case sbATerm | prATerm<<32:
 69		return sbATerm, false, 81
 70	case sbATerm | prSTerm<<32:
 71		return sbSTerm, false, 81
 72	case sbSB7 | prSContinue<<32:
 73		return sbAny, false, 81
 74	case sbSB7 | prATerm<<32:
 75		return sbATerm, false, 81
 76	case sbSB7 | prSTerm<<32:
 77		return sbSTerm, false, 81
 78	case sbSB8Close | prSContinue<<32:
 79		return sbAny, false, 81
 80	case sbSB8Close | prATerm<<32:
 81		return sbATerm, false, 81
 82	case sbSB8Close | prSTerm<<32:
 83		return sbSTerm, false, 81
 84	case sbSB8Sp | prSContinue<<32:
 85		return sbAny, false, 81
 86	case sbSB8Sp | prATerm<<32:
 87		return sbATerm, false, 81
 88	case sbSB8Sp | prSTerm<<32:
 89		return sbSTerm, false, 81
 90	case sbSTerm | prSContinue<<32:
 91		return sbAny, false, 81
 92	case sbSTerm | prATerm<<32:
 93		return sbATerm, false, 81
 94	case sbSTerm | prSTerm<<32:
 95		return sbSTerm, false, 81
 96	case sbSB8aClose | prSContinue<<32:
 97		return sbAny, false, 81
 98	case sbSB8aClose | prATerm<<32:
 99		return sbATerm, false, 81
100	case sbSB8aClose | prSTerm<<32:
101		return sbSTerm, false, 81
102	case sbSB8aSp | prSContinue<<32:
103		return sbAny, false, 81
104	case sbSB8aSp | prATerm<<32:
105		return sbATerm, false, 81
106	case sbSB8aSp | prSTerm<<32:
107		return sbSTerm, false, 81
108
109	// SB9.
110	case sbATerm | prClose<<32:
111		return sbSB8Close, false, 90
112	case sbSB7 | prClose<<32:
113		return sbSB8Close, false, 90
114	case sbSB8Close | prClose<<32:
115		return sbSB8Close, false, 90
116	case sbATerm | prSp<<32:
117		return sbSB8Sp, false, 90
118	case sbSB7 | prSp<<32:
119		return sbSB8Sp, false, 90
120	case sbSB8Close | prSp<<32:
121		return sbSB8Sp, false, 90
122	case sbSTerm | prClose<<32:
123		return sbSB8aClose, false, 90
124	case sbSB8aClose | prClose<<32:
125		return sbSB8aClose, false, 90
126	case sbSTerm | prSp<<32:
127		return sbSB8aSp, false, 90
128	case sbSB8aClose | prSp<<32:
129		return sbSB8aSp, false, 90
130	case sbATerm | prSep<<32:
131		return sbParaSep, false, 90
132	case sbATerm | prCR<<32:
133		return sbParaSep, false, 90
134	case sbATerm | prLF<<32:
135		return sbParaSep, false, 90
136	case sbSB7 | prSep<<32:
137		return sbParaSep, false, 90
138	case sbSB7 | prCR<<32:
139		return sbParaSep, false, 90
140	case sbSB7 | prLF<<32:
141		return sbParaSep, false, 90
142	case sbSB8Close | prSep<<32:
143		return sbParaSep, false, 90
144	case sbSB8Close | prCR<<32:
145		return sbParaSep, false, 90
146	case sbSB8Close | prLF<<32:
147		return sbParaSep, false, 90
148	case sbSTerm | prSep<<32:
149		return sbParaSep, false, 90
150	case sbSTerm | prCR<<32:
151		return sbParaSep, false, 90
152	case sbSTerm | prLF<<32:
153		return sbParaSep, false, 90
154	case sbSB8aClose | prSep<<32:
155		return sbParaSep, false, 90
156	case sbSB8aClose | prCR<<32:
157		return sbParaSep, false, 90
158	case sbSB8aClose | prLF<<32:
159		return sbParaSep, false, 90
160
161	// SB10.
162	case sbSB8Sp | prSp<<32:
163		return sbSB8Sp, false, 100
164	case sbSB8aSp | prSp<<32:
165		return sbSB8aSp, false, 100
166	case sbSB8Sp | prSep<<32:
167		return sbParaSep, false, 100
168	case sbSB8Sp | prCR<<32:
169		return sbParaSep, false, 100
170	case sbSB8Sp | prLF<<32:
171		return sbParaSep, false, 100
172
173	// SB11.
174	case sbATerm | prAny<<32:
175		return sbAny, true, 110
176	case sbSB7 | prAny<<32:
177		return sbAny, true, 110
178	case sbSB8Close | prAny<<32:
179		return sbAny, true, 110
180	case sbSB8Sp | prAny<<32:
181		return sbAny, true, 110
182	case sbSTerm | prAny<<32:
183		return sbAny, true, 110
184	case sbSB8aClose | prAny<<32:
185		return sbAny, true, 110
186	case sbSB8aSp | prAny<<32:
187		return sbAny, true, 110
188	// We'll always break after ParaSep due to SB4.
189
190	default:
191		return -1, false, -1
192	}
193}
194
195// transitionSentenceBreakState determines the new state of the sentence break
196// parser given the current state and the next code point. It also returns
197// whether a sentence boundary was detected. If more than one code point is
198// needed to determine the new state, the byte slice or the string starting
199// after rune "r" can be used (whichever is not nil or empty) for further
200// lookups.
201func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
202	// Determine the property of the next character.
203	nextProperty := property(sentenceBreakCodePoints, r)
204
205	// SB5 (Replacing Ignore Rules).
206	if nextProperty == prExtend || nextProperty == prFormat {
207		if state == sbParaSep || state == sbCR {
208			return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
209		}
210		if state < 0 {
211			return sbAny, true // SB1.
212		}
213		return state, false
214	}
215
216	// Find the applicable transition in the table.
217	var rule int
218	newState, sentenceBreak, rule = sbTransitions(state, nextProperty)
219	if newState < 0 {
220		// No specific transition found. Try the less specific ones.
221		anyPropState, anyPropProp, anyPropRule := sbTransitions(state, prAny)
222		anyStateState, anyStateProp, anyStateRule := sbTransitions(sbAny, nextProperty)
223		if anyPropState >= 0 && anyStateState >= 0 {
224			// Both apply. We'll use a mix (see comments for grTransitions).
225			newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
226			if anyPropRule < anyStateRule {
227				sentenceBreak, rule = anyPropProp, anyPropRule
228			}
229		} else if anyPropState >= 0 {
230			// We only have a specific state.
231			newState, sentenceBreak, rule = anyPropState, anyPropProp, anyPropRule
232			// This branch will probably never be reached because okAnyState will
233			// always be true given the current transition map. But we keep it here
234			// for future modifications to the transition map where this may not be
235			// true anymore.
236		} else if anyStateState >= 0 {
237			// We only have a specific property.
238			newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
239		} else {
240			// No known transition. SB999: Any × Any.
241			newState, sentenceBreak, rule = sbAny, false, 9990
242		}
243	}
244
245	// SB8.
246	if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
247		// Check the right side of the rule.
248		var length int
249		for nextProperty != prOLetter &&
250			nextProperty != prUpper &&
251			nextProperty != prLower &&
252			nextProperty != prSep &&
253			nextProperty != prCR &&
254			nextProperty != prLF &&
255			nextProperty != prATerm &&
256			nextProperty != prSTerm {
257			// Move on to the next rune.
258			if b != nil { // Byte slice version.
259				r, length = utf8.DecodeRune(b)
260				b = b[length:]
261			} else { // String version.
262				r, length = utf8.DecodeRuneInString(str)
263				str = str[length:]
264			}
265			if r == utf8.RuneError {
266				break
267			}
268			nextProperty = property(sentenceBreakCodePoints, r)
269		}
270		if nextProperty == prLower {
271			return sbLower, false
272		}
273	}
274
275	return
276}