1package uniseg
2
3import "unicode/utf8"
4
5// The bit masks used to extract boundary information returned by [Step].
6const (
7 MaskLine = 3
8 MaskWord = 4
9 MaskSentence = 8
10)
11
12// The number of bits to shift the boundary information returned by [Step] to
13// obtain the monospace width of the grapheme cluster.
14const ShiftWidth = 4
15
16// The bit positions by which boundary flags are shifted by the [Step] function.
17// These must correspond to the Mask constants.
18const (
19 shiftWord = 2
20 shiftSentence = 3
21 // shiftwWidth is ShiftWidth above. No mask as these are always the remaining bits.
22)
23
24// The bit positions by which states are shifted by the [Step] function. These
25// values must ensure state values defined for each of the boundary algorithms
26// don't overlap (and that they all still fit in a single int). These must
27// correspond to the Mask constants.
28const (
29 shiftWordState = 4
30 shiftSentenceState = 9
31 shiftLineState = 13
32 shiftPropState = 21 // No mask as these are always the remaining bits.
33)
34
35// The bit mask used to extract the state returned by the [Step] function, after
36// shifting. These values must correspond to the shift constants.
37const (
38 maskGraphemeState = 0xf
39 maskWordState = 0x1f
40 maskSentenceState = 0xf
41 maskLineState = 0xff
42)
43
44// Step returns the first grapheme cluster (user-perceived character) found in
45// the given byte slice. It also returns information about the boundary between
46// that grapheme cluster and the one following it as well as the monospace width
47// of the grapheme cluster. There are three types of boundary information: word
48// boundaries, sentence boundaries, and line breaks. This function is therefore
49// a combination of [FirstGraphemeCluster], [FirstWord], [FirstSentence], and
50// [FirstLineSegment].
51//
52// The "boundaries" return value can be evaluated as follows:
53//
54// - boundaries&MaskWord != 0: The boundary is a word boundary.
55// - boundaries&MaskWord == 0: The boundary is not a word boundary.
56// - boundaries&MaskSentence != 0: The boundary is a sentence boundary.
57// - boundaries&MaskSentence == 0: The boundary is not a sentence boundary.
58// - boundaries&MaskLine == LineDontBreak: You must not break the line at the
59// boundary.
60// - boundaries&MaskLine == LineMustBreak: You must break the line at the
61// boundary.
62// - boundaries&MaskLine == LineCanBreak: You may or may not break the line at
63// the boundary.
64// - boundaries >> ShiftWidth: The width of the grapheme cluster for most
65// monospace fonts where a value of 1 represents one character cell.
66//
67// This function can be called continuously to extract all grapheme clusters
68// from a byte slice, as illustrated in the examples below.
69//
70// If you don't know which state to pass, for example when calling the function
71// for the first time, you must pass -1. For consecutive calls, pass the state
72// and rest slice returned by the previous call.
73//
74// The "rest" slice is the sub-slice of the original byte slice "b" starting
75// after the last byte of the identified grapheme cluster. If the length of the
76// "rest" slice is 0, the entire byte slice "b" has been processed. The
77// "cluster" byte slice is the sub-slice of the input slice containing the
78// first identified grapheme cluster.
79//
80// Given an empty byte slice "b", the function returns nil values.
81//
82// While slightly less convenient than using the Graphemes class, this function
83// has much better performance and makes no allocations. It lends itself well to
84// large byte slices.
85//
86// Note that in accordance with [UAX #14 LB3], the final segment will end with
87// a mandatory line break (boundaries&MaskLine == LineMustBreak). You can choose
88// to ignore this by checking if the length of the "rest" slice is 0 and calling
89// [HasTrailingLineBreak] or [HasTrailingLineBreakInString] on the last rune.
90//
91// [UAX #14 LB3]: https://www.unicode.org/reports/tr14/#Algorithm
92func Step(b []byte, state int) (cluster, rest []byte, boundaries int, newState int) {
93 // An empty byte slice returns nothing.
94 if len(b) == 0 {
95 return
96 }
97
98 // Extract the first rune.
99 r, length := utf8.DecodeRune(b)
100 if len(b) <= length { // If we're already past the end, there is nothing else to parse.
101 var prop int
102 if state < 0 {
103 prop = propertyGraphemes(r)
104 } else {
105 prop = state >> shiftPropState
106 }
107 return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
108 }
109
110 // If we don't know the state, determine it now.
111 var graphemeState, wordState, sentenceState, lineState, firstProp int
112 remainder := b[length:]
113 if state < 0 {
114 graphemeState, firstProp, _ = transitionGraphemeState(state, r)
115 wordState, _ = transitionWordBreakState(state, r, remainder, "")
116 sentenceState, _ = transitionSentenceBreakState(state, r, remainder, "")
117 lineState, _ = transitionLineBreakState(state, r, remainder, "")
118 } else {
119 graphemeState = state & maskGraphemeState
120 wordState = (state >> shiftWordState) & maskWordState
121 sentenceState = (state >> shiftSentenceState) & maskSentenceState
122 lineState = (state >> shiftLineState) & maskLineState
123 firstProp = state >> shiftPropState
124 }
125
126 // Transition until we find a grapheme cluster boundary.
127 width := runeWidth(r, firstProp)
128 for {
129 var (
130 graphemeBoundary, wordBoundary, sentenceBoundary bool
131 lineBreak, prop int
132 )
133
134 r, l := utf8.DecodeRune(remainder)
135 remainder = b[length+l:]
136
137 graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r)
138 wordState, wordBoundary = transitionWordBreakState(wordState, r, remainder, "")
139 sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, remainder, "")
140 lineState, lineBreak = transitionLineBreakState(lineState, r, remainder, "")
141
142 if graphemeBoundary {
143 boundary := lineBreak | (width << ShiftWidth)
144 if wordBoundary {
145 boundary |= 1 << shiftWord
146 }
147 if sentenceBoundary {
148 boundary |= 1 << shiftSentence
149 }
150 return b[:length], b[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState)
151 }
152
153 if firstProp == prExtendedPictographic {
154 if r == vs15 {
155 width = 1
156 } else if r == vs16 {
157 width = 2
158 }
159 } else if firstProp != prRegionalIndicator && firstProp != prL {
160 width += runeWidth(r, prop)
161 }
162
163 length += l
164 if len(b) <= length {
165 return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
166 }
167 }
168}
169
170// StepString is like [Step] but its input and outputs are strings.
171func StepString(str string, state int) (cluster, rest string, boundaries int, newState int) {
172 // An empty byte slice returns nothing.
173 if len(str) == 0 {
174 return
175 }
176
177 // Extract the first rune.
178 r, length := utf8.DecodeRuneInString(str)
179 if len(str) <= length { // If we're already past the end, there is nothing else to parse.
180 prop := propertyGraphemes(r)
181 return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
182 }
183
184 // If we don't know the state, determine it now.
185 var graphemeState, wordState, sentenceState, lineState, firstProp int
186 remainder := str[length:]
187 if state < 0 {
188 graphemeState, firstProp, _ = transitionGraphemeState(state, r)
189 wordState, _ = transitionWordBreakState(state, r, nil, remainder)
190 sentenceState, _ = transitionSentenceBreakState(state, r, nil, remainder)
191 lineState, _ = transitionLineBreakState(state, r, nil, remainder)
192 } else {
193 graphemeState = state & maskGraphemeState
194 wordState = (state >> shiftWordState) & maskWordState
195 sentenceState = (state >> shiftSentenceState) & maskSentenceState
196 lineState = (state >> shiftLineState) & maskLineState
197 firstProp = state >> shiftPropState
198 }
199
200 // Transition until we find a grapheme cluster boundary.
201 width := runeWidth(r, firstProp)
202 for {
203 var (
204 graphemeBoundary, wordBoundary, sentenceBoundary bool
205 lineBreak, prop int
206 )
207
208 r, l := utf8.DecodeRuneInString(remainder)
209 remainder = str[length+l:]
210
211 graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r)
212 wordState, wordBoundary = transitionWordBreakState(wordState, r, nil, remainder)
213 sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, nil, remainder)
214 lineState, lineBreak = transitionLineBreakState(lineState, r, nil, remainder)
215
216 if graphemeBoundary {
217 boundary := lineBreak | (width << ShiftWidth)
218 if wordBoundary {
219 boundary |= 1 << shiftWord
220 }
221 if sentenceBoundary {
222 boundary |= 1 << shiftSentence
223 }
224 return str[:length], str[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState)
225 }
226
227 if firstProp == prExtendedPictographic {
228 if r == vs15 {
229 width = 1
230 } else if r == vs16 {
231 width = 2
232 }
233 } else if firstProp != prRegionalIndicator && firstProp != prL {
234 width += runeWidth(r, prop)
235 }
236
237 length += l
238 if len(str) <= length {
239 return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
240 }
241 }
242}