1package uniseg
2
3import "unicode/utf8"
4
5// Graphemes implements an iterator over Unicode grapheme clusters, or
6// user-perceived characters. While iterating, it also provides information
7// about word boundaries, sentence boundaries, line breaks, and monospace
8// character widths.
9//
10// After constructing the class via [NewGraphemes] for a given string "str",
11// [Graphemes.Next] is called for every grapheme cluster in a loop until it
12// returns false. Inside the loop, information about the grapheme cluster as
13// well as boundary information and character width is available via the various
14// methods (see examples below).
15//
16// This class basically wraps the [StepString] parser and provides a convenient
17// interface to it. If you are only interested in some parts of this package's
18// functionality, using the specialized functions starting with "First" is
19// almost always faster.
20type Graphemes struct {
21 // The original string.
22 original string
23
24 // The remaining string to be parsed.
25 remaining string
26
27 // The current grapheme cluster.
28 cluster string
29
30 // The byte offset of the current grapheme cluster relative to the original
31 // string.
32 offset int
33
34 // The current boundary information of the [Step] parser.
35 boundaries int
36
37 // The current state of the [Step] parser.
38 state int
39}
40
41// NewGraphemes returns a new grapheme cluster iterator.
42func NewGraphemes(str string) *Graphemes {
43 return &Graphemes{
44 original: str,
45 remaining: str,
46 state: -1,
47 }
48}
49
50// Next advances the iterator by one grapheme cluster and returns false if no
51// clusters are left. This function must be called before the first cluster is
52// accessed.
53func (g *Graphemes) Next() bool {
54 if len(g.remaining) == 0 {
55 // We're already past the end.
56 g.state = -2
57 g.cluster = ""
58 return false
59 }
60 g.offset += len(g.cluster)
61 g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state)
62 return true
63}
64
65// Runes returns a slice of runes (code points) which corresponds to the current
66// grapheme cluster. If the iterator is already past the end or [Graphemes.Next]
67// has not yet been called, nil is returned.
68func (g *Graphemes) Runes() []rune {
69 if g.state < 0 {
70 return nil
71 }
72 return []rune(g.cluster)
73}
74
75// Str returns a substring of the original string which corresponds to the
76// current grapheme cluster. If the iterator is already past the end or
77// [Graphemes.Next] has not yet been called, an empty string is returned.
78func (g *Graphemes) Str() string {
79 return g.cluster
80}
81
82// Bytes returns a byte slice which corresponds to the current grapheme cluster.
83// If the iterator is already past the end or [Graphemes.Next] has not yet been
84// called, nil is returned.
85func (g *Graphemes) Bytes() []byte {
86 if g.state < 0 {
87 return nil
88 }
89 return []byte(g.cluster)
90}
91
92// Positions returns the interval of the current grapheme cluster as byte
93// positions into the original string. The first returned value "from" indexes
94// the first byte and the second returned value "to" indexes the first byte that
95// is not included anymore, i.e. str[from:to] is the current grapheme cluster of
96// the original string "str". If [Graphemes.Next] has not yet been called, both
97// values are 0. If the iterator is already past the end, both values are 1.
98func (g *Graphemes) Positions() (int, int) {
99 if g.state == -1 {
100 return 0, 0
101 } else if g.state == -2 {
102 return 1, 1
103 }
104 return g.offset, g.offset + len(g.cluster)
105}
106
107// IsWordBoundary returns true if a word ends after the current grapheme
108// cluster.
109func (g *Graphemes) IsWordBoundary() bool {
110 if g.state < 0 {
111 return true
112 }
113 return g.boundaries&MaskWord != 0
114}
115
116// IsSentenceBoundary returns true if a sentence ends after the current
117// grapheme cluster.
118func (g *Graphemes) IsSentenceBoundary() bool {
119 if g.state < 0 {
120 return true
121 }
122 return g.boundaries&MaskSentence != 0
123}
124
125// LineBreak returns whether the line can be broken after the current grapheme
126// cluster. A value of [LineDontBreak] means the line may not be broken, a value
127// of [LineMustBreak] means the line must be broken, and a value of
128// [LineCanBreak] means the line may or may not be broken.
129func (g *Graphemes) LineBreak() int {
130 if g.state == -1 {
131 return LineDontBreak
132 }
133 if g.state == -2 {
134 return LineMustBreak
135 }
136 return g.boundaries & MaskLine
137}
138
139// Width returns the monospace width of the current grapheme cluster.
140func (g *Graphemes) Width() int {
141 if g.state < 0 {
142 return 0
143 }
144 return g.boundaries >> ShiftWidth
145}
146
147// Reset puts the iterator into its initial state such that the next call to
148// [Graphemes.Next] sets it to the first grapheme cluster again.
149func (g *Graphemes) Reset() {
150 g.state = -1
151 g.offset = 0
152 g.cluster = ""
153 g.remaining = g.original
154}
155
156// GraphemeClusterCount returns the number of user-perceived characters
157// (grapheme clusters) for the given string.
158func GraphemeClusterCount(s string) (n int) {
159 state := -1
160 for len(s) > 0 {
161 _, s, _, state = FirstGraphemeClusterInString(s, state)
162 n++
163 }
164 return
165}
166
167// ReverseString reverses the given string while observing grapheme cluster
168// boundaries.
169func ReverseString(s string) string {
170 str := []byte(s)
171 reversed := make([]byte, len(str))
172 state := -1
173 index := len(str)
174 for len(str) > 0 {
175 var cluster []byte
176 cluster, str, _, state = FirstGraphemeCluster(str, state)
177 index -= len(cluster)
178 copy(reversed[index:], cluster)
179 if index <= len(str)/2 {
180 break
181 }
182 }
183 return string(reversed)
184}
185
186// The number of bits the grapheme property must be shifted to make place for
187// grapheme states.
188const shiftGraphemePropState = 4
189
190// FirstGraphemeCluster returns the first grapheme cluster found in the given
191// byte slice according to the rules of [Unicode Standard Annex #29, Grapheme
192// Cluster Boundaries]. This function can be called continuously to extract all
193// grapheme clusters from a byte slice, as illustrated in the example below.
194//
195// If you don't know the current state, for example when calling the function
196// for the first time, you must pass -1. For consecutive calls, pass the state
197// and rest slice returned by the previous call.
198//
199// The "rest" slice is the sub-slice of the original byte slice "b" starting
200// after the last byte of the identified grapheme cluster. If the length of the
201// "rest" slice is 0, the entire byte slice "b" has been processed. The
202// "cluster" byte slice is the sub-slice of the input slice containing the
203// identified grapheme cluster.
204//
205// The returned width is the width of the grapheme cluster for most monospace
206// fonts where a value of 1 represents one character cell.
207//
208// Given an empty byte slice "b", the function returns nil values.
209//
210// While slightly less convenient than using the Graphemes class, this function
211// has much better performance and makes no allocations. It lends itself well to
212// large byte slices.
213//
214// [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
215func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) {
216 // An empty byte slice returns nothing.
217 if len(b) == 0 {
218 return
219 }
220
221 // Extract the first rune.
222 r, length := utf8.DecodeRune(b)
223 if len(b) <= length { // If we're already past the end, there is nothing else to parse.
224 var prop int
225 if state < 0 {
226 prop = propertyGraphemes(r)
227 } else {
228 prop = state >> shiftGraphemePropState
229 }
230 return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
231 }
232
233 // If we don't know the state, determine it now.
234 var firstProp int
235 if state < 0 {
236 state, firstProp, _ = transitionGraphemeState(state, r)
237 } else {
238 firstProp = state >> shiftGraphemePropState
239 }
240 width += runeWidth(r, firstProp)
241
242 // Transition until we find a boundary.
243 for {
244 var (
245 prop int
246 boundary bool
247 )
248
249 r, l := utf8.DecodeRune(b[length:])
250 state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
251
252 if boundary {
253 return b[:length], b[length:], width, state | (prop << shiftGraphemePropState)
254 }
255
256 if firstProp == prExtendedPictographic {
257 if r == vs15 {
258 width = 1
259 } else if r == vs16 {
260 width = 2
261 }
262 } else if firstProp != prRegionalIndicator && firstProp != prL {
263 width += runeWidth(r, prop)
264 }
265
266 length += l
267 if len(b) <= length {
268 return b, nil, width, grAny | (prop << shiftGraphemePropState)
269 }
270 }
271}
272
273// FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and
274// outputs are strings.
275func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) {
276 // An empty string returns nothing.
277 if len(str) == 0 {
278 return
279 }
280
281 // Extract the first rune.
282 r, length := utf8.DecodeRuneInString(str)
283 if len(str) <= length { // If we're already past the end, there is nothing else to parse.
284 var prop int
285 if state < 0 {
286 prop = propertyGraphemes(r)
287 } else {
288 prop = state >> shiftGraphemePropState
289 }
290 return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
291 }
292
293 // If we don't know the state, determine it now.
294 var firstProp int
295 if state < 0 {
296 state, firstProp, _ = transitionGraphemeState(state, r)
297 } else {
298 firstProp = state >> shiftGraphemePropState
299 }
300 width += runeWidth(r, firstProp)
301
302 // Transition until we find a boundary.
303 for {
304 var (
305 prop int
306 boundary bool
307 )
308
309 r, l := utf8.DecodeRuneInString(str[length:])
310 state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
311
312 if boundary {
313 return str[:length], str[length:], width, state | (prop << shiftGraphemePropState)
314 }
315
316 if firstProp == prExtendedPictographic {
317 if r == vs15 {
318 width = 1
319 } else if r == vs16 {
320 width = 2
321 }
322 } else if firstProp != prRegionalIndicator && firstProp != prL {
323 width += runeWidth(r, prop)
324 }
325
326 length += l
327 if len(str) <= length {
328 return str, "", width, grAny | (prop << shiftGraphemePropState)
329 }
330 }
331}