grapheme.go

  1package uniseg
  2
  3import "unicode/utf8"
  4
  5// Graphemes implements an iterator over Unicode grapheme clusters, or
  6// user-perceived characters. While iterating, it also provides information
  7// about word boundaries, sentence boundaries, line breaks, and monospace
  8// character widths.
  9//
 10// After constructing the class via [NewGraphemes] for a given string "str",
 11// [Graphemes.Next] is called for every grapheme cluster in a loop until it
 12// returns false. Inside the loop, information about the grapheme cluster as
 13// well as boundary information and character width is available via the various
 14// methods (see examples below).
 15//
 16// This class basically wraps the [StepString] parser and provides a convenient
 17// interface to it. If you are only interested in some parts of this package's
 18// functionality, using the specialized functions starting with "First" is
 19// almost always faster.
 20type Graphemes struct {
 21	// The original string.
 22	original string
 23
 24	// The remaining string to be parsed.
 25	remaining string
 26
 27	// The current grapheme cluster.
 28	cluster string
 29
 30	// The byte offset of the current grapheme cluster relative to the original
 31	// string.
 32	offset int
 33
 34	// The current boundary information of the [Step] parser.
 35	boundaries int
 36
 37	// The current state of the [Step] parser.
 38	state int
 39}
 40
 41// NewGraphemes returns a new grapheme cluster iterator.
 42func NewGraphemes(str string) *Graphemes {
 43	return &Graphemes{
 44		original:  str,
 45		remaining: str,
 46		state:     -1,
 47	}
 48}
 49
 50// Next advances the iterator by one grapheme cluster and returns false if no
 51// clusters are left. This function must be called before the first cluster is
 52// accessed.
 53func (g *Graphemes) Next() bool {
 54	if len(g.remaining) == 0 {
 55		// We're already past the end.
 56		g.state = -2
 57		g.cluster = ""
 58		return false
 59	}
 60	g.offset += len(g.cluster)
 61	g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state)
 62	return true
 63}
 64
 65// Runes returns a slice of runes (code points) which corresponds to the current
 66// grapheme cluster. If the iterator is already past the end or [Graphemes.Next]
 67// has not yet been called, nil is returned.
 68func (g *Graphemes) Runes() []rune {
 69	if g.state < 0 {
 70		return nil
 71	}
 72	return []rune(g.cluster)
 73}
 74
 75// Str returns a substring of the original string which corresponds to the
 76// current grapheme cluster. If the iterator is already past the end or
 77// [Graphemes.Next] has not yet been called, an empty string is returned.
 78func (g *Graphemes) Str() string {
 79	return g.cluster
 80}
 81
 82// Bytes returns a byte slice which corresponds to the current grapheme cluster.
 83// If the iterator is already past the end or [Graphemes.Next] has not yet been
 84// called, nil is returned.
 85func (g *Graphemes) Bytes() []byte {
 86	if g.state < 0 {
 87		return nil
 88	}
 89	return []byte(g.cluster)
 90}
 91
 92// Positions returns the interval of the current grapheme cluster as byte
 93// positions into the original string. The first returned value "from" indexes
 94// the first byte and the second returned value "to" indexes the first byte that
 95// is not included anymore, i.e. str[from:to] is the current grapheme cluster of
 96// the original string "str". If [Graphemes.Next] has not yet been called, both
 97// values are 0. If the iterator is already past the end, both values are 1.
 98func (g *Graphemes) Positions() (int, int) {
 99	if g.state == -1 {
100		return 0, 0
101	} else if g.state == -2 {
102		return 1, 1
103	}
104	return g.offset, g.offset + len(g.cluster)
105}
106
107// IsWordBoundary returns true if a word ends after the current grapheme
108// cluster.
109func (g *Graphemes) IsWordBoundary() bool {
110	if g.state < 0 {
111		return true
112	}
113	return g.boundaries&MaskWord != 0
114}
115
116// IsSentenceBoundary returns true if a sentence ends after the current
117// grapheme cluster.
118func (g *Graphemes) IsSentenceBoundary() bool {
119	if g.state < 0 {
120		return true
121	}
122	return g.boundaries&MaskSentence != 0
123}
124
125// LineBreak returns whether the line can be broken after the current grapheme
126// cluster. A value of [LineDontBreak] means the line may not be broken, a value
127// of [LineMustBreak] means the line must be broken, and a value of
128// [LineCanBreak] means the line may or may not be broken.
129func (g *Graphemes) LineBreak() int {
130	if g.state == -1 {
131		return LineDontBreak
132	}
133	if g.state == -2 {
134		return LineMustBreak
135	}
136	return g.boundaries & MaskLine
137}
138
139// Width returns the monospace width of the current grapheme cluster.
140func (g *Graphemes) Width() int {
141	if g.state < 0 {
142		return 0
143	}
144	return g.boundaries >> ShiftWidth
145}
146
147// Reset puts the iterator into its initial state such that the next call to
148// [Graphemes.Next] sets it to the first grapheme cluster again.
149func (g *Graphemes) Reset() {
150	g.state = -1
151	g.offset = 0
152	g.cluster = ""
153	g.remaining = g.original
154}
155
156// GraphemeClusterCount returns the number of user-perceived characters
157// (grapheme clusters) for the given string.
158func GraphemeClusterCount(s string) (n int) {
159	state := -1
160	for len(s) > 0 {
161		_, s, _, state = FirstGraphemeClusterInString(s, state)
162		n++
163	}
164	return
165}
166
167// ReverseString reverses the given string while observing grapheme cluster
168// boundaries.
169func ReverseString(s string) string {
170	str := []byte(s)
171	reversed := make([]byte, len(str))
172	state := -1
173	index := len(str)
174	for len(str) > 0 {
175		var cluster []byte
176		cluster, str, _, state = FirstGraphemeCluster(str, state)
177		index -= len(cluster)
178		copy(reversed[index:], cluster)
179		if index <= len(str)/2 {
180			break
181		}
182	}
183	return string(reversed)
184}
185
186// The number of bits the grapheme property must be shifted to make place for
187// grapheme states.
188const shiftGraphemePropState = 4
189
190// FirstGraphemeCluster returns the first grapheme cluster found in the given
191// byte slice according to the rules of [Unicode Standard Annex #29, Grapheme
192// Cluster Boundaries]. This function can be called continuously to extract all
193// grapheme clusters from a byte slice, as illustrated in the example below.
194//
195// If you don't know the current state, for example when calling the function
196// for the first time, you must pass -1. For consecutive calls, pass the state
197// and rest slice returned by the previous call.
198//
199// The "rest" slice is the sub-slice of the original byte slice "b" starting
200// after the last byte of the identified grapheme cluster. If the length of the
201// "rest" slice is 0, the entire byte slice "b" has been processed. The
202// "cluster" byte slice is the sub-slice of the input slice containing the
203// identified grapheme cluster.
204//
205// The returned width is the width of the grapheme cluster for most monospace
206// fonts where a value of 1 represents one character cell.
207//
208// Given an empty byte slice "b", the function returns nil values.
209//
210// While slightly less convenient than using the Graphemes class, this function
211// has much better performance and makes no allocations. It lends itself well to
212// large byte slices.
213//
214// [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
215func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) {
216	// An empty byte slice returns nothing.
217	if len(b) == 0 {
218		return
219	}
220
221	// Extract the first rune.
222	r, length := utf8.DecodeRune(b)
223	if len(b) <= length { // If we're already past the end, there is nothing else to parse.
224		var prop int
225		if state < 0 {
226			prop = propertyGraphemes(r)
227		} else {
228			prop = state >> shiftGraphemePropState
229		}
230		return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
231	}
232
233	// If we don't know the state, determine it now.
234	var firstProp int
235	if state < 0 {
236		state, firstProp, _ = transitionGraphemeState(state, r)
237	} else {
238		firstProp = state >> shiftGraphemePropState
239	}
240	width += runeWidth(r, firstProp)
241
242	// Transition until we find a boundary.
243	for {
244		var (
245			prop     int
246			boundary bool
247		)
248
249		r, l := utf8.DecodeRune(b[length:])
250		state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
251
252		if boundary {
253			return b[:length], b[length:], width, state | (prop << shiftGraphemePropState)
254		}
255
256		if firstProp == prExtendedPictographic {
257			if r == vs15 {
258				width = 1
259			} else if r == vs16 {
260				width = 2
261			}
262		} else if firstProp != prRegionalIndicator && firstProp != prL {
263			width += runeWidth(r, prop)
264		}
265
266		length += l
267		if len(b) <= length {
268			return b, nil, width, grAny | (prop << shiftGraphemePropState)
269		}
270	}
271}
272
273// FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and
274// outputs are strings.
275func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) {
276	// An empty string returns nothing.
277	if len(str) == 0 {
278		return
279	}
280
281	// Extract the first rune.
282	r, length := utf8.DecodeRuneInString(str)
283	if len(str) <= length { // If we're already past the end, there is nothing else to parse.
284		var prop int
285		if state < 0 {
286			prop = propertyGraphemes(r)
287		} else {
288			prop = state >> shiftGraphemePropState
289		}
290		return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
291	}
292
293	// If we don't know the state, determine it now.
294	var firstProp int
295	if state < 0 {
296		state, firstProp, _ = transitionGraphemeState(state, r)
297	} else {
298		firstProp = state >> shiftGraphemePropState
299	}
300	width += runeWidth(r, firstProp)
301
302	// Transition until we find a boundary.
303	for {
304		var (
305			prop     int
306			boundary bool
307		)
308
309		r, l := utf8.DecodeRuneInString(str[length:])
310		state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
311
312		if boundary {
313			return str[:length], str[length:], width, state | (prop << shiftGraphemePropState)
314		}
315
316		if firstProp == prExtendedPictographic {
317			if r == vs15 {
318				width = 1
319			} else if r == vs16 {
320				width = 2
321			}
322		} else if firstProp != prRegionalIndicator && firstProp != prL {
323			width += runeWidth(r, prop)
324		}
325
326		length += l
327		if len(str) <= length {
328			return str, "", width, grAny | (prop << shiftGraphemePropState)
329		}
330	}
331}