1package uniseg
2
// The states of the grapheme cluster parser. A state records what the parser
// has to remember about the code points already consumed so that
// grTransitions can decide whether a boundary precedes the next code point.
// The constants are untyped so they can be combined with property constants
// in grTransitions' switch cases.
const (
	// grAny is the default state. It is also used as the "any state" wildcard
	// when looking up a transition that depends on the property only.
	grAny = iota
	// grCR is entered after a CR code point.
	grCR
	// grControlLF is entered after an LF or Control code point, including the
	// LF of a CR LF pair.
	grControlLF
	// grL is entered after a Hangul L code point.
	grL
	// grLVV is entered after a Hangul LV or V code point.
	grLVV
	// grLVTT is entered after a Hangul LVT or T code point.
	grLVTT
	// grPrepend is entered after a Prepend code point.
	grPrepend
	// grExtendedPictographic is entered after an Extended_Pictographic code
	// point and is kept while Extend code points follow.
	grExtendedPictographic
	// grExtendedPictographicZWJ is entered when a ZWJ follows the
	// grExtendedPictographic state.
	grExtendedPictographicZWJ
	// grRIOdd is entered after an odd number of consecutive Regional_Indicator
	// code points.
	grRIOdd
	// grRIEven is entered after an even number of consecutive
	// Regional_Indicator code points.
	grRIEven
)
17
// The grapheme cluster parser's breaking instructions. grTransitions returns
// one of them as its second result; it always refers to the position between
// the last and the next code point.
const (
	// grNoBoundary means the next code point continues the current cluster.
	grNoBoundary = iota
	// grBoundary means a cluster boundary lies before the next code point.
	grBoundary
)
23
24// grTransitions implements the grapheme cluster parser's state transitions.
25// Maps state and property to a new state, a breaking instruction, and rule
26// number. The breaking instruction always refers to the boundary between the
27// last and next code point. Returns negative values if no transition is found.
28//
29// This function is used as follows:
30//
31// 1. Find specific state + specific property. Stop if found.
32// 2. Find specific state + any property.
33// 3. Find any state + specific property.
34// 4. If only (2) or (3) (but not both) was found, stop.
35// 5. If both (2) and (3) were found, use state from (3) and breaking instruction
36// from the transition with the lower rule number, prefer (3) if rule numbers
37// are equal. Stop.
38// 6. Assume grAny and grBoundary.
39//
40// Unicode version 15.0.0.
41func grTransitions(state, prop int) (newState int, newProp int, boundary int) {
42 // It turns out that using a big switch statement is much faster than using
43 // a map.
44
45 switch uint64(state) | uint64(prop)<<32 {
46 // GB5
47 case grAny | prCR<<32:
48 return grCR, grBoundary, 50
49 case grAny | prLF<<32:
50 return grControlLF, grBoundary, 50
51 case grAny | prControl<<32:
52 return grControlLF, grBoundary, 50
53
54 // GB4
55 case grCR | prAny<<32:
56 return grAny, grBoundary, 40
57 case grControlLF | prAny<<32:
58 return grAny, grBoundary, 40
59
60 // GB3
61 case grCR | prLF<<32:
62 return grControlLF, grNoBoundary, 30
63
64 // GB6
65 case grAny | prL<<32:
66 return grL, grBoundary, 9990
67 case grL | prL<<32:
68 return grL, grNoBoundary, 60
69 case grL | prV<<32:
70 return grLVV, grNoBoundary, 60
71 case grL | prLV<<32:
72 return grLVV, grNoBoundary, 60
73 case grL | prLVT<<32:
74 return grLVTT, grNoBoundary, 60
75
76 // GB7
77 case grAny | prLV<<32:
78 return grLVV, grBoundary, 9990
79 case grAny | prV<<32:
80 return grLVV, grBoundary, 9990
81 case grLVV | prV<<32:
82 return grLVV, grNoBoundary, 70
83 case grLVV | prT<<32:
84 return grLVTT, grNoBoundary, 70
85
86 // GB8
87 case grAny | prLVT<<32:
88 return grLVTT, grBoundary, 9990
89 case grAny | prT<<32:
90 return grLVTT, grBoundary, 9990
91 case grLVTT | prT<<32:
92 return grLVTT, grNoBoundary, 80
93
94 // GB9
95 case grAny | prExtend<<32:
96 return grAny, grNoBoundary, 90
97 case grAny | prZWJ<<32:
98 return grAny, grNoBoundary, 90
99
100 // GB9a
101 case grAny | prSpacingMark<<32:
102 return grAny, grNoBoundary, 91
103
104 // GB9b
105 case grAny | prPrepend<<32:
106 return grPrepend, grBoundary, 9990
107 case grPrepend | prAny<<32:
108 return grAny, grNoBoundary, 92
109
110 // GB11
111 case grAny | prExtendedPictographic<<32:
112 return grExtendedPictographic, grBoundary, 9990
113 case grExtendedPictographic | prExtend<<32:
114 return grExtendedPictographic, grNoBoundary, 110
115 case grExtendedPictographic | prZWJ<<32:
116 return grExtendedPictographicZWJ, grNoBoundary, 110
117 case grExtendedPictographicZWJ | prExtendedPictographic<<32:
118 return grExtendedPictographic, grNoBoundary, 110
119
120 // GB12 / GB13
121 case grAny | prRegionalIndicator<<32:
122 return grRIOdd, grBoundary, 9990
123 case grRIOdd | prRegionalIndicator<<32:
124 return grRIEven, grNoBoundary, 120
125 case grRIEven | prRegionalIndicator<<32:
126 return grRIOdd, grBoundary, 120
127 default:
128 return -1, -1, -1
129 }
130}
131
132// transitionGraphemeState determines the new state of the grapheme cluster
133// parser given the current state and the next code point. It also returns the
134// code point's grapheme property (the value mapped by the [graphemeCodePoints]
135// table) and whether a cluster boundary was detected.
136func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) {
137 // Determine the property of the next character.
138 prop = propertyGraphemes(r)
139
140 // Find the applicable transition.
141 nextState, nextProp, _ := grTransitions(state, prop)
142 if nextState >= 0 {
143 // We have a specific transition. We'll use it.
144 return nextState, prop, nextProp == grBoundary
145 }
146
147 // No specific transition found. Try the less specific ones.
148 anyPropState, anyPropProp, anyPropRule := grTransitions(state, prAny)
149 anyStateState, anyStateProp, anyStateRule := grTransitions(grAny, prop)
150 if anyPropState >= 0 && anyStateState >= 0 {
151 // Both apply. We'll use a mix (see comments for grTransitions).
152 newState = anyStateState
153 boundary = anyStateProp == grBoundary
154 if anyPropRule < anyStateRule {
155 boundary = anyPropProp == grBoundary
156 }
157 return
158 }
159
160 if anyPropState >= 0 {
161 // We only have a specific state.
162 return anyPropState, prop, anyPropProp == grBoundary
163 // This branch will probably never be reached because okAnyState will
164 // always be true given the current transition map. But we keep it here
165 // for future modifications to the transition map where this may not be
166 // true anymore.
167 }
168
169 if anyStateState >= 0 {
170 // We only have a specific property.
171 return anyStateState, prop, anyStateProp == grBoundary
172 }
173
174 // No known transition. GB999: Any รท Any.
175 return grAny, prop, true
176}