graphemerules.go

  1package uniseg
  2
  3// The states of the grapheme cluster parser.
  4const (
  5	grAny = iota
  6	grCR
  7	grControlLF
  8	grL
  9	grLVV
 10	grLVTT
 11	grPrepend
 12	grExtendedPictographic
 13	grExtendedPictographicZWJ
 14	grRIOdd
 15	grRIEven
 16)
 17
 18// The grapheme cluster parser's breaking instructions.
 19const (
 20	grNoBoundary = iota
 21	grBoundary
 22)
 23
 24// grTransitions implements the grapheme cluster parser's state transitions.
 25// Maps state and property to a new state, a breaking instruction, and rule
 26// number. The breaking instruction always refers to the boundary between the
 27// last and next code point. Returns negative values if no transition is found.
 28//
 29// This function is used as follows:
 30//
 31//  1. Find specific state + specific property. Stop if found.
 32//  2. Find specific state + any property.
 33//  3. Find any state + specific property.
 34//  4. If only (2) or (3) (but not both) was found, stop.
 35//  5. If both (2) and (3) were found, use state from (3) and breaking instruction
 36//     from the transition with the lower rule number, prefer (3) if rule numbers
 37//     are equal. Stop.
 38//  6. Assume grAny and grBoundary.
 39//
 40// Unicode version 15.0.0.
 41func grTransitions(state, prop int) (newState int, newProp int, boundary int) {
 42	// It turns out that using a big switch statement is much faster than using
 43	// a map.
 44
 45	switch uint64(state) | uint64(prop)<<32 {
 46	// GB5
 47	case grAny | prCR<<32:
 48		return grCR, grBoundary, 50
 49	case grAny | prLF<<32:
 50		return grControlLF, grBoundary, 50
 51	case grAny | prControl<<32:
 52		return grControlLF, grBoundary, 50
 53
 54	// GB4
 55	case grCR | prAny<<32:
 56		return grAny, grBoundary, 40
 57	case grControlLF | prAny<<32:
 58		return grAny, grBoundary, 40
 59
 60	// GB3
 61	case grCR | prLF<<32:
 62		return grControlLF, grNoBoundary, 30
 63
 64	// GB6
 65	case grAny | prL<<32:
 66		return grL, grBoundary, 9990
 67	case grL | prL<<32:
 68		return grL, grNoBoundary, 60
 69	case grL | prV<<32:
 70		return grLVV, grNoBoundary, 60
 71	case grL | prLV<<32:
 72		return grLVV, grNoBoundary, 60
 73	case grL | prLVT<<32:
 74		return grLVTT, grNoBoundary, 60
 75
 76	// GB7
 77	case grAny | prLV<<32:
 78		return grLVV, grBoundary, 9990
 79	case grAny | prV<<32:
 80		return grLVV, grBoundary, 9990
 81	case grLVV | prV<<32:
 82		return grLVV, grNoBoundary, 70
 83	case grLVV | prT<<32:
 84		return grLVTT, grNoBoundary, 70
 85
 86	// GB8
 87	case grAny | prLVT<<32:
 88		return grLVTT, grBoundary, 9990
 89	case grAny | prT<<32:
 90		return grLVTT, grBoundary, 9990
 91	case grLVTT | prT<<32:
 92		return grLVTT, grNoBoundary, 80
 93
 94	// GB9
 95	case grAny | prExtend<<32:
 96		return grAny, grNoBoundary, 90
 97	case grAny | prZWJ<<32:
 98		return grAny, grNoBoundary, 90
 99
100	// GB9a
101	case grAny | prSpacingMark<<32:
102		return grAny, grNoBoundary, 91
103
104	// GB9b
105	case grAny | prPrepend<<32:
106		return grPrepend, grBoundary, 9990
107	case grPrepend | prAny<<32:
108		return grAny, grNoBoundary, 92
109
110	// GB11
111	case grAny | prExtendedPictographic<<32:
112		return grExtendedPictographic, grBoundary, 9990
113	case grExtendedPictographic | prExtend<<32:
114		return grExtendedPictographic, grNoBoundary, 110
115	case grExtendedPictographic | prZWJ<<32:
116		return grExtendedPictographicZWJ, grNoBoundary, 110
117	case grExtendedPictographicZWJ | prExtendedPictographic<<32:
118		return grExtendedPictographic, grNoBoundary, 110
119
120	// GB12 / GB13
121	case grAny | prRegionalIndicator<<32:
122		return grRIOdd, grBoundary, 9990
123	case grRIOdd | prRegionalIndicator<<32:
124		return grRIEven, grNoBoundary, 120
125	case grRIEven | prRegionalIndicator<<32:
126		return grRIOdd, grBoundary, 120
127	default:
128		return -1, -1, -1
129	}
130}
131
132// transitionGraphemeState determines the new state of the grapheme cluster
133// parser given the current state and the next code point. It also returns the
134// code point's grapheme property (the value mapped by the [graphemeCodePoints]
135// table) and whether a cluster boundary was detected.
136func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) {
137	// Determine the property of the next character.
138	prop = propertyGraphemes(r)
139
140	// Find the applicable transition.
141	nextState, nextProp, _ := grTransitions(state, prop)
142	if nextState >= 0 {
143		// We have a specific transition. We'll use it.
144		return nextState, prop, nextProp == grBoundary
145	}
146
147	// No specific transition found. Try the less specific ones.
148	anyPropState, anyPropProp, anyPropRule := grTransitions(state, prAny)
149	anyStateState, anyStateProp, anyStateRule := grTransitions(grAny, prop)
150	if anyPropState >= 0 && anyStateState >= 0 {
151		// Both apply. We'll use a mix (see comments for grTransitions).
152		newState = anyStateState
153		boundary = anyStateProp == grBoundary
154		if anyPropRule < anyStateRule {
155			boundary = anyPropProp == grBoundary
156		}
157		return
158	}
159
160	if anyPropState >= 0 {
161		// We only have a specific state.
162		return anyPropState, prop, anyPropProp == grBoundary
163		// This branch will probably never be reached because okAnyState will
164		// always be true given the current transition map. But we keep it here
165		// for future modifications to the transition map where this may not be
166		// true anymore.
167	}
168
169	if anyStateState >= 0 {
170		// We only have a specific property.
171		return anyStateState, prop, anyStateProp == grBoundary
172	}
173
174	// No known transition. GB999: Any รท Any.
175	return grAny, prop, true
176}