1package uniseg
2
// Unicode property values used by the various parsers in this package. Only
// the properties this package needs are listed. The numeric values follow
// declaration order (iota), so inserting, removing, or reordering an entry
// changes the value of every entry after it.
const (
	// prXX is 0, which is also the value a table lookup yields when no range
	// contains the code point.
	prXX = iota
	// prAny is 1.
	//
	// NOTE(review): earlier comments described prXX as "same as prAny" and
	// stated "prAny must be 0". As declared, prXX is the only constant equal
	// to 0 and prAny is 1. Confirm which is intended before changing either.
	prAny

	// Grapheme properties must come first, to reduce the number of bits
	// stored in the state vector.
	prPrepend
	prCR
	prLF
	prControl
	prExtend
	prRegionalIndicator
	prSpacingMark
	prL
	prV
	prT
	prLV
	prLVT
	prZWJ
	prExtendedPictographic

	// NOTE(review): the group headings from here on are inferred from the
	// constant names and from which lookup functions return them; confirm
	// against the code point tables.

	// Presumably word boundary properties.
	prNewline
	prWSegSpace
	prDoubleQuote
	prSingleQuote
	prMidNumLet
	prNumeric
	prMidLetter
	prMidNum
	prExtendNumLet
	prALetter
	prFormat
	prHebrewLetter
	prKatakana

	// Presumably sentence boundary properties.
	prSp
	prSTerm
	prClose
	prSContinue
	prATerm
	prUpper
	prLower
	prSep
	prOLetter

	// Presumably line break properties (propertyLineBreak returns prAL and
	// prNU from this group).
	prCM
	prBA
	prBK
	prSP
	prEX
	prQU
	prAL
	prPR
	prPO
	prOP
	prCP
	prIS
	prHY
	prSY
	prNU
	prCL
	prNL
	prGL
	prAI
	prBB
	prHL
	prSA
	prJL
	prJV
	prJT
	prNS
	prZW
	prB2
	prIN
	prWJ
	prID
	prEB
	prCJ
	prH2
	prH3
	prSG
	prCB
	prRI
	prEM

	// Presumably East Asian Width properties (propertyEastAsianWidth returns
	// prN and prNa from this group).
	prN
	prNa
	prA
	prW
	prH
	prF

	// Presumably the emoji presentation property.
	prEmojiPresentation
)
91
// Unicode General Categories. Only the ones needed in the context of this
// package are included. The numeric values follow declaration order (iota),
// so the order of the entries must not change. The trailing comments give the
// long name of each General Category abbreviation.
const (
	gcNone = iota // gcNone must be 0.
	gcCc          // Control
	gcZs          // Space Separator
	gcPo          // Other Punctuation
	gcSc          // Currency Symbol
	gcPs          // Open Punctuation
	gcPe          // Close Punctuation
	gcSm          // Math Symbol
	gcPd          // Dash Punctuation
	gcNd          // Decimal Number
	gcLu          // Uppercase Letter
	gcSk          // Modifier Symbol
	gcPc          // Connector Punctuation
	gcLl          // Lowercase Letter
	gcSo          // Other Symbol
	gcLo          // Other Letter
	gcPi          // Initial Punctuation
	gcCf          // Format
	gcNo          // Other Number
	gcPf          // Final Punctuation
	gcLC          // Cased Letter
	gcLm          // Modifier Letter
	gcMn          // Nonspacing Mark
	gcMe          // Enclosing Mark
	gcMc          // Spacing Mark
	gcNl          // Letter Number
	gcZl          // Line Separator
	gcZp          // Paragraph Separator
	gcCn          // Unassigned
	gcCs          // Surrogate
	gcCo          // Private Use
)
127
// Special code points: the variation selectors U+FE0E and U+FE0F.
const (
	vs15 = 0xfe0e // Variation Selector-15 (text presentation)
	vs16 = 0xfe0f // Variation Selector-16 (emoji presentation)
)
133
// propertySearch looks up r in dictionary using binary search. Every entry
// describes an inclusive code point range: element 0 is the first code point
// and element 1 is the last; the remaining elements carry the values
// associated with that range. The entry whose range contains r is returned.
// If no range contains r (including when dictionary is empty), the zero value
// of E, i.e. an array of 0's, is returned.
//
// Binary search only finds every entry if dictionary is sorted by ascending
// range and the ranges do not overlap.
func propertySearch[E interface{ [3]int | [4]int }](dictionary []E, r rune) (result E) {
	code := int(r)
	lo, hi := 0, len(dictionary) // Candidates live in the half-open window [lo, hi).
	for lo < hi {
		mid := (lo + hi) / 2
		entry := dictionary[mid]
		switch {
		case code < entry[0]:
			// r lies before this range: keep the lower half.
			hi = mid
		case code > entry[1]:
			// r lies after this range: keep the upper half.
			lo = mid + 1
		default:
			return entry
		}
	}
	// Nothing matched; result still holds its zero value.
	return result
}
156
157// property returns the Unicode property value (see constants above) of the
158// given code point.
159func property(dictionary [][3]int, r rune) int {
160 return propertySearch(dictionary, r)[2]
161}
162
163// propertyLineBreak returns the Unicode property value and General Category
164// (see constants above) of the given code point, as listed in the line break
165// code points table, while fast tracking ASCII digits and letters.
166func propertyLineBreak(r rune) (property, generalCategory int) {
167 if r >= 'a' && r <= 'z' {
168 return prAL, gcLl
169 }
170 if r >= 'A' && r <= 'Z' {
171 return prAL, gcLu
172 }
173 if r >= '0' && r <= '9' {
174 return prNU, gcNd
175 }
176 entry := propertySearch(lineBreakCodePoints, r)
177 return entry[2], entry[3]
178}
179
180// propertyGraphemes returns the Unicode grapheme cluster property value of the
181// given code point while fast tracking ASCII characters.
182func propertyGraphemes(r rune) int {
183 if r >= 0x20 && r <= 0x7e {
184 return prAny
185 }
186 if r == 0x0a {
187 return prLF
188 }
189 if r == 0x0d {
190 return prCR
191 }
192 if r >= 0 && r <= 0x1f || r == 0x7f {
193 return prControl
194 }
195 return property(graphemeCodePoints, r)
196}
197
198// propertyEastAsianWidth returns the Unicode East Asian Width property value of
199// the given code point while fast tracking ASCII characters.
200func propertyEastAsianWidth(r rune) int {
201 if r >= 0x20 && r <= 0x7e {
202 return prNa
203 }
204 if r >= 0 && r <= 0x1f || r == 0x7f {
205 return prN
206 }
207 return property(eastAsianWidth, r)
208}