// properties.go

package uniseg

  3// The Unicode properties as used in the various parsers. Only the ones needed
  4// in the context of this package are included.
  5const (
  6	prXX      = 0    // Same as prAny.
  7	prAny     = iota // prAny must be 0.
  8	prPrepend        // Grapheme properties must come first, to reduce the number of bits stored in the state vector.
  9	prCR
 10	prLF
 11	prControl
 12	prExtend
 13	prRegionalIndicator
 14	prSpacingMark
 15	prL
 16	prV
 17	prT
 18	prLV
 19	prLVT
 20	prZWJ
 21	prExtendedPictographic
 22	prNewline
 23	prWSegSpace
 24	prDoubleQuote
 25	prSingleQuote
 26	prMidNumLet
 27	prNumeric
 28	prMidLetter
 29	prMidNum
 30	prExtendNumLet
 31	prALetter
 32	prFormat
 33	prHebrewLetter
 34	prKatakana
 35	prSp
 36	prSTerm
 37	prClose
 38	prSContinue
 39	prATerm
 40	prUpper
 41	prLower
 42	prSep
 43	prOLetter
 44	prCM
 45	prBA
 46	prBK
 47	prSP
 48	prEX
 49	prQU
 50	prAL
 51	prPR
 52	prPO
 53	prOP
 54	prCP
 55	prIS
 56	prHY
 57	prSY
 58	prNU
 59	prCL
 60	prNL
 61	prGL
 62	prAI
 63	prBB
 64	prHL
 65	prSA
 66	prJL
 67	prJV
 68	prJT
 69	prNS
 70	prZW
 71	prB2
 72	prIN
 73	prWJ
 74	prID
 75	prEB
 76	prCJ
 77	prH2
 78	prH3
 79	prSG
 80	prCB
 81	prRI
 82	prEM
 83	prN
 84	prNa
 85	prA
 86	prW
 87	prH
 88	prF
 89	prEmojiPresentation
 90)
 91
 92// Unicode General Categories. Only the ones needed in the context of this
 93// package are included.
 94const (
 95	gcNone = iota // gcNone must be 0.
 96	gcCc
 97	gcZs
 98	gcPo
 99	gcSc
100	gcPs
101	gcPe
102	gcSm
103	gcPd
104	gcNd
105	gcLu
106	gcSk
107	gcPc
108	gcLl
109	gcSo
110	gcLo
111	gcPi
112	gcCf
113	gcNo
114	gcPf
115	gcLC
116	gcLm
117	gcMn
118	gcMe
119	gcMc
120	gcNl
121	gcZl
122	gcZp
123	gcCn
124	gcCs
125	gcCo
126)
127
// Special code points.
const (
	vs15 = 0xfe0e // Variation Selector-15 (text presentation)
	vs16 = 0xfe0f // Variation Selector-16 (emoji presentation)
)

// propertySearch performs a binary search on a property slice and returns the
// entry whose inclusive range (start = first array element, end = second array
// element) contains r. If no entry contains r, including when dictionary is
// empty, the zero value of E (an array of 0's) is returned.
//
// Because this is a binary search, dictionary must be sorted by range start
// and its ranges must not overlap; otherwise an existing entry may be missed.
func propertySearch[E interface{ [3]int | [4]int }](dictionary []E, r rune) (result E) {
	target := int(r)

	// The candidate window is the half-open index interval [low, high).
	low, high := 0, len(dictionary)
	for low < high {
		mid := low + (high-low)/2 // Cannot overflow, unlike (low+high)/2.
		entry := dictionary[mid]
		switch {
		case target < entry[0]:
			// r lies before this range: keep the left half.
			high = mid
		case target > entry[1]:
			// r lies after this range: keep the right half.
			low = mid + 1
		default:
			return entry
		}
	}

	// result was never assigned, so this is the all-zero "not found" entry.
	return result
}

157// property returns the Unicode property value (see constants above) of the
158// given code point.
159func property(dictionary [][3]int, r rune) int {
160	return propertySearch(dictionary, r)[2]
161}
162
163// propertyLineBreak returns the Unicode property value and General Category
164// (see constants above) of the given code point, as listed in the line break
165// code points table, while fast tracking ASCII digits and letters.
166func propertyLineBreak(r rune) (property, generalCategory int) {
167	if r >= 'a' && r <= 'z' {
168		return prAL, gcLl
169	}
170	if r >= 'A' && r <= 'Z' {
171		return prAL, gcLu
172	}
173	if r >= '0' && r <= '9' {
174		return prNU, gcNd
175	}
176	entry := propertySearch(lineBreakCodePoints, r)
177	return entry[2], entry[3]
178}
179
180// propertyGraphemes returns the Unicode grapheme cluster property value of the
181// given code point while fast tracking ASCII characters.
182func propertyGraphemes(r rune) int {
183	if r >= 0x20 && r <= 0x7e {
184		return prAny
185	}
186	if r == 0x0a {
187		return prLF
188	}
189	if r == 0x0d {
190		return prCR
191	}
192	if r >= 0 && r <= 0x1f || r == 0x7f {
193		return prControl
194	}
195	return property(graphemeCodePoints, r)
196}
197
198// propertyEastAsianWidth returns the Unicode East Asian Width property value of
199// the given code point while fast tracking ASCII characters.
200func propertyEastAsianWidth(r rune) int {
201	if r >= 0x20 && r <= 0x7e {
202		return prNa
203	}
204	if r >= 0 && r <= 0x1f || r == 0x7f {
205		return prN
206	}
207	return property(eastAsianWidth, r)
208}