util_cjk.go

  1package util
  2
  3import "unicode"
  4
  5var cjkRadicalsSupplement = &unicode.RangeTable{
  6	R16: []unicode.Range16{
  7		{0x2E80, 0x2EFF, 1},
  8	},
  9}
 10
 11var kangxiRadicals = &unicode.RangeTable{
 12	R16: []unicode.Range16{
 13		{0x2F00, 0x2FDF, 1},
 14	},
 15}
 16
 17var ideographicDescriptionCharacters = &unicode.RangeTable{
 18	R16: []unicode.Range16{
 19		{0x2FF0, 0x2FFF, 1},
 20	},
 21}
 22
 23var cjkSymbolsAndPunctuation = &unicode.RangeTable{
 24	R16: []unicode.Range16{
 25		{0x3000, 0x303F, 1},
 26	},
 27}
 28
 29var hiragana = &unicode.RangeTable{
 30	R16: []unicode.Range16{
 31		{0x3040, 0x309F, 1},
 32	},
 33}
 34
 35var katakana = &unicode.RangeTable{
 36	R16: []unicode.Range16{
 37		{0x30A0, 0x30FF, 1},
 38	},
 39}
 40
 41var kanbun = &unicode.RangeTable{
 42	R16: []unicode.Range16{
 43		{0x3130, 0x318F, 1},
 44		{0x3190, 0x319F, 1},
 45	},
 46}
 47
 48var cjkStrokes = &unicode.RangeTable{
 49	R16: []unicode.Range16{
 50		{0x31C0, 0x31EF, 1},
 51	},
 52}
 53
 54var katakanaPhoneticExtensions = &unicode.RangeTable{
 55	R16: []unicode.Range16{
 56		{0x31F0, 0x31FF, 1},
 57	},
 58}
 59
 60var cjkCompatibility = &unicode.RangeTable{
 61	R16: []unicode.Range16{
 62		{0x3300, 0x33FF, 1},
 63	},
 64}
 65
 66var cjkUnifiedIdeographsExtensionA = &unicode.RangeTable{
 67	R16: []unicode.Range16{
 68		{0x3400, 0x4DBF, 1},
 69	},
 70}
 71
 72var cjkUnifiedIdeographs = &unicode.RangeTable{
 73	R16: []unicode.Range16{
 74		{0x4E00, 0x9FFF, 1},
 75	},
 76}
 77
 78var yiSyllables = &unicode.RangeTable{
 79	R16: []unicode.Range16{
 80		{0xA000, 0xA48F, 1},
 81	},
 82}
 83
 84var yiRadicals = &unicode.RangeTable{
 85	R16: []unicode.Range16{
 86		{0xA490, 0xA4CF, 1},
 87	},
 88}
 89
 90var cjkCompatibilityIdeographs = &unicode.RangeTable{
 91	R16: []unicode.Range16{
 92		{0xF900, 0xFAFF, 1},
 93	},
 94}
 95
 96var verticalForms = &unicode.RangeTable{
 97	R16: []unicode.Range16{
 98		{0xFE10, 0xFE1F, 1},
 99	},
100}
101
// cjkCompatibilityForms is the range table for CJK Compatibility Forms
// (U+FE30 through U+FE4F, inclusive).
var cjkCompatibilityForms = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: 0xFE30, Hi: 0xFE4F, Stride: 1},
	},
}
107
// smallFormVariants is the range table for Small Form Variants
// (U+FE50 through U+FE6F, inclusive).
var smallFormVariants = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: 0xFE50, Hi: 0xFE6F, Stride: 1},
	},
}
113
// halfwidthAndFullwidthForms is the range table for Halfwidth and Fullwidth
// Forms (U+FF00 through U+FFEF, inclusive).
var halfwidthAndFullwidthForms = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: 0xFF00, Hi: 0xFFEF, Stride: 1},
	},
}
119
// kanaSupplement is the range table for Kana Supplement
// (U+1B000 through U+1B0FF, inclusive).
var kanaSupplement = &unicode.RangeTable{
	R32: []unicode.Range32{
		{Lo: 0x1B000, Hi: 0x1B0FF, Stride: 1},
	},
}
125
// kanaExtendedA is the range table for Kana Extended-A
// (U+1B100 through U+1B12F, inclusive).
var kanaExtendedA = &unicode.RangeTable{
	R32: []unicode.Range32{
		{Lo: 0x1B100, Hi: 0x1B12F, Stride: 1},
	},
}
131
// smallKanaExtension is the range table for Small Kana Extension
// (U+1B130 through U+1B16F, inclusive).
var smallKanaExtension = &unicode.RangeTable{
	R32: []unicode.Range32{
		{Lo: 0x1B130, Hi: 0x1B16F, Stride: 1},
	},
}
137
// cjkUnifiedIdeographsExtensionB is the range table for CJK Unified Ideographs
// Extension B (U+20000 through U+2A6DF, inclusive).
var cjkUnifiedIdeographsExtensionB = &unicode.RangeTable{
	R32: []unicode.Range32{
		{Lo: 0x20000, Hi: 0x2A6DF, Stride: 1},
	},
}
143
// cjkUnifiedIdeographsExtensionC is the range table for CJK Unified Ideographs
// Extension C (U+2A700 through U+2B73F, inclusive).
var cjkUnifiedIdeographsExtensionC = &unicode.RangeTable{
	R32: []unicode.Range32{
		{Lo: 0x2A700, Hi: 0x2B73F, Stride: 1},
	},
}
149
// cjkUnifiedIdeographsExtensionD is the range table for CJK Unified Ideographs
// Extension D (U+2B740 through U+2B81F, inclusive).
var cjkUnifiedIdeographsExtensionD = &unicode.RangeTable{
	R32: []unicode.Range32{
		{Lo: 0x2B740, Hi: 0x2B81F, Stride: 1},
	},
}
155
// cjkUnifiedIdeographsExtensionE is the range table for CJK Unified Ideographs
// Extension E (U+2B820 through U+2CEAF, inclusive).
var cjkUnifiedIdeographsExtensionE = &unicode.RangeTable{
	R32: []unicode.Range32{
		{Lo: 0x2B820, Hi: 0x2CEAF, Stride: 1},
	},
}
161
// cjkUnifiedIdeographsExtensionF is the range table for CJK Unified Ideographs
// Extension F (U+2CEB0 through U+2EBEF, inclusive).
var cjkUnifiedIdeographsExtensionF = &unicode.RangeTable{
	R32: []unicode.Range32{
		{Lo: 0x2CEB0, Hi: 0x2EBEF, Stride: 1},
	},
}
167
// cjkCompatibilityIdeographsSupplement is the range table for CJK
// Compatibility Ideographs Supplement (U+2F800 through U+2FA1F, inclusive).
var cjkCompatibilityIdeographsSupplement = &unicode.RangeTable{
	R32: []unicode.Range32{
		{Lo: 0x2F800, Hi: 0x2FA1F, Stride: 1},
	},
}
173
// cjkUnifiedIdeographsExtensionG is the range table for CJK Unified Ideographs
// Extension G (U+30000 through U+3134F, inclusive).
var cjkUnifiedIdeographsExtensionG = &unicode.RangeTable{
	R32: []unicode.Range32{
		{Lo: 0x30000, Hi: 0x3134F, Stride: 1},
	},
}
179
180// IsEastAsianWideRune returns trhe if the given rune is an east asian wide character, otherwise false.
181func IsEastAsianWideRune(r rune) bool {
182	return unicode.Is(unicode.Hiragana, r) ||
183		unicode.Is(unicode.Katakana, r) ||
184		unicode.Is(unicode.Han, r) ||
185		unicode.Is(unicode.Lm, r) ||
186		unicode.Is(unicode.Hangul, r) ||
187		unicode.Is(cjkSymbolsAndPunctuation, r)
188}
189
190// IsSpaceDiscardingUnicodeRune returns true if the given rune is space-discarding unicode character, otherwise false.
191// See https://www.w3.org/TR/2020/WD-css-text-3-20200429/#space-discard-set
192func IsSpaceDiscardingUnicodeRune(r rune) bool {
193	return unicode.Is(cjkRadicalsSupplement, r) ||
194		unicode.Is(kangxiRadicals, r) ||
195		unicode.Is(ideographicDescriptionCharacters, r) ||
196		unicode.Is(cjkSymbolsAndPunctuation, r) ||
197		unicode.Is(hiragana, r) ||
198		unicode.Is(katakana, r) ||
199		unicode.Is(kanbun, r) ||
200		unicode.Is(cjkStrokes, r) ||
201		unicode.Is(katakanaPhoneticExtensions, r) ||
202		unicode.Is(cjkCompatibility, r) ||
203		unicode.Is(cjkUnifiedIdeographsExtensionA, r) ||
204		unicode.Is(cjkUnifiedIdeographs, r) ||
205		unicode.Is(yiSyllables, r) ||
206		unicode.Is(yiRadicals, r) ||
207		unicode.Is(cjkCompatibilityIdeographs, r) ||
208		unicode.Is(verticalForms, r) ||
209		unicode.Is(cjkCompatibilityForms, r) ||
210		unicode.Is(smallFormVariants, r) ||
211		unicode.Is(halfwidthAndFullwidthForms, r) ||
212		unicode.Is(kanaSupplement, r) ||
213		unicode.Is(kanaExtendedA, r) ||
214		unicode.Is(smallKanaExtension, r) ||
215		unicode.Is(cjkUnifiedIdeographsExtensionB, r) ||
216		unicode.Is(cjkUnifiedIdeographsExtensionC, r) ||
217		unicode.Is(cjkUnifiedIdeographsExtensionD, r) ||
218		unicode.Is(cjkUnifiedIdeographsExtensionE, r) ||
219		unicode.Is(cjkUnifiedIdeographsExtensionF, r) ||
220		unicode.Is(cjkCompatibilityIdeographsSupplement, r) ||
221		unicode.Is(cjkUnifiedIdeographsExtensionG, r)
222}
223
// EastAsianWidth returns the east asian width of the given rune.
// See https://www.unicode.org/reports/tr11/tr11-36.html
//
// The result is one of the strings "F", "H", "W", "Na" or "A" when the rune
// falls into one of the hard-coded ranges below, and "N" for every other
// rune. The groups are tested in that order and the first match wins.
func EastAsianWidth(r rune) string {
	// in reports whether r lies within the inclusive range [lo, hi].
	in := func(lo, hi rune) bool {
		return lo <= r && r <= hi
	}

	switch {
	// "F"
	case r == 0x3000,
		in(0xFF01, 0xFF60),
		in(0xFFE0, 0xFFE6):
		return "F"

	// "H"
	case r == 0x20A9,
		in(0xFF61, 0xFFBE),
		in(0xFFC2, 0xFFC7),
		in(0xFFCA, 0xFFCF),
		in(0xFFD2, 0xFFD7),
		in(0xFFDA, 0xFFDC),
		in(0xFFE8, 0xFFEE):
		return "H"

	// "W"
	case in(0x1100, 0x115F),
		in(0x11A3, 0x11A7),
		in(0x11FA, 0x11FF),
		in(0x2329, 0x232A),
		in(0x2E80, 0x2E99),
		in(0x2E9B, 0x2EF3),
		in(0x2F00, 0x2FD5),
		in(0x2FF0, 0x2FFB),
		in(0x3001, 0x303E),
		in(0x3041, 0x3096),
		in(0x3099, 0x30FF),
		in(0x3105, 0x312D),
		in(0x3131, 0x318E),
		in(0x3190, 0x31BA),
		in(0x31C0, 0x31E3),
		in(0x31F0, 0x321E),
		in(0x3220, 0x3247),
		in(0x3250, 0x32FE),
		in(0x3300, 0x4DBF),
		in(0x4E00, 0xA48C),
		in(0xA490, 0xA4C6),
		in(0xA960, 0xA97C),
		in(0xAC00, 0xD7A3),
		in(0xD7B0, 0xD7C6),
		in(0xD7CB, 0xD7FB),
		in(0xF900, 0xFAFF),
		in(0xFE10, 0xFE19),
		in(0xFE30, 0xFE52),
		in(0xFE54, 0xFE66),
		in(0xFE68, 0xFE6B),
		in(0x1B000, 0x1B001),
		in(0x1F200, 0x1F202),
		in(0x1F210, 0x1F23A),
		in(0x1F240, 0x1F248),
		in(0x1F250, 0x1F251),
		in(0x20000, 0x2F73F),
		in(0x2B740, 0x2FFFD),
		in(0x30000, 0x3FFFD):
		return "W"

	// "Na"
	case in(0x0020, 0x007E),
		in(0x00A2, 0x00A3),
		in(0x00A5, 0x00A6),
		r == 0x00AC,
		r == 0x00AF,
		in(0x27E6, 0x27ED),
		in(0x2985, 0x2986):
		return "Na"

	// "A"
	case r == 0x00A1,
		r == 0x00A4,
		in(0x00A7, 0x00A8),
		r == 0x00AA,
		in(0x00AD, 0x00AE),
		in(0x00B0, 0x00B4),
		in(0x00B6, 0x00BA),
		in(0x00BC, 0x00BF),
		r == 0x00C6,
		r == 0x00D0,
		in(0x00D7, 0x00D8),
		in(0x00DE, 0x00E1),
		r == 0x00E6,
		in(0x00E8, 0x00EA),
		in(0x00EC, 0x00ED),
		r == 0x00F0,
		in(0x00F2, 0x00F3),
		in(0x00F7, 0x00FA),
		r == 0x00FC,
		r == 0x00FE,
		r == 0x0101,
		r == 0x0111,
		r == 0x0113,
		r == 0x011B,
		in(0x0126, 0x0127),
		r == 0x012B,
		in(0x0131, 0x0133),
		r == 0x0138,
		in(0x013F, 0x0142),
		r == 0x0144,
		in(0x0148, 0x014B),
		r == 0x014D,
		in(0x0152, 0x0153),
		in(0x0166, 0x0167),
		r == 0x016B,
		r == 0x01CE,
		r == 0x01D0,
		r == 0x01D2,
		r == 0x01D4,
		r == 0x01D6,
		r == 0x01D8,
		r == 0x01DA,
		r == 0x01DC,
		r == 0x0251,
		r == 0x0261,
		r == 0x02C4,
		r == 0x02C7,
		in(0x02C9, 0x02CB),
		r == 0x02CD,
		r == 0x02D0,
		in(0x02D8, 0x02DB),
		r == 0x02DD,
		r == 0x02DF,
		in(0x0300, 0x036F),
		in(0x0391, 0x03A1),
		in(0x03A3, 0x03A9),
		in(0x03B1, 0x03C1),
		in(0x03C3, 0x03C9),
		r == 0x0401,
		in(0x0410, 0x044F),
		r == 0x0451,
		r == 0x2010,
		in(0x2013, 0x2016),
		in(0x2018, 0x2019),
		in(0x201C, 0x201D),
		in(0x2020, 0x2022),
		in(0x2024, 0x2027),
		r == 0x2030,
		in(0x2032, 0x2033),
		r == 0x2035,
		r == 0x203B,
		r == 0x203E,
		r == 0x2074,
		r == 0x207F,
		in(0x2081, 0x2084),
		r == 0x20AC,
		r == 0x2103,
		r == 0x2105,
		r == 0x2109,
		r == 0x2113,
		r == 0x2116,
		in(0x2121, 0x2122),
		r == 0x2126,
		r == 0x212B,
		in(0x2153, 0x2154),
		in(0x215B, 0x215E),
		in(0x2160, 0x216B),
		in(0x2170, 0x2179),
		r == 0x2189,
		in(0x2190, 0x2199),
		in(0x21B8, 0x21B9),
		r == 0x21D2,
		r == 0x21D4,
		r == 0x21E7,
		r == 0x2200,
		in(0x2202, 0x2203),
		in(0x2207, 0x2208),
		r == 0x220B,
		r == 0x220F,
		r == 0x2211,
		r == 0x2215,
		r == 0x221A,
		in(0x221D, 0x2220),
		r == 0x2223,
		r == 0x2225,
		in(0x2227, 0x222C),
		r == 0x222E,
		in(0x2234, 0x2237),
		in(0x223C, 0x223D),
		r == 0x2248,
		r == 0x224C,
		r == 0x2252,
		in(0x2260, 0x2261),
		in(0x2264, 0x2267),
		in(0x226A, 0x226B),
		in(0x226E, 0x226F),
		in(0x2282, 0x2283),
		in(0x2286, 0x2287),
		r == 0x2295,
		r == 0x2299,
		r == 0x22A5,
		r == 0x22BF,
		r == 0x2312,
		in(0x2460, 0x24E9),
		in(0x24EB, 0x254B),
		in(0x2550, 0x2573),
		in(0x2580, 0x258F),
		in(0x2592, 0x2595),
		in(0x25A0, 0x25A1),
		in(0x25A3, 0x25A9),
		in(0x25B2, 0x25B3),
		in(0x25B6, 0x25B7),
		in(0x25BC, 0x25BD),
		in(0x25C0, 0x25C1),
		in(0x25C6, 0x25C8),
		r == 0x25CB,
		in(0x25CE, 0x25D1),
		in(0x25E2, 0x25E5),
		r == 0x25EF,
		in(0x2605, 0x2606),
		r == 0x2609,
		in(0x260E, 0x260F),
		in(0x2614, 0x2615),
		r == 0x261C,
		r == 0x261E,
		r == 0x2640,
		r == 0x2642,
		in(0x2660, 0x2661),
		in(0x2663, 0x2665),
		in(0x2667, 0x266A),
		in(0x266C, 0x266D),
		r == 0x266F,
		in(0x269E, 0x269F),
		in(0x26BE, 0x26BF),
		in(0x26C4, 0x26CD),
		in(0x26CF, 0x26E1),
		r == 0x26E3,
		in(0x26E8, 0x26FF),
		r == 0x273D,
		r == 0x2757,
		in(0x2776, 0x277F),
		in(0x2B55, 0x2B59),
		in(0x3248, 0x324F),
		in(0xE000, 0xF8FF),
		in(0xFE00, 0xFE0F),
		r == 0xFFFD,
		in(0x1F100, 0x1F10A),
		in(0x1F110, 0x1F12D),
		in(0x1F130, 0x1F169),
		in(0x1F170, 0x1F19A),
		in(0xE0100, 0xE01EF),
		in(0xF0000, 0xFFFFD),
		in(0x100000, 0x10FFFD):
		return "A"

	// Anything not matched above.
	default:
		return "N"
	}
}