charclass.go

  1package syntax
  2
  3import (
  4	"bytes"
  5	"encoding/binary"
  6	"fmt"
  7	"sort"
  8	"unicode"
  9	"unicode/utf8"
 10)
 11
 12// CharSet combines start-end rune ranges and unicode categories representing a set of characters
 13type CharSet struct {
 14	ranges     []singleRange
 15	categories []category
 16	sub        *CharSet //optional subtractor
 17	negate     bool
 18	anything   bool
 19}
 20
 21type category struct {
 22	negate bool
 23	cat    string
 24}
 25
 26type singleRange struct {
 27	first rune
 28	last  rune
 29}
 30
 31const (
 32	spaceCategoryText = " "
 33	wordCategoryText  = "W"
 34)
 35
 36var (
 37	ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00}
 38	ecmaWord  = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b}
 39	ecmaDigit = []rune{0x0030, 0x003a}
 40
 41	re2Space = []rune{0x0009, 0x000b, 0x000c, 0x000e, 0x0020, 0x0021}
 42)
 43
 44var (
 45	AnyClass          = getCharSetFromOldString([]rune{0}, false)
 46	ECMAAnyClass      = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
 47	NoneClass         = getCharSetFromOldString(nil, false)
 48	ECMAWordClass     = getCharSetFromOldString(ecmaWord, false)
 49	NotECMAWordClass  = getCharSetFromOldString(ecmaWord, true)
 50	ECMASpaceClass    = getCharSetFromOldString(ecmaSpace, false)
 51	NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
 52	ECMADigitClass    = getCharSetFromOldString(ecmaDigit, false)
 53	NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)
 54
 55	WordClass     = getCharSetFromCategoryString(false, false, wordCategoryText)
 56	NotWordClass  = getCharSetFromCategoryString(true, false, wordCategoryText)
 57	SpaceClass    = getCharSetFromCategoryString(false, false, spaceCategoryText)
 58	NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
 59	DigitClass    = getCharSetFromCategoryString(false, false, "Nd")
 60	NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
 61
 62	RE2SpaceClass    = getCharSetFromOldString(re2Space, false)
 63	NotRE2SpaceClass = getCharSetFromOldString(re2Space, true)
 64)
 65
 66var unicodeCategories = func() map[string]*unicode.RangeTable {
 67	retVal := make(map[string]*unicode.RangeTable)
 68	for k, v := range unicode.Scripts {
 69		retVal[k] = v
 70	}
 71	for k, v := range unicode.Categories {
 72		retVal[k] = v
 73	}
 74	for k, v := range unicode.Properties {
 75		retVal[k] = v
 76	}
 77	return retVal
 78}()
 79
 80func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet {
 81	if negateCat && negateSet {
 82		panic("BUG!  You should only negate the set OR the category in a constant setup, but not both")
 83	}
 84
 85	c := CharSet{negate: negateSet}
 86
 87	c.categories = make([]category, len(cats))
 88	for i, cat := range cats {
 89		c.categories[i] = category{cat: cat, negate: negateCat}
 90	}
 91	return func() *CharSet {
 92		//make a copy each time
 93		local := c
 94		//return that address
 95		return &local
 96	}
 97}
 98
 99func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet {
100	c := CharSet{}
101	if len(setText) > 0 {
102		fillFirst := false
103		l := len(setText)
104		if negate {
105			if setText[0] == 0 {
106				setText = setText[1:]
107			} else {
108				l++
109				fillFirst = true
110			}
111		}
112
113		if l%2 == 0 {
114			c.ranges = make([]singleRange, l/2)
115		} else {
116			c.ranges = make([]singleRange, l/2+1)
117		}
118
119		first := true
120		if fillFirst {
121			c.ranges[0] = singleRange{first: 0}
122			first = false
123		}
124
125		i := 0
126		for _, r := range setText {
127			if first {
128				// lower bound in a new range
129				c.ranges[i] = singleRange{first: r}
130				first = false
131			} else {
132				c.ranges[i].last = r - 1
133				i++
134				first = true
135			}
136		}
137		if !first {
138			c.ranges[i].last = utf8.MaxRune
139		}
140	}
141
142	return func() *CharSet {
143		local := c
144		return &local
145	}
146}
147
148// Copy makes a deep copy to prevent accidental mutation of a set
149func (c CharSet) Copy() CharSet {
150	ret := CharSet{
151		anything: c.anything,
152		negate:   c.negate,
153	}
154
155	ret.ranges = append(ret.ranges, c.ranges...)
156	ret.categories = append(ret.categories, c.categories...)
157
158	if c.sub != nil {
159		sub := c.sub.Copy()
160		ret.sub = &sub
161	}
162
163	return ret
164}
165
166// gets a human-readable description for a set string
167func (c CharSet) String() string {
168	buf := &bytes.Buffer{}
169	buf.WriteRune('[')
170
171	if c.IsNegated() {
172		buf.WriteRune('^')
173	}
174
175	for _, r := range c.ranges {
176
177		buf.WriteString(CharDescription(r.first))
178		if r.first != r.last {
179			if r.last-r.first != 1 {
180				//groups that are 1 char apart skip the dash
181				buf.WriteRune('-')
182			}
183			buf.WriteString(CharDescription(r.last))
184		}
185	}
186
187	for _, c := range c.categories {
188		buf.WriteString(c.String())
189	}
190
191	if c.sub != nil {
192		buf.WriteRune('-')
193		buf.WriteString(c.sub.String())
194	}
195
196	buf.WriteRune(']')
197
198	return buf.String()
199}
200
201// mapHashFill converts a charset into a buffer for use in maps
202func (c CharSet) mapHashFill(buf *bytes.Buffer) {
203	if c.negate {
204		buf.WriteByte(0)
205	} else {
206		buf.WriteByte(1)
207	}
208
209	binary.Write(buf, binary.LittleEndian, len(c.ranges))
210	binary.Write(buf, binary.LittleEndian, len(c.categories))
211	for _, r := range c.ranges {
212		buf.WriteRune(r.first)
213		buf.WriteRune(r.last)
214	}
215	for _, ct := range c.categories {
216		buf.WriteString(ct.cat)
217		if ct.negate {
218			buf.WriteByte(1)
219		} else {
220			buf.WriteByte(0)
221		}
222	}
223
224	if c.sub != nil {
225		c.sub.mapHashFill(buf)
226	}
227}
228
229// CharIn returns true if the rune is in our character set (either ranges or categories).
230// It handles negations and subtracted sub-charsets.
231func (c CharSet) CharIn(ch rune) bool {
232	val := false
233	// in s && !s.subtracted
234
235	//check ranges
236	for _, r := range c.ranges {
237		if ch < r.first {
238			continue
239		}
240		if ch <= r.last {
241			val = true
242			break
243		}
244	}
245
246	//check categories if we haven't already found a range
247	if !val && len(c.categories) > 0 {
248		for _, ct := range c.categories {
249			// special categories...then unicode
250			if ct.cat == spaceCategoryText {
251				if unicode.IsSpace(ch) {
252					// we found a space so we're done
253					// negate means this is a "bad" thing
254					val = !ct.negate
255					break
256				} else if ct.negate {
257					val = true
258					break
259				}
260			} else if ct.cat == wordCategoryText {
261				if IsWordChar(ch) {
262					val = !ct.negate
263					break
264				} else if ct.negate {
265					val = true
266					break
267				}
268			} else if unicode.Is(unicodeCategories[ct.cat], ch) {
269				// if we're in this unicode category then we're done
270				// if negate=true on this category then we "failed" our test
271				// otherwise we're good that we found it
272				val = !ct.negate
273				break
274			} else if ct.negate {
275				val = true
276				break
277			}
278		}
279	}
280
281	// negate the whole char set
282	if c.negate {
283		val = !val
284	}
285
286	// get subtracted recurse
287	if val && c.sub != nil {
288		val = !c.sub.CharIn(ch)
289	}
290
291	//log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val)
292	return val
293}
294
295func (c category) String() string {
296	switch c.cat {
297	case spaceCategoryText:
298		if c.negate {
299			return "\\S"
300		}
301		return "\\s"
302	case wordCategoryText:
303		if c.negate {
304			return "\\W"
305		}
306		return "\\w"
307	}
308	if _, ok := unicodeCategories[c.cat]; ok {
309
310		if c.negate {
311			return "\\P{" + c.cat + "}"
312		}
313		return "\\p{" + c.cat + "}"
314	}
315	return "Unknown category: " + c.cat
316}
317
318// CharDescription Produces a human-readable description for a single character.
319func CharDescription(ch rune) string {
320	/*if ch == '\\' {
321		return "\\\\"
322	}
323
324	if ch > ' ' && ch <= '~' {
325		return string(ch)
326	} else if ch == '\n' {
327		return "\\n"
328	} else if ch == ' ' {
329		return "\\ "
330	}*/
331
332	b := &bytes.Buffer{}
333	escape(b, ch, false) //fmt.Sprintf("%U", ch)
334	return b.String()
335}
336
337// According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
338// RL 1.4 Simple Word Boundaries  The class of <word_character> includes all Alphabetic
339// values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
340// ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
341func IsWordChar(r rune) bool {
342	//"L", "Mn", "Nd", "Pc"
343	return unicode.In(r,
344		unicode.Categories["L"], unicode.Categories["Mn"],
345		unicode.Categories["Nd"], unicode.Categories["Pc"]) || r == '\u200D' || r == '\u200C'
346	//return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
347}
348
349func IsECMAWordChar(r rune) bool {
350	return unicode.In(r,
351		unicode.Categories["L"], unicode.Categories["Mn"],
352		unicode.Categories["Nd"], unicode.Categories["Pc"])
353
354	//return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
355}
356
357// SingletonChar will return the char from the first range without validation.
358// It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input
359func (c CharSet) SingletonChar() rune {
360	return c.ranges[0].first
361}
362
363func (c CharSet) IsSingleton() bool {
364	return !c.negate && //negated is multiple chars
365		len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
366		c.sub == nil && // subtraction means we've got multiple chars
367		c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
368}
369
370func (c CharSet) IsSingletonInverse() bool {
371	return c.negate && //same as above, but requires negated
372		len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
373		c.sub == nil && // subtraction means we've got multiple chars
374		c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
375}
376
377func (c CharSet) IsMergeable() bool {
378	return !c.IsNegated() && !c.HasSubtraction()
379}
380
381func (c CharSet) IsNegated() bool {
382	return c.negate
383}
384
385func (c CharSet) HasSubtraction() bool {
386	return c.sub != nil
387}
388
389func (c CharSet) IsEmpty() bool {
390	return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil
391}
392
393func (c *CharSet) addDigit(ecma, negate bool, pattern string) {
394	if ecma {
395		if negate {
396			c.addRanges(NotECMADigitClass().ranges)
397		} else {
398			c.addRanges(ECMADigitClass().ranges)
399		}
400	} else {
401		c.addCategories(category{cat: "Nd", negate: negate})
402	}
403}
404
405func (c *CharSet) addChar(ch rune) {
406	c.addRange(ch, ch)
407}
408
409func (c *CharSet) addSpace(ecma, re2, negate bool) {
410	if ecma {
411		if negate {
412			c.addRanges(NotECMASpaceClass().ranges)
413		} else {
414			c.addRanges(ECMASpaceClass().ranges)
415		}
416	} else if re2 {
417		if negate {
418			c.addRanges(NotRE2SpaceClass().ranges)
419		} else {
420			c.addRanges(RE2SpaceClass().ranges)
421		}
422	} else {
423		c.addCategories(category{cat: spaceCategoryText, negate: negate})
424	}
425}
426
427func (c *CharSet) addWord(ecma, negate bool) {
428	if ecma {
429		if negate {
430			c.addRanges(NotECMAWordClass().ranges)
431		} else {
432			c.addRanges(ECMAWordClass().ranges)
433		}
434	} else {
435		c.addCategories(category{cat: wordCategoryText, negate: negate})
436	}
437}
438
439// Add set ranges and categories into ours -- no deduping or anything
440func (c *CharSet) addSet(set CharSet) {
441	if c.anything {
442		return
443	}
444	if set.anything {
445		c.makeAnything()
446		return
447	}
448	// just append here to prevent double-canon
449	c.ranges = append(c.ranges, set.ranges...)
450	c.addCategories(set.categories...)
451	c.canonicalize()
452}
453
454func (c *CharSet) makeAnything() {
455	c.anything = true
456	c.categories = []category{}
457	c.ranges = AnyClass().ranges
458}
459
460func (c *CharSet) addCategories(cats ...category) {
461	// don't add dupes and remove positive+negative
462	if c.anything {
463		// if we've had a previous positive+negative group then
464		// just return, we're as broad as we can get
465		return
466	}
467
468	for _, ct := range cats {
469		found := false
470		for _, ct2 := range c.categories {
471			if ct.cat == ct2.cat {
472				if ct.negate != ct2.negate {
473					// oposite negations...this mean we just
474					// take us as anything and move on
475					c.makeAnything()
476					return
477				}
478				found = true
479				break
480			}
481		}
482
483		if !found {
484			c.categories = append(c.categories, ct)
485		}
486	}
487}
488
489// Merges new ranges to our own
490func (c *CharSet) addRanges(ranges []singleRange) {
491	if c.anything {
492		return
493	}
494	c.ranges = append(c.ranges, ranges...)
495	c.canonicalize()
496}
497
498// Merges everything but the new ranges into our own
499func (c *CharSet) addNegativeRanges(ranges []singleRange) {
500	if c.anything {
501		return
502	}
503
504	var hi rune
505
506	// convert incoming ranges into opposites, assume they are in order
507	for _, r := range ranges {
508		if hi < r.first {
509			c.ranges = append(c.ranges, singleRange{hi, r.first - 1})
510		}
511		hi = r.last + 1
512	}
513
514	if hi < utf8.MaxRune {
515		c.ranges = append(c.ranges, singleRange{hi, utf8.MaxRune})
516	}
517
518	c.canonicalize()
519}
520
521func isValidUnicodeCat(catName string) bool {
522	_, ok := unicodeCategories[catName]
523	return ok
524}
525
526func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) {
527	if !isValidUnicodeCat(categoryName) {
528		// unknown unicode category, script, or property "blah"
529		panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName))
530
531	}
532
533	if caseInsensitive && (categoryName == "Ll" || categoryName == "Lu" || categoryName == "Lt") {
534		// when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match
535		c.addCategories(
536			category{cat: "Ll", negate: negate},
537			category{cat: "Lu", negate: negate},
538			category{cat: "Lt", negate: negate})
539	}
540	c.addCategories(category{cat: categoryName, negate: negate})
541}
542
543func (c *CharSet) addSubtraction(sub *CharSet) {
544	c.sub = sub
545}
546
547func (c *CharSet) addRange(chMin, chMax rune) {
548	c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax})
549	c.canonicalize()
550}
551
552func (c *CharSet) addNamedASCII(name string, negate bool) bool {
553	var rs []singleRange
554
555	switch name {
556	case "alnum":
557		rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
558	case "alpha":
559		rs = []singleRange{singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
560	case "ascii":
561		rs = []singleRange{singleRange{0, 0x7f}}
562	case "blank":
563		rs = []singleRange{singleRange{'\t', '\t'}, singleRange{' ', ' '}}
564	case "cntrl":
565		rs = []singleRange{singleRange{0, 0x1f}, singleRange{0x7f, 0x7f}}
566	case "digit":
567		c.addDigit(false, negate, "")
568	case "graph":
569		rs = []singleRange{singleRange{'!', '~'}}
570	case "lower":
571		rs = []singleRange{singleRange{'a', 'z'}}
572	case "print":
573		rs = []singleRange{singleRange{' ', '~'}}
574	case "punct": //[!-/:-@[-`{-~]
575		rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}}
576	case "space":
577		c.addSpace(true, false, negate)
578	case "upper":
579		rs = []singleRange{singleRange{'A', 'Z'}}
580	case "word":
581		c.addWord(true, negate)
582	case "xdigit":
583		rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'F'}, singleRange{'a', 'f'}}
584	default:
585		return false
586	}
587
588	if len(rs) > 0 {
589		if negate {
590			c.addNegativeRanges(rs)
591		} else {
592			c.addRanges(rs)
593		}
594	}
595
596	return true
597}
598
599type singleRangeSorter []singleRange
600
601func (p singleRangeSorter) Len() int           { return len(p) }
602func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first }
603func (p singleRangeSorter) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
604
605// Logic to reduce a character class to a unique, sorted form.
606func (c *CharSet) canonicalize() {
607	var i, j int
608	var last rune
609
610	//
611	// Find and eliminate overlapping or abutting ranges
612	//
613
614	if len(c.ranges) > 1 {
615		sort.Sort(singleRangeSorter(c.ranges))
616
617		done := false
618
619		for i, j = 1, 0; ; i++ {
620			for last = c.ranges[j].last; ; i++ {
621				if i == len(c.ranges) || last == utf8.MaxRune {
622					done = true
623					break
624				}
625
626				CurrentRange := c.ranges[i]
627				if CurrentRange.first > last+1 {
628					break
629				}
630
631				if last < CurrentRange.last {
632					last = CurrentRange.last
633				}
634			}
635
636			c.ranges[j] = singleRange{first: c.ranges[j].first, last: last}
637
638			j++
639
640			if done {
641				break
642			}
643
644			if j < i {
645				c.ranges[j] = c.ranges[i]
646			}
647		}
648
649		c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...)
650	}
651}
652
653// Adds to the class any lowercase versions of characters already
654// in the class. Used for case-insensitivity.
655func (c *CharSet) addLowercase() {
656	if c.anything {
657		return
658	}
659	toAdd := []singleRange{}
660	for i := 0; i < len(c.ranges); i++ {
661		r := c.ranges[i]
662		if r.first == r.last {
663			lower := unicode.ToLower(r.first)
664			c.ranges[i] = singleRange{first: lower, last: lower}
665		} else {
666			toAdd = append(toAdd, r)
667		}
668	}
669
670	for _, r := range toAdd {
671		c.addLowercaseRange(r.first, r.last)
672	}
673	c.canonicalize()
674}
675
676/**************************************************************************
677    Let U be the set of Unicode character values and let L be the lowercase
678    function, mapping from U to U. To perform case insensitive matching of
679    character sets, we need to be able to map an interval I in U, say
680
681        I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
682
683    to a set A such that A contains L(I) and A is contained in the union of
684    I and L(I).
685
686    The table below partitions U into intervals on which L is non-decreasing.
687    Thus, for any interval J = [a, b] contained in one of these intervals,
688    L(J) is contained in [L(a), L(b)].
689
690    It is also true that for any such J, [L(a), L(b)] is contained in the
691    union of J and L(J). This does not follow from L being non-decreasing on
692    these intervals. It follows from the nature of the L on each interval.
693    On each interval, L has one of the following forms:
694
695        (1) L(ch) = constant            (LowercaseSet)
696        (2) L(ch) = ch + offset         (LowercaseAdd)
697        (3) L(ch) = ch | 1              (LowercaseBor)
698        (4) L(ch) = ch + (ch & 1)       (LowercaseBad)
699
700    It is easy to verify that for any of these forms [L(a), L(b)] is
701    contained in the union of [a, b] and L([a, b]).
702***************************************************************************/
703
704const (
705	LowercaseSet = 0 // Set to arg.
706	LowercaseAdd = 1 // Add arg.
707	LowercaseBor = 2 // Bitwise or with 1.
708	LowercaseBad = 3 // Bitwise and with 1 and add original.
709)
710
711type lcMap struct {
712	chMin, chMax rune
713	op, data     int32
714}
715
716var lcTable = []lcMap{
717	lcMap{'\u0041', '\u005A', LowercaseAdd, 32},
718	lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32},
719	lcMap{'\u0100', '\u012E', LowercaseBor, 0},
720	lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069},
721	lcMap{'\u0132', '\u0136', LowercaseBor, 0},
722	lcMap{'\u0139', '\u0147', LowercaseBad, 0},
723	lcMap{'\u014A', '\u0176', LowercaseBor, 0},
724	lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF},
725	lcMap{'\u0179', '\u017D', LowercaseBad, 0},
726	lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253},
727	lcMap{'\u0182', '\u0184', LowercaseBor, 0},
728	lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254},
729	lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188},
730	lcMap{'\u0189', '\u018A', LowercaseAdd, 205},
731	lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C},
732	lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD},
733	lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259},
734	lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B},
735	lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192},
736	lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260},
737	lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263},
738	lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269},
739	lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268},
740	lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199},
741	lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F},
742	lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272},
743	lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275},
744	lcMap{'\u01A0', '\u01A4', LowercaseBor, 0},
745	lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8},
746	lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283},
747	lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD},
748	lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288},
749	lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0},
750	lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217},
751	lcMap{'\u01B3', '\u01B5', LowercaseBad, 0},
752	lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292},
753	lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9},
754	lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD},
755	lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6},
756	lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9},
757	lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC},
758	lcMap{'\u01CD', '\u01DB', LowercaseBad, 0},
759	lcMap{'\u01DE', '\u01EE', LowercaseBor, 0},
760	lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3},
761	lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5},
762	lcMap{'\u01FA', '\u0216', LowercaseBor, 0},
763	lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC},
764	lcMap{'\u0388', '\u038A', LowercaseAdd, 37},
765	lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC},
766	lcMap{'\u038E', '\u038F', LowercaseAdd, 63},
767	lcMap{'\u0391', '\u03AB', LowercaseAdd, 32},
768	lcMap{'\u03E2', '\u03EE', LowercaseBor, 0},
769	lcMap{'\u0401', '\u040F', LowercaseAdd, 80},
770	lcMap{'\u0410', '\u042F', LowercaseAdd, 32},
771	lcMap{'\u0460', '\u0480', LowercaseBor, 0},
772	lcMap{'\u0490', '\u04BE', LowercaseBor, 0},
773	lcMap{'\u04C1', '\u04C3', LowercaseBad, 0},
774	lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8},
775	lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC},
776	lcMap{'\u04D0', '\u04EA', LowercaseBor, 0},
777	lcMap{'\u04EE', '\u04F4', LowercaseBor, 0},
778	lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9},
779	lcMap{'\u0531', '\u0556', LowercaseAdd, 48},
780	lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48},
781	lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0},
782	lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8},
783	lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8},
784	lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8},
785	lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8},
786	lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8},
787	lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51},
788	lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53},
789	lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55},
790	lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57},
791	lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8},
792	lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8},
793	lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8},
794	lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8},
795	lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8},
796	lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74},
797	lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3},
798	lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86},
799	lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3},
800	lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8},
801	lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100},
802	lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8},
803	lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112},
804	lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5},
805	lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128},
806	lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126},
807	lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3},
808	lcMap{'\u2160', '\u216F', LowercaseAdd, 16},
809	lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26},
810	lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32},
811}
812
813func (c *CharSet) addLowercaseRange(chMin, chMax rune) {
814	var i, iMax, iMid int
815	var chMinT, chMaxT rune
816	var lc lcMap
817
818	for i, iMax = 0, len(lcTable); i < iMax; {
819		iMid = (i + iMax) / 2
820		if lcTable[iMid].chMax < chMin {
821			i = iMid + 1
822		} else {
823			iMax = iMid
824		}
825	}
826
827	for ; i < len(lcTable); i++ {
828		lc = lcTable[i]
829		if lc.chMin > chMax {
830			return
831		}
832		chMinT = lc.chMin
833		if chMinT < chMin {
834			chMinT = chMin
835		}
836
837		chMaxT = lc.chMax
838		if chMaxT > chMax {
839			chMaxT = chMax
840		}
841
842		switch lc.op {
843		case LowercaseSet:
844			chMinT = rune(lc.data)
845			chMaxT = rune(lc.data)
846			break
847		case LowercaseAdd:
848			chMinT += lc.data
849			chMaxT += lc.data
850			break
851		case LowercaseBor:
852			chMinT |= 1
853			chMaxT |= 1
854			break
855		case LowercaseBad:
856			chMinT += (chMinT & 1)
857			chMaxT += (chMaxT & 1)
858			break
859		}
860
861		if chMinT < chMin || chMaxT > chMax {
862			c.addRange(chMinT, chMaxT)
863		}
864	}
865}