parser.go

   1package syntax
   2
   3import (
   4	"fmt"
   5	"math"
   6	"os"
   7	"sort"
   8	"strconv"
   9	"unicode"
  10)
  11
  12type RegexOptions int32
  13
  14const (
  15	IgnoreCase              RegexOptions = 0x0001 // "i"
  16	Multiline                            = 0x0002 // "m"
  17	ExplicitCapture                      = 0x0004 // "n"
  18	Compiled                             = 0x0008 // "c"
  19	Singleline                           = 0x0010 // "s"
  20	IgnorePatternWhitespace              = 0x0020 // "x"
  21	RightToLeft                          = 0x0040 // "r"
  22	Debug                                = 0x0080 // "d"
  23	ECMAScript                           = 0x0100 // "e"
  24	RE2                                  = 0x0200 // RE2 compat mode
  25)
  26
  27func optionFromCode(ch rune) RegexOptions {
  28	// case-insensitive
  29	switch ch {
  30	case 'i', 'I':
  31		return IgnoreCase
  32	case 'r', 'R':
  33		return RightToLeft
  34	case 'm', 'M':
  35		return Multiline
  36	case 'n', 'N':
  37		return ExplicitCapture
  38	case 's', 'S':
  39		return Singleline
  40	case 'x', 'X':
  41		return IgnorePatternWhitespace
  42	case 'd', 'D':
  43		return Debug
  44	case 'e', 'E':
  45		return ECMAScript
  46	default:
  47		return 0
  48	}
  49}
  50
  51// An Error describes a failure to parse a regular expression
  52// and gives the offending expression.
  53type Error struct {
  54	Code ErrorCode
  55	Expr string
  56	Args []interface{}
  57}
  58
  59func (e *Error) Error() string {
  60	if len(e.Args) == 0 {
  61		return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`"
  62	}
  63	return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`"
  64}
  65
  66// An ErrorCode describes a failure to parse a regular expression.
  67type ErrorCode string
  68
  69const (
  70	// internal issue
  71	ErrInternalError ErrorCode = "regexp/syntax: internal error"
  72	// Parser errors
  73	ErrUnterminatedComment        = "unterminated comment"
  74	ErrInvalidCharRange           = "invalid character class range"
  75	ErrInvalidRepeatSize          = "invalid repeat count"
  76	ErrInvalidUTF8                = "invalid UTF-8"
  77	ErrCaptureGroupOutOfRange     = "capture group number out of range"
  78	ErrUnexpectedParen            = "unexpected )"
  79	ErrMissingParen               = "missing closing )"
  80	ErrMissingBrace               = "missing closing }"
  81	ErrInvalidRepeatOp            = "invalid nested repetition operator"
  82	ErrMissingRepeatArgument      = "missing argument to repetition operator"
  83	ErrConditionalExpression      = "illegal conditional (?(...)) expression"
  84	ErrTooManyAlternates          = "too many | in (?()|)"
  85	ErrUnrecognizedGrouping       = "unrecognized grouping construct: (%v"
  86	ErrInvalidGroupName           = "invalid group name: group names must begin with a word character and have a matching terminator"
  87	ErrCapNumNotZero              = "capture number cannot be zero"
  88	ErrUndefinedBackRef           = "reference to undefined group number %v"
  89	ErrUndefinedNameRef           = "reference to undefined group name %v"
  90	ErrAlternationCantCapture     = "alternation conditions do not capture and cannot be named"
  91	ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
  92	ErrMalformedReference         = "(?(%v) ) malformed"
  93	ErrUndefinedReference         = "(?(%v) ) reference to undefined group"
  94	ErrIllegalEndEscape           = "illegal \\ at end of pattern"
  95	ErrMalformedSlashP            = "malformed \\p{X} character escape"
  96	ErrIncompleteSlashP           = "incomplete \\p{X} character escape"
  97	ErrUnknownSlashP              = "unknown unicode category, script, or property '%v'"
  98	ErrUnrecognizedEscape         = "unrecognized escape sequence \\%v"
  99	ErrMissingControl             = "missing control character"
 100	ErrUnrecognizedControl        = "unrecognized control character"
 101	ErrTooFewHex                  = "insufficient hexadecimal digits"
 102	ErrInvalidHex                 = "hex values may not be larger than 0x10FFFF"
 103	ErrMalformedNameRef           = "malformed \\k<...> named back reference"
 104	ErrBadClassInCharRange        = "cannot include class \\%v in character range"
 105	ErrUnterminatedBracket        = "unterminated [] set"
 106	ErrSubtractionMustBeLast      = "a subtraction must be the last element in a character class"
 107	ErrReversedCharRange          = "[x-y] range in reverse order"
 108)
 109
 110func (e ErrorCode) String() string {
 111	return string(e)
 112}
 113
 114type parser struct {
 115	stack         *regexNode
 116	group         *regexNode
 117	alternation   *regexNode
 118	concatenation *regexNode
 119	unit          *regexNode
 120
 121	patternRaw string
 122	pattern    []rune
 123
 124	currentPos  int
 125	specialCase *unicode.SpecialCase
 126
 127	autocap  int
 128	capcount int
 129	captop   int
 130	capsize  int
 131
 132	caps     map[int]int
 133	capnames map[string]int
 134
 135	capnumlist  []int
 136	capnamelist []string
 137
 138	options         RegexOptions
 139	optionsStack    []RegexOptions
 140	ignoreNextParen bool
 141}
 142
 143const (
 144	maxValueDiv10 int = math.MaxInt32 / 10
 145	maxValueMod10     = math.MaxInt32 % 10
 146)
 147
 148// Parse converts a regex string into a parse tree
 149func Parse(re string, op RegexOptions) (*RegexTree, error) {
 150	p := parser{
 151		options: op,
 152		caps:    make(map[int]int),
 153	}
 154	p.setPattern(re)
 155
 156	if err := p.countCaptures(); err != nil {
 157		return nil, err
 158	}
 159
 160	p.reset(op)
 161	root, err := p.scanRegex()
 162
 163	if err != nil {
 164		return nil, err
 165	}
 166	tree := &RegexTree{
 167		root:       root,
 168		caps:       p.caps,
 169		capnumlist: p.capnumlist,
 170		captop:     p.captop,
 171		Capnames:   p.capnames,
 172		Caplist:    p.capnamelist,
 173		options:    op,
 174	}
 175
 176	if tree.options&Debug > 0 {
 177		os.Stdout.WriteString(tree.Dump())
 178	}
 179
 180	return tree, nil
 181}
 182
 183func (p *parser) setPattern(pattern string) {
 184	p.patternRaw = pattern
 185	p.pattern = make([]rune, 0, len(pattern))
 186
 187	//populate our rune array to handle utf8 encoding
 188	for _, r := range pattern {
 189		p.pattern = append(p.pattern, r)
 190	}
 191}
 192func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
 193	return &Error{Code: code, Expr: p.patternRaw, Args: args}
 194}
 195
 196func (p *parser) noteCaptureSlot(i, pos int) {
 197	if _, ok := p.caps[i]; !ok {
 198		// the rhs of the hashtable isn't used in the parser
 199		p.caps[i] = pos
 200		p.capcount++
 201
 202		if p.captop <= i {
 203			if i == math.MaxInt32 {
 204				p.captop = i
 205			} else {
 206				p.captop = i + 1
 207			}
 208		}
 209	}
 210}
 211
 212func (p *parser) noteCaptureName(name string, pos int) {
 213	if p.capnames == nil {
 214		p.capnames = make(map[string]int)
 215	}
 216
 217	if _, ok := p.capnames[name]; !ok {
 218		p.capnames[name] = pos
 219		p.capnamelist = append(p.capnamelist, name)
 220	}
 221}
 222
 223func (p *parser) assignNameSlots() {
 224	if p.capnames != nil {
 225		for _, name := range p.capnamelist {
 226			for p.isCaptureSlot(p.autocap) {
 227				p.autocap++
 228			}
 229			pos := p.capnames[name]
 230			p.capnames[name] = p.autocap
 231			p.noteCaptureSlot(p.autocap, pos)
 232
 233			p.autocap++
 234		}
 235	}
 236
 237	// if the caps array has at least one gap, construct the list of used slots
 238	if p.capcount < p.captop {
 239		p.capnumlist = make([]int, p.capcount)
 240		i := 0
 241
 242		for k := range p.caps {
 243			p.capnumlist[i] = k
 244			i++
 245		}
 246
 247		sort.Ints(p.capnumlist)
 248	}
 249
 250	// merge capsnumlist into capnamelist
 251	if p.capnames != nil || p.capnumlist != nil {
 252		var oldcapnamelist []string
 253		var next int
 254		var k int
 255
 256		if p.capnames == nil {
 257			oldcapnamelist = nil
 258			p.capnames = make(map[string]int)
 259			p.capnamelist = []string{}
 260			next = -1
 261		} else {
 262			oldcapnamelist = p.capnamelist
 263			p.capnamelist = []string{}
 264			next = p.capnames[oldcapnamelist[0]]
 265		}
 266
 267		for i := 0; i < p.capcount; i++ {
 268			j := i
 269			if p.capnumlist != nil {
 270				j = p.capnumlist[i]
 271			}
 272
 273			if next == j {
 274				p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
 275				k++
 276
 277				if k == len(oldcapnamelist) {
 278					next = -1
 279				} else {
 280					next = p.capnames[oldcapnamelist[k]]
 281				}
 282
 283			} else {
 284				//feature: culture?
 285				str := strconv.Itoa(j)
 286				p.capnamelist = append(p.capnamelist, str)
 287				p.capnames[str] = j
 288			}
 289		}
 290	}
 291}
 292
 293func (p *parser) consumeAutocap() int {
 294	r := p.autocap
 295	p.autocap++
 296	return r
 297}
 298
 299// CountCaptures is a prescanner for deducing the slots used for
 300// captures by doing a partial tokenization of the pattern.
 301func (p *parser) countCaptures() error {
 302	var ch rune
 303
 304	p.noteCaptureSlot(0, 0)
 305
 306	p.autocap = 1
 307
 308	for p.charsRight() > 0 {
 309		pos := p.textpos()
 310		ch = p.moveRightGetChar()
 311		switch ch {
 312		case '\\':
 313			if p.charsRight() > 0 {
 314				p.scanBackslash(true)
 315			}
 316
 317		case '#':
 318			if p.useOptionX() {
 319				p.moveLeft()
 320				p.scanBlank()
 321			}
 322
 323		case '[':
 324			p.scanCharSet(false, true)
 325
 326		case ')':
 327			if !p.emptyOptionsStack() {
 328				p.popOptions()
 329			}
 330
 331		case '(':
 332			if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
 333				p.moveLeft()
 334				p.scanBlank()
 335			} else {
 336				p.pushOptions()
 337				if p.charsRight() > 0 && p.rightChar(0) == '?' {
 338					// we have (?...
 339					p.moveRight(1)
 340
 341					if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
 342						// named group: (?<... or (?'...
 343
 344						p.moveRight(1)
 345						ch = p.rightChar(0)
 346
 347						if ch != '0' && IsWordChar(ch) {
 348							if ch >= '1' && ch <= '9' {
 349								dec, err := p.scanDecimal()
 350								if err != nil {
 351									return err
 352								}
 353								p.noteCaptureSlot(dec, pos)
 354							} else {
 355								p.noteCaptureName(p.scanCapname(), pos)
 356							}
 357						}
 358					} else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
 359						// RE2-compat (?P<)
 360						p.moveRight(2)
 361						ch = p.rightChar(0)
 362						if IsWordChar(ch) {
 363							p.noteCaptureName(p.scanCapname(), pos)
 364						}
 365
 366					} else {
 367						// (?...
 368
 369						// get the options if it's an option construct (?cimsx-cimsx...)
 370						p.scanOptions()
 371
 372						if p.charsRight() > 0 {
 373							if p.rightChar(0) == ')' {
 374								// (?cimsx-cimsx)
 375								p.moveRight(1)
 376								p.popKeepOptions()
 377							} else if p.rightChar(0) == '(' {
 378								// alternation construct: (?(foo)yes|no)
 379								// ignore the next paren so we don't capture the condition
 380								p.ignoreNextParen = true
 381
 382								// break from here so we don't reset ignoreNextParen
 383								continue
 384							}
 385						}
 386					}
 387				} else {
 388					if !p.useOptionN() && !p.ignoreNextParen {
 389						p.noteCaptureSlot(p.consumeAutocap(), pos)
 390					}
 391				}
 392			}
 393
 394			p.ignoreNextParen = false
 395
 396		}
 397	}
 398
 399	p.assignNameSlots()
 400	return nil
 401}
 402
 403func (p *parser) reset(topopts RegexOptions) {
 404	p.currentPos = 0
 405	p.autocap = 1
 406	p.ignoreNextParen = false
 407
 408	if len(p.optionsStack) > 0 {
 409		p.optionsStack = p.optionsStack[:0]
 410	}
 411
 412	p.options = topopts
 413	p.stack = nil
 414}
 415
 416func (p *parser) scanRegex() (*regexNode, error) {
 417	ch := '@' // nonspecial ch, means at beginning
 418	isQuant := false
 419
 420	p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))
 421
 422	for p.charsRight() > 0 {
 423		wasPrevQuantifier := isQuant
 424		isQuant = false
 425
 426		if err := p.scanBlank(); err != nil {
 427			return nil, err
 428		}
 429
 430		startpos := p.textpos()
 431
 432		// move past all of the normal characters.  We'll stop when we hit some kind of control character,
 433		// or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
 434		if p.useOptionX() {
 435			for p.charsRight() > 0 {
 436				ch = p.rightChar(0)
 437				//UGLY: clean up, this is ugly
 438				if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
 439					break
 440				}
 441				p.moveRight(1)
 442			}
 443		} else {
 444			for p.charsRight() > 0 {
 445				ch = p.rightChar(0)
 446				if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
 447					break
 448				}
 449				p.moveRight(1)
 450			}
 451		}
 452
 453		endpos := p.textpos()
 454
 455		p.scanBlank()
 456
 457		if p.charsRight() == 0 {
 458			ch = '!' // nonspecial, means at end
 459		} else if ch = p.rightChar(0); isSpecial(ch) {
 460			isQuant = isQuantifier(ch)
 461			p.moveRight(1)
 462		} else {
 463			ch = ' ' // nonspecial, means at ordinary char
 464		}
 465
 466		if startpos < endpos {
 467			cchUnquantified := endpos - startpos
 468			if isQuant {
 469				cchUnquantified--
 470			}
 471			wasPrevQuantifier = false
 472
 473			if cchUnquantified > 0 {
 474				p.addToConcatenate(startpos, cchUnquantified, false)
 475			}
 476
 477			if isQuant {
 478				p.addUnitOne(p.charAt(endpos - 1))
 479			}
 480		}
 481
 482		switch ch {
 483		case '!':
 484			goto BreakOuterScan
 485
 486		case ' ':
 487			goto ContinueOuterScan
 488
 489		case '[':
 490			cc, err := p.scanCharSet(p.useOptionI(), false)
 491			if err != nil {
 492				return nil, err
 493			}
 494			p.addUnitSet(cc)
 495
 496		case '(':
 497			p.pushOptions()
 498
 499			if grouper, err := p.scanGroupOpen(); err != nil {
 500				return nil, err
 501			} else if grouper == nil {
 502				p.popKeepOptions()
 503			} else {
 504				p.pushGroup()
 505				p.startGroup(grouper)
 506			}
 507
 508			continue
 509
 510		case '|':
 511			p.addAlternate()
 512			goto ContinueOuterScan
 513
 514		case ')':
 515			if p.emptyStack() {
 516				return nil, p.getErr(ErrUnexpectedParen)
 517			}
 518
 519			if err := p.addGroup(); err != nil {
 520				return nil, err
 521			}
 522			if err := p.popGroup(); err != nil {
 523				return nil, err
 524			}
 525			p.popOptions()
 526
 527			if p.unit == nil {
 528				goto ContinueOuterScan
 529			}
 530
 531		case '\\':
 532			n, err := p.scanBackslash(false)
 533			if err != nil {
 534				return nil, err
 535			}
 536			p.addUnitNode(n)
 537
 538		case '^':
 539			if p.useOptionM() {
 540				p.addUnitType(ntBol)
 541			} else {
 542				p.addUnitType(ntBeginning)
 543			}
 544
 545		case '$':
 546			if p.useOptionM() {
 547				p.addUnitType(ntEol)
 548			} else {
 549				p.addUnitType(ntEndZ)
 550			}
 551
 552		case '.':
 553			if p.useOptionE() {
 554				p.addUnitSet(ECMAAnyClass())
 555			} else if p.useOptionS() {
 556				p.addUnitSet(AnyClass())
 557			} else {
 558				p.addUnitNotone('\n')
 559			}
 560
 561		case '{', '*', '+', '?':
 562			if p.unit == nil {
 563				if wasPrevQuantifier {
 564					return nil, p.getErr(ErrInvalidRepeatOp)
 565				} else {
 566					return nil, p.getErr(ErrMissingRepeatArgument)
 567				}
 568			}
 569			p.moveLeft()
 570
 571		default:
 572			return nil, p.getErr(ErrInternalError)
 573		}
 574
 575		if err := p.scanBlank(); err != nil {
 576			return nil, err
 577		}
 578
 579		if p.charsRight() > 0 {
 580			isQuant = p.isTrueQuantifier()
 581		}
 582		if p.charsRight() == 0 || !isQuant {
 583			//maintain odd C# assignment order -- not sure if required, could clean up?
 584			p.addConcatenate()
 585			goto ContinueOuterScan
 586		}
 587
 588		ch = p.moveRightGetChar()
 589
 590		// Handle quantifiers
 591		for p.unit != nil {
 592			var min, max int
 593			var lazy bool
 594
 595			switch ch {
 596			case '*':
 597				min = 0
 598				max = math.MaxInt32
 599
 600			case '?':
 601				min = 0
 602				max = 1
 603
 604			case '+':
 605				min = 1
 606				max = math.MaxInt32
 607
 608			case '{':
 609				{
 610					var err error
 611					startpos = p.textpos()
 612					if min, err = p.scanDecimal(); err != nil {
 613						return nil, err
 614					}
 615					max = min
 616					if startpos < p.textpos() {
 617						if p.charsRight() > 0 && p.rightChar(0) == ',' {
 618							p.moveRight(1)
 619							if p.charsRight() == 0 || p.rightChar(0) == '}' {
 620								max = math.MaxInt32
 621							} else {
 622								if max, err = p.scanDecimal(); err != nil {
 623									return nil, err
 624								}
 625							}
 626						}
 627					}
 628
 629					if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
 630						p.addConcatenate()
 631						p.textto(startpos - 1)
 632						goto ContinueOuterScan
 633					}
 634				}
 635
 636			default:
 637				return nil, p.getErr(ErrInternalError)
 638			}
 639
 640			if err := p.scanBlank(); err != nil {
 641				return nil, err
 642			}
 643
 644			if p.charsRight() == 0 || p.rightChar(0) != '?' {
 645				lazy = false
 646			} else {
 647				p.moveRight(1)
 648				lazy = true
 649			}
 650
 651			if min > max {
 652				return nil, p.getErr(ErrInvalidRepeatSize)
 653			}
 654
 655			p.addConcatenate3(lazy, min, max)
 656		}
 657
 658	ContinueOuterScan:
 659	}
 660
 661BreakOuterScan:
 662	;
 663
 664	if !p.emptyStack() {
 665		return nil, p.getErr(ErrMissingParen)
 666	}
 667
 668	if err := p.addGroup(); err != nil {
 669		return nil, err
 670	}
 671
 672	return p.unit, nil
 673
 674}
 675
 676/*
 677 * Simple parsing for replacement patterns
 678 */
 679func (p *parser) scanReplacement() (*regexNode, error) {
 680	var c, startpos int
 681
 682	p.concatenation = newRegexNode(ntConcatenate, p.options)
 683
 684	for {
 685		c = p.charsRight()
 686		if c == 0 {
 687			break
 688		}
 689
 690		startpos = p.textpos()
 691
 692		for c > 0 && p.rightChar(0) != '$' {
 693			p.moveRight(1)
 694			c--
 695		}
 696
 697		p.addToConcatenate(startpos, p.textpos()-startpos, true)
 698
 699		if c > 0 {
 700			if p.moveRightGetChar() == '$' {
 701				n, err := p.scanDollar()
 702				if err != nil {
 703					return nil, err
 704				}
 705				p.addUnitNode(n)
 706			}
 707			p.addConcatenate()
 708		}
 709	}
 710
 711	return p.concatenation, nil
 712}
 713
 714/*
 715 * Scans $ patterns recognized within replacement patterns
 716 */
 717func (p *parser) scanDollar() (*regexNode, error) {
 718	if p.charsRight() == 0 {
 719		return newRegexNodeCh(ntOne, p.options, '$'), nil
 720	}
 721
 722	ch := p.rightChar(0)
 723	angled := false
 724	backpos := p.textpos()
 725	lastEndPos := backpos
 726
 727	// Note angle
 728
 729	if ch == '{' && p.charsRight() > 1 {
 730		angled = true
 731		p.moveRight(1)
 732		ch = p.rightChar(0)
 733	}
 734
 735	// Try to parse backreference: \1 or \{1} or \{cap}
 736
 737	if ch >= '0' && ch <= '9' {
 738		if !angled && p.useOptionE() {
 739			capnum := -1
 740			newcapnum := int(ch - '0')
 741			p.moveRight(1)
 742			if p.isCaptureSlot(newcapnum) {
 743				capnum = newcapnum
 744				lastEndPos = p.textpos()
 745			}
 746
 747			for p.charsRight() > 0 {
 748				ch = p.rightChar(0)
 749				if ch < '0' || ch > '9' {
 750					break
 751				}
 752				digit := int(ch - '0')
 753				if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
 754					return nil, p.getErr(ErrCaptureGroupOutOfRange)
 755				}
 756
 757				newcapnum = newcapnum*10 + digit
 758
 759				p.moveRight(1)
 760				if p.isCaptureSlot(newcapnum) {
 761					capnum = newcapnum
 762					lastEndPos = p.textpos()
 763				}
 764			}
 765			p.textto(lastEndPos)
 766			if capnum >= 0 {
 767				return newRegexNodeM(ntRef, p.options, capnum), nil
 768			}
 769		} else {
 770			capnum, err := p.scanDecimal()
 771			if err != nil {
 772				return nil, err
 773			}
 774			if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
 775				if p.isCaptureSlot(capnum) {
 776					return newRegexNodeM(ntRef, p.options, capnum), nil
 777				}
 778			}
 779		}
 780	} else if angled && IsWordChar(ch) {
 781		capname := p.scanCapname()
 782
 783		if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
 784			if p.isCaptureName(capname) {
 785				return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
 786			}
 787		}
 788	} else if !angled {
 789		capnum := 1
 790
 791		switch ch {
 792		case '$':
 793			p.moveRight(1)
 794			return newRegexNodeCh(ntOne, p.options, '$'), nil
 795		case '&':
 796			capnum = 0
 797		case '`':
 798			capnum = replaceLeftPortion
 799		case '\'':
 800			capnum = replaceRightPortion
 801		case '+':
 802			capnum = replaceLastGroup
 803		case '_':
 804			capnum = replaceWholeString
 805		}
 806
 807		if capnum != 1 {
 808			p.moveRight(1)
 809			return newRegexNodeM(ntRef, p.options, capnum), nil
 810		}
 811	}
 812
 813	// unrecognized $: literalize
 814
 815	p.textto(backpos)
 816	return newRegexNodeCh(ntOne, p.options, '$'), nil
 817}
 818
 819// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
 820// a RegexNode for the type of group scanned, or nil if the group
 821// simply changed options (?cimsx-cimsx) or was a comment (#...).
 822func (p *parser) scanGroupOpen() (*regexNode, error) {
 823	var ch rune
 824	var nt nodeType
 825	var err error
 826	close := '>'
 827	start := p.textpos()
 828
 829	// just return a RegexNode if we have:
 830	// 1. "(" followed by nothing
 831	// 2. "(x" where x != ?
 832	// 3. "(?)"
 833	if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
 834		if p.useOptionN() || p.ignoreNextParen {
 835			p.ignoreNextParen = false
 836			return newRegexNode(ntGroup, p.options), nil
 837		}
 838		return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
 839	}
 840
 841	p.moveRight(1)
 842
 843	for {
 844		if p.charsRight() == 0 {
 845			break
 846		}
 847
 848		switch ch = p.moveRightGetChar(); ch {
 849		case ':':
 850			nt = ntGroup
 851
 852		case '=':
 853			p.options &= ^RightToLeft
 854			nt = ntRequire
 855
 856		case '!':
 857			p.options &= ^RightToLeft
 858			nt = ntPrevent
 859
 860		case '>':
 861			nt = ntGreedy
 862
 863		case '\'':
 864			close = '\''
 865			fallthrough
 866
 867		case '<':
 868			if p.charsRight() == 0 {
 869				goto BreakRecognize
 870			}
 871
 872			switch ch = p.moveRightGetChar(); ch {
 873			case '=':
 874				if close == '\'' {
 875					goto BreakRecognize
 876				}
 877
 878				p.options |= RightToLeft
 879				nt = ntRequire
 880
 881			case '!':
 882				if close == '\'' {
 883					goto BreakRecognize
 884				}
 885
 886				p.options |= RightToLeft
 887				nt = ntPrevent
 888
 889			default:
 890				p.moveLeft()
 891				capnum := -1
 892				uncapnum := -1
 893				proceed := false
 894
 895				// grab part before -
 896
 897				if ch >= '0' && ch <= '9' {
 898					if capnum, err = p.scanDecimal(); err != nil {
 899						return nil, err
 900					}
 901
 902					if !p.isCaptureSlot(capnum) {
 903						capnum = -1
 904					}
 905
 906					// check if we have bogus characters after the number
 907					if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
 908						return nil, p.getErr(ErrInvalidGroupName)
 909					}
 910					if capnum == 0 {
 911						return nil, p.getErr(ErrCapNumNotZero)
 912					}
 913				} else if IsWordChar(ch) {
 914					capname := p.scanCapname()
 915
 916					if p.isCaptureName(capname) {
 917						capnum = p.captureSlotFromName(capname)
 918					}
 919
 920					// check if we have bogus character after the name
 921					if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
 922						return nil, p.getErr(ErrInvalidGroupName)
 923					}
 924				} else if ch == '-' {
 925					proceed = true
 926				} else {
 927					// bad group name - starts with something other than a word character and isn't a number
 928					return nil, p.getErr(ErrInvalidGroupName)
 929				}
 930
 931				// grab part after - if any
 932
 933				if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
 934					p.moveRight(1)
 935
 936					//no more chars left, no closing char, etc
 937					if p.charsRight() == 0 {
 938						return nil, p.getErr(ErrInvalidGroupName)
 939					}
 940
 941					ch = p.rightChar(0)
 942					if ch >= '0' && ch <= '9' {
 943						if uncapnum, err = p.scanDecimal(); err != nil {
 944							return nil, err
 945						}
 946
 947						if !p.isCaptureSlot(uncapnum) {
 948							return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
 949						}
 950
 951						// check if we have bogus characters after the number
 952						if p.charsRight() > 0 && p.rightChar(0) != close {
 953							return nil, p.getErr(ErrInvalidGroupName)
 954						}
 955					} else if IsWordChar(ch) {
 956						uncapname := p.scanCapname()
 957
 958						if !p.isCaptureName(uncapname) {
 959							return nil, p.getErr(ErrUndefinedNameRef, uncapname)
 960						}
 961						uncapnum = p.captureSlotFromName(uncapname)
 962
 963						// check if we have bogus character after the name
 964						if p.charsRight() > 0 && p.rightChar(0) != close {
 965							return nil, p.getErr(ErrInvalidGroupName)
 966						}
 967					} else {
 968						// bad group name - starts with something other than a word character and isn't a number
 969						return nil, p.getErr(ErrInvalidGroupName)
 970					}
 971				}
 972
 973				// actually make the node
 974
 975				if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
 976					return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
 977				}
 978				goto BreakRecognize
 979			}
 980
 981		case '(':
 982			// alternation construct (?(...) | )
 983
 984			parenPos := p.textpos()
 985			if p.charsRight() > 0 {
 986				ch = p.rightChar(0)
 987
 988				// check if the alternation condition is a backref
 989				if ch >= '0' && ch <= '9' {
 990					var capnum int
 991					if capnum, err = p.scanDecimal(); err != nil {
 992						return nil, err
 993					}
 994					if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
 995						if p.isCaptureSlot(capnum) {
 996							return newRegexNodeM(ntTestref, p.options, capnum), nil
 997						}
 998						return nil, p.getErr(ErrUndefinedReference, capnum)
 999					}
1000
1001					return nil, p.getErr(ErrMalformedReference, capnum)
1002
1003				} else if IsWordChar(ch) {
1004					capname := p.scanCapname()
1005
1006					if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
1007						return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
1008					}
1009				}
1010			}
1011			// not a backref
1012			nt = ntTestgroup
1013			p.textto(parenPos - 1)   // jump to the start of the parentheses
1014			p.ignoreNextParen = true // but make sure we don't try to capture the insides
1015
1016			charsRight := p.charsRight()
1017			if charsRight >= 3 && p.rightChar(1) == '?' {
1018				rightchar2 := p.rightChar(2)
1019				// disallow comments in the condition
1020				if rightchar2 == '#' {
1021					return nil, p.getErr(ErrAlternationCantHaveComment)
1022				}
1023
1024				// disallow named capture group (?<..>..) in the condition
1025				if rightchar2 == '\'' {
1026					return nil, p.getErr(ErrAlternationCantCapture)
1027				}
1028
1029				if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
1030					return nil, p.getErr(ErrAlternationCantCapture)
1031				}
1032			}
1033
1034		case 'P':
1035			if p.useRE2() {
1036				// support for P<name> syntax
1037				if p.charsRight() < 3 {
1038					goto BreakRecognize
1039				}
1040
1041				ch = p.moveRightGetChar()
1042				if ch != '<' {
1043					goto BreakRecognize
1044				}
1045
1046				ch = p.moveRightGetChar()
1047				p.moveLeft()
1048
1049				if IsWordChar(ch) {
1050					capnum := -1
1051					capname := p.scanCapname()
1052
1053					if p.isCaptureName(capname) {
1054						capnum = p.captureSlotFromName(capname)
1055					}
1056
1057					// check if we have bogus character after the name
1058					if p.charsRight() > 0 && p.rightChar(0) != '>' {
1059						return nil, p.getErr(ErrInvalidGroupName)
1060					}
1061
1062					// actually make the node
1063
1064					if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
1065						return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
1066					}
1067					goto BreakRecognize
1068
1069				} else {
1070					// bad group name - starts with something other than a word character and isn't a number
1071					return nil, p.getErr(ErrInvalidGroupName)
1072				}
1073			}
1074			// if we're not using RE2 compat mode then
1075			// we just behave like normal
1076			fallthrough
1077
1078		default:
1079			p.moveLeft()
1080
1081			nt = ntGroup
1082			// disallow options in the children of a testgroup node
1083			if p.group.t != ntTestgroup {
1084				p.scanOptions()
1085			}
1086			if p.charsRight() == 0 {
1087				goto BreakRecognize
1088			}
1089
1090			if ch = p.moveRightGetChar(); ch == ')' {
1091				return nil, nil
1092			}
1093
1094			if ch != ':' {
1095				goto BreakRecognize
1096			}
1097
1098		}
1099
1100		return newRegexNode(nt, p.options), nil
1101	}
1102
1103BreakRecognize:
1104
1105	// break Recognize comes here
1106
1107	return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
1108}
1109
1110// scans backslash specials and basics
1111func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
1112
1113	if p.charsRight() == 0 {
1114		return nil, p.getErr(ErrIllegalEndEscape)
1115	}
1116
1117	switch ch := p.rightChar(0); ch {
1118	case 'b', 'B', 'A', 'G', 'Z', 'z':
1119		p.moveRight(1)
1120		return newRegexNode(p.typeFromCode(ch), p.options), nil
1121
1122	case 'w':
1123		p.moveRight(1)
1124		if p.useOptionE() {
1125			return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
1126		}
1127		return newRegexNodeSet(ntSet, p.options, WordClass()), nil
1128
1129	case 'W':
1130		p.moveRight(1)
1131		if p.useOptionE() {
1132			return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
1133		}
1134		return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
1135
1136	case 's':
1137		p.moveRight(1)
1138		if p.useOptionE() {
1139			return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
1140		}
1141		return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
1142
1143	case 'S':
1144		p.moveRight(1)
1145		if p.useOptionE() {
1146			return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
1147		}
1148		return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
1149
1150	case 'd':
1151		p.moveRight(1)
1152		if p.useOptionE() {
1153			return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
1154		}
1155		return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
1156
1157	case 'D':
1158		p.moveRight(1)
1159		if p.useOptionE() {
1160			return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
1161		}
1162		return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
1163
1164	case 'p', 'P':
1165		p.moveRight(1)
1166		prop, err := p.parseProperty()
1167		if err != nil {
1168			return nil, err
1169		}
1170		cc := &CharSet{}
1171		cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
1172		if p.useOptionI() {
1173			cc.addLowercase()
1174		}
1175
1176		return newRegexNodeSet(ntSet, p.options, cc), nil
1177
1178	default:
1179		return p.scanBasicBackslash(scanOnly)
1180	}
1181}
1182
1183// Scans \-style backreferences and character escapes
1184func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
1185	if p.charsRight() == 0 {
1186		return nil, p.getErr(ErrIllegalEndEscape)
1187	}
1188	angled := false
1189	close := '\x00'
1190
1191	backpos := p.textpos()
1192	ch := p.rightChar(0)
1193
1194	// allow \k<foo> instead of \<foo>, which is now deprecated
1195
1196	if ch == 'k' {
1197		if p.charsRight() >= 2 {
1198			p.moveRight(1)
1199			ch = p.moveRightGetChar()
1200
1201			if ch == '<' || ch == '\'' {
1202				angled = true
1203				if ch == '\'' {
1204					close = '\''
1205				} else {
1206					close = '>'
1207				}
1208			}
1209		}
1210
1211		if !angled || p.charsRight() <= 0 {
1212			return nil, p.getErr(ErrMalformedNameRef)
1213		}
1214
1215		ch = p.rightChar(0)
1216
1217	} else if (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
1218		angled = true
1219		if ch == '\'' {
1220			close = '\''
1221		} else {
1222			close = '>'
1223		}
1224
1225		p.moveRight(1)
1226		ch = p.rightChar(0)
1227	}
1228
1229	// Try to parse backreference: \<1> or \<cap>
1230
1231	if angled && ch >= '0' && ch <= '9' {
1232		capnum, err := p.scanDecimal()
1233		if err != nil {
1234			return nil, err
1235		}
1236
1237		if p.charsRight() > 0 && p.moveRightGetChar() == close {
1238			if p.isCaptureSlot(capnum) {
1239				return newRegexNodeM(ntRef, p.options, capnum), nil
1240			}
1241			return nil, p.getErr(ErrUndefinedBackRef, capnum)
1242		}
1243	} else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
1244		capnum, err := p.scanDecimal()
1245		if err != nil {
1246			return nil, err
1247		}
1248
1249		if scanOnly {
1250			return nil, nil
1251		}
1252
1253		if p.useOptionE() || p.isCaptureSlot(capnum) {
1254			return newRegexNodeM(ntRef, p.options, capnum), nil
1255		}
1256		if capnum <= 9 {
1257			return nil, p.getErr(ErrUndefinedBackRef, capnum)
1258		}
1259
1260	} else if angled && IsWordChar(ch) {
1261		capname := p.scanCapname()
1262
1263		if p.charsRight() > 0 && p.moveRightGetChar() == close {
1264			if p.isCaptureName(capname) {
1265				return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
1266			}
1267			return nil, p.getErr(ErrUndefinedNameRef, capname)
1268		}
1269	}
1270
1271	// Not backreference: must be char code
1272
1273	p.textto(backpos)
1274	ch, err := p.scanCharEscape()
1275	if err != nil {
1276		return nil, err
1277	}
1278
1279	if p.useOptionI() {
1280		ch = unicode.ToLower(ch)
1281	}
1282
1283	return newRegexNodeCh(ntOne, p.options, ch), nil
1284}
1285
1286// Scans X for \p{X} or \P{X}
1287func (p *parser) parseProperty() (string, error) {
1288	if p.charsRight() < 3 {
1289		return "", p.getErr(ErrIncompleteSlashP)
1290	}
1291	ch := p.moveRightGetChar()
1292	if ch != '{' {
1293		return "", p.getErr(ErrMalformedSlashP)
1294	}
1295
1296	startpos := p.textpos()
1297	for p.charsRight() > 0 {
1298		ch = p.moveRightGetChar()
1299		if !(IsWordChar(ch) || ch == '-') {
1300			p.moveLeft()
1301			break
1302		}
1303	}
1304	capname := string(p.pattern[startpos:p.textpos()])
1305
1306	if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
1307		return "", p.getErr(ErrIncompleteSlashP)
1308	}
1309
1310	if !isValidUnicodeCat(capname) {
1311		return "", p.getErr(ErrUnknownSlashP, capname)
1312	}
1313
1314	return capname, nil
1315}
1316
1317// Returns ReNode type for zero-length assertions with a \ code.
1318func (p *parser) typeFromCode(ch rune) nodeType {
1319	switch ch {
1320	case 'b':
1321		if p.useOptionE() {
1322			return ntECMABoundary
1323		}
1324		return ntBoundary
1325	case 'B':
1326		if p.useOptionE() {
1327			return ntNonECMABoundary
1328		}
1329		return ntNonboundary
1330	case 'A':
1331		return ntBeginning
1332	case 'G':
1333		return ntStart
1334	case 'Z':
1335		return ntEndZ
1336	case 'z':
1337		return ntEnd
1338	default:
1339		return ntNothing
1340	}
1341}
1342
1343// Scans whitespace or x-mode comments.
1344func (p *parser) scanBlank() error {
1345	if p.useOptionX() {
1346		for {
1347			for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
1348				p.moveRight(1)
1349			}
1350
1351			if p.charsRight() == 0 {
1352				break
1353			}
1354
1355			if p.rightChar(0) == '#' {
1356				for p.charsRight() > 0 && p.rightChar(0) != '\n' {
1357					p.moveRight(1)
1358				}
1359			} else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
1360				p.rightChar(1) == '?' && p.rightChar(0) == '(' {
1361				for p.charsRight() > 0 && p.rightChar(0) != ')' {
1362					p.moveRight(1)
1363				}
1364				if p.charsRight() == 0 {
1365					return p.getErr(ErrUnterminatedComment)
1366				}
1367				p.moveRight(1)
1368			} else {
1369				break
1370			}
1371		}
1372	} else {
1373		for {
1374			if p.charsRight() < 3 || p.rightChar(2) != '#' ||
1375				p.rightChar(1) != '?' || p.rightChar(0) != '(' {
1376				return nil
1377			}
1378
1379			for p.charsRight() > 0 && p.rightChar(0) != ')' {
1380				p.moveRight(1)
1381			}
1382			if p.charsRight() == 0 {
1383				return p.getErr(ErrUnterminatedComment)
1384			}
1385			p.moveRight(1)
1386		}
1387	}
1388	return nil
1389}
1390
1391func (p *parser) scanCapname() string {
1392	startpos := p.textpos()
1393
1394	for p.charsRight() > 0 {
1395		if !IsWordChar(p.moveRightGetChar()) {
1396			p.moveLeft()
1397			break
1398		}
1399	}
1400
1401	return string(p.pattern[startpos:p.textpos()])
1402}
1403
1404//Scans contents of [] (not including []'s), and converts to a set.
1405func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
1406	ch := '\x00'
1407	chPrev := '\x00'
1408	inRange := false
1409	firstChar := true
1410	closed := false
1411
1412	var cc *CharSet
1413	if !scanOnly {
1414		cc = &CharSet{}
1415	}
1416
1417	if p.charsRight() > 0 && p.rightChar(0) == '^' {
1418		p.moveRight(1)
1419		if !scanOnly {
1420			cc.negate = true
1421		}
1422	}
1423
1424	for ; p.charsRight() > 0; firstChar = false {
1425		fTranslatedChar := false
1426		ch = p.moveRightGetChar()
1427		if ch == ']' {
1428			if !firstChar {
1429				closed = true
1430				break
1431			} else if p.useOptionE() {
1432				if !scanOnly {
1433					cc.addRanges(NoneClass().ranges)
1434				}
1435				closed = true
1436				break
1437			}
1438
1439		} else if ch == '\\' && p.charsRight() > 0 {
1440			switch ch = p.moveRightGetChar(); ch {
1441			case 'D', 'd':
1442				if !scanOnly {
1443					if inRange {
1444						return nil, p.getErr(ErrBadClassInCharRange, ch)
1445					}
1446					cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)
1447				}
1448				continue
1449
1450			case 'S', 's':
1451				if !scanOnly {
1452					if inRange {
1453						return nil, p.getErr(ErrBadClassInCharRange, ch)
1454					}
1455					cc.addSpace(p.useOptionE(), ch == 'S')
1456				}
1457				continue
1458
1459			case 'W', 'w':
1460				if !scanOnly {
1461					if inRange {
1462						return nil, p.getErr(ErrBadClassInCharRange, ch)
1463					}
1464
1465					cc.addWord(p.useOptionE(), ch == 'W')
1466				}
1467				continue
1468
1469			case 'p', 'P':
1470				if !scanOnly {
1471					if inRange {
1472						return nil, p.getErr(ErrBadClassInCharRange, ch)
1473					}
1474					prop, err := p.parseProperty()
1475					if err != nil {
1476						return nil, err
1477					}
1478					cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
1479				} else {
1480					p.parseProperty()
1481				}
1482
1483				continue
1484
1485			case '-':
1486				if !scanOnly {
1487					cc.addRange(ch, ch)
1488				}
1489				continue
1490
1491			default:
1492				p.moveLeft()
1493				var err error
1494				ch, err = p.scanCharEscape() // non-literal character
1495				if err != nil {
1496					return nil, err
1497				}
1498				fTranslatedChar = true
1499				break // this break will only break out of the switch
1500			}
1501		} else if ch == '[' {
1502			// This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
1503			// It currently doesn't do anything other than skip the whole thing!
1504			if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
1505				savePos := p.textpos()
1506
1507				p.moveRight(1)
1508				negate := false
1509				if p.charsRight() > 1 && p.rightChar(0) == '^' {
1510					negate = true
1511					p.moveRight(1)
1512				}
1513
1514				nm := p.scanCapname() // snag the name
1515				if !scanOnly && p.useRE2() {
1516					// look up the name since these are valid for RE2
1517					// add the group based on the name
1518					if ok := cc.addNamedASCII(nm, negate); !ok {
1519						return nil, p.getErr(ErrInvalidCharRange)
1520					}
1521				}
1522				if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
1523					p.textto(savePos)
1524				} else if p.useRE2() {
1525					// move on
1526					continue
1527				}
1528			}
1529		}
1530
1531		if inRange {
1532			inRange = false
1533			if !scanOnly {
1534				if ch == '[' && !fTranslatedChar && !firstChar {
1535					// We thought we were in a range, but we're actually starting a subtraction.
1536					// In that case, we'll add chPrev to our char class, skip the opening [, and
1537					// scan the new character class recursively.
1538					cc.addChar(chPrev)
1539					sub, err := p.scanCharSet(caseInsensitive, false)
1540					if err != nil {
1541						return nil, err
1542					}
1543					cc.addSubtraction(sub)
1544
1545					if p.charsRight() > 0 && p.rightChar(0) != ']' {
1546						return nil, p.getErr(ErrSubtractionMustBeLast)
1547					}
1548				} else {
1549					// a regular range, like a-z
1550					if chPrev > ch {
1551						return nil, p.getErr(ErrReversedCharRange)
1552					}
1553					cc.addRange(chPrev, ch)
1554				}
1555			}
1556		} else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
1557			// this could be the start of a range
1558			chPrev = ch
1559			inRange = true
1560			p.moveRight(1)
1561		} else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
1562			// we aren't in a range, and now there is a subtraction.  Usually this happens
1563			// only when a subtraction follows a range, like [a-z-[b]]
1564			if !scanOnly {
1565				p.moveRight(1)
1566				sub, err := p.scanCharSet(caseInsensitive, false)
1567				if err != nil {
1568					return nil, err
1569				}
1570				cc.addSubtraction(sub)
1571
1572				if p.charsRight() > 0 && p.rightChar(0) != ']' {
1573					return nil, p.getErr(ErrSubtractionMustBeLast)
1574				}
1575			} else {
1576				p.moveRight(1)
1577				p.scanCharSet(caseInsensitive, true)
1578			}
1579		} else {
1580			if !scanOnly {
1581				cc.addRange(ch, ch)
1582			}
1583		}
1584	}
1585
1586	if !closed {
1587		return nil, p.getErr(ErrUnterminatedBracket)
1588	}
1589
1590	if !scanOnly && caseInsensitive {
1591		cc.addLowercase()
1592	}
1593
1594	return cc, nil
1595}
1596
1597// Scans any number of decimal digits (pegs value at 2^31-1 if too large)
1598func (p *parser) scanDecimal() (int, error) {
1599	i := 0
1600	var d int
1601
1602	for p.charsRight() > 0 {
1603		d = int(p.rightChar(0) - '0')
1604		if d < 0 || d > 9 {
1605			break
1606		}
1607		p.moveRight(1)
1608
1609		if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) {
1610			return 0, p.getErr(ErrCaptureGroupOutOfRange)
1611		}
1612
1613		i *= 10
1614		i += d
1615	}
1616
1617	return int(i), nil
1618}
1619
1620// Returns true for options allowed only at the top level
1621func isOnlyTopOption(option RegexOptions) bool {
1622	return option == RightToLeft || option == ECMAScript || option == RE2
1623}
1624
1625// Scans cimsx-cimsx option string, stops at the first unrecognized char.
1626func (p *parser) scanOptions() {
1627
1628	for off := false; p.charsRight() > 0; p.moveRight(1) {
1629		ch := p.rightChar(0)
1630
1631		if ch == '-' {
1632			off = true
1633		} else if ch == '+' {
1634			off = false
1635		} else {
1636			option := optionFromCode(ch)
1637			if option == 0 || isOnlyTopOption(option) {
1638				return
1639			}
1640
1641			if off {
1642				p.options &= ^option
1643			} else {
1644				p.options |= option
1645			}
1646		}
1647	}
1648}
1649
1650// Scans \ code for escape codes that map to single unicode chars.
1651func (p *parser) scanCharEscape() (rune, error) {
1652
1653	ch := p.moveRightGetChar()
1654
1655	if ch >= '0' && ch <= '7' {
1656		p.moveLeft()
1657		return p.scanOctal(), nil
1658	}
1659
1660	switch ch {
1661	case 'x':
1662		// support for \x{HEX} syntax from Perl and PCRE
1663		if p.charsRight() > 0 && p.rightChar(0) == '{' {
1664			p.moveRight(1)
1665			return p.scanHexUntilBrace()
1666		}
1667		return p.scanHex(2)
1668	case 'u':
1669		return p.scanHex(4)
1670	case 'a':
1671		return '\u0007', nil
1672	case 'b':
1673		return '\b', nil
1674	case 'e':
1675		return '\u001B', nil
1676	case 'f':
1677		return '\f', nil
1678	case 'n':
1679		return '\n', nil
1680	case 'r':
1681		return '\r', nil
1682	case 't':
1683		return '\t', nil
1684	case 'v':
1685		return '\u000B', nil
1686	case 'c':
1687		return p.scanControl()
1688	default:
1689		if !p.useOptionE() && IsWordChar(ch) {
1690			return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
1691		}
1692		return ch, nil
1693	}
1694}
1695
1696// Grabs and converts an ascii control character
1697func (p *parser) scanControl() (rune, error) {
1698	if p.charsRight() <= 0 {
1699		return 0, p.getErr(ErrMissingControl)
1700	}
1701
1702	ch := p.moveRightGetChar()
1703
1704	// \ca interpreted as \cA
1705
1706	if ch >= 'a' && ch <= 'z' {
1707		ch = (ch - ('a' - 'A'))
1708	}
1709	ch = (ch - '@')
1710	if ch >= 0 && ch < ' ' {
1711		return ch, nil
1712	}
1713
1714	return 0, p.getErr(ErrUnrecognizedControl)
1715
1716}
1717
1718// Scan hex digits until we hit a closing brace.
1719// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors
1720func (p *parser) scanHexUntilBrace() (rune, error) {
1721	// PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit
1722	// so we can enforce that
1723	i := 0
1724	hasContent := false
1725
1726	for p.charsRight() > 0 {
1727		ch := p.moveRightGetChar()
1728		if ch == '}' {
1729			// hit our close brace, we're done here
1730			// prevent \x{}
1731			if !hasContent {
1732				return 0, p.getErr(ErrTooFewHex)
1733			}
1734			return rune(i), nil
1735		}
1736		hasContent = true
1737		// no brace needs to be hex digit
1738		d := hexDigit(ch)
1739		if d < 0 {
1740			return 0, p.getErr(ErrMissingBrace)
1741		}
1742
1743		i *= 0x10
1744		i += d
1745
1746		if i > unicode.MaxRune {
1747			return 0, p.getErr(ErrInvalidHex)
1748		}
1749	}
1750
1751	// we only make it here if we run out of digits without finding the brace
1752	return 0, p.getErr(ErrMissingBrace)
1753}
1754
1755// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
1756func (p *parser) scanHex(c int) (rune, error) {
1757
1758	i := 0
1759
1760	if p.charsRight() >= c {
1761		for c > 0 {
1762			d := hexDigit(p.moveRightGetChar())
1763			if d < 0 {
1764				break
1765			}
1766			i *= 0x10
1767			i += d
1768			c--
1769		}
1770	}
1771
1772	if c > 0 {
1773		return 0, p.getErr(ErrTooFewHex)
1774	}
1775
1776	return rune(i), nil
1777}
1778
1779// Returns n <= 0xF for a hex digit.
1780func hexDigit(ch rune) int {
1781
1782	if d := uint(ch - '0'); d <= 9 {
1783		return int(d)
1784	}
1785
1786	if d := uint(ch - 'a'); d <= 5 {
1787		return int(d + 0xa)
1788	}
1789
1790	if d := uint(ch - 'A'); d <= 5 {
1791		return int(d + 0xa)
1792	}
1793
1794	return -1
1795}
1796
1797// Scans up to three octal digits (stops before exceeding 0377).
1798func (p *parser) scanOctal() rune {
1799	// Consume octal chars only up to 3 digits and value 0377
1800
1801	c := 3
1802
1803	if c > p.charsRight() {
1804		c = p.charsRight()
1805	}
1806
1807	//we know the first char is good because the caller had to check
1808	i := 0
1809	d := int(p.rightChar(0) - '0')
1810	for c > 0 && d <= 7 {
1811		i *= 8
1812		i += d
1813		if p.useOptionE() && i >= 0x20 {
1814			break
1815		}
1816		c--
1817
1818		p.moveRight(1)
1819		if !p.rightMost() {
1820			d = int(p.rightChar(0) - '0')
1821		}
1822	}
1823
1824	// Octal codes only go up to 255.  Any larger and the behavior that Perl follows
1825	// is simply to truncate the high bits.
1826	i &= 0xFF
1827
1828	return rune(i)
1829}
1830
1831// Returns the current parsing position.
1832func (p *parser) textpos() int {
1833	return p.currentPos
1834}
1835
1836// Zaps to a specific parsing position.
1837func (p *parser) textto(pos int) {
1838	p.currentPos = pos
1839}
1840
1841// Returns the char at the right of the current parsing position and advances to the right.
1842func (p *parser) moveRightGetChar() rune {
1843	ch := p.pattern[p.currentPos]
1844	p.currentPos++
1845	return ch
1846}
1847
1848// Moves the current position to the right.
1849func (p *parser) moveRight(i int) {
1850	// default would be 1
1851	p.currentPos += i
1852}
1853
1854// Moves the current parsing position one to the left.
1855func (p *parser) moveLeft() {
1856	p.currentPos--
1857}
1858
1859// Returns the char left of the current parsing position.
1860func (p *parser) charAt(i int) rune {
1861	return p.pattern[i]
1862}
1863
1864// Returns the char i chars right of the current parsing position.
1865func (p *parser) rightChar(i int) rune {
1866	// default would be 0
1867	return p.pattern[p.currentPos+i]
1868}
1869
1870// Number of characters to the right of the current parsing position.
1871func (p *parser) charsRight() int {
1872	return len(p.pattern) - p.currentPos
1873}
1874
1875func (p *parser) rightMost() bool {
1876	return p.currentPos == len(p.pattern)
1877}
1878
1879// Looks up the slot number for a given name
1880func (p *parser) captureSlotFromName(capname string) int {
1881	return p.capnames[capname]
1882}
1883
1884// True if the capture slot was noted
1885func (p *parser) isCaptureSlot(i int) bool {
1886	if p.caps != nil {
1887		_, ok := p.caps[i]
1888		return ok
1889	}
1890
1891	return (i >= 0 && i < p.capsize)
1892}
1893
1894// Looks up the slot number for a given name
1895func (p *parser) isCaptureName(capname string) bool {
1896	if p.capnames == nil {
1897		return false
1898	}
1899
1900	_, ok := p.capnames[capname]
1901	return ok
1902}
1903
1904// option shortcuts
1905
1906// True if N option disabling '(' autocapture is on.
1907func (p *parser) useOptionN() bool {
1908	return (p.options & ExplicitCapture) != 0
1909}
1910
1911// True if I option enabling case-insensitivity is on.
1912func (p *parser) useOptionI() bool {
1913	return (p.options & IgnoreCase) != 0
1914}
1915
1916// True if M option altering meaning of $ and ^ is on.
1917func (p *parser) useOptionM() bool {
1918	return (p.options & Multiline) != 0
1919}
1920
1921// True if S option altering meaning of . is on.
1922func (p *parser) useOptionS() bool {
1923	return (p.options & Singleline) != 0
1924}
1925
1926// True if X option enabling whitespace/comment mode is on.
1927func (p *parser) useOptionX() bool {
1928	return (p.options & IgnorePatternWhitespace) != 0
1929}
1930
1931// True if E option enabling ECMAScript behavior on.
1932func (p *parser) useOptionE() bool {
1933	return (p.options & ECMAScript) != 0
1934}
1935
1936// true to use RE2 compatibility parsing behavior.
1937func (p *parser) useRE2() bool {
1938	return (p.options & RE2) != 0
1939}
1940
1941// True if options stack is empty.
1942func (p *parser) emptyOptionsStack() bool {
1943	return len(p.optionsStack) == 0
1944}
1945
1946// Finish the current quantifiable (when a quantifier is not found or is not possible)
1947func (p *parser) addConcatenate() {
1948	// The first (| inside a Testgroup group goes directly to the group
1949	p.concatenation.addChild(p.unit)
1950	p.unit = nil
1951}
1952
1953// Finish the current quantifiable (when a quantifier is found)
1954func (p *parser) addConcatenate3(lazy bool, min, max int) {
1955	p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
1956	p.unit = nil
1957}
1958
1959// Sets the current unit to a single char node
1960func (p *parser) addUnitOne(ch rune) {
1961	if p.useOptionI() {
1962		ch = unicode.ToLower(ch)
1963	}
1964
1965	p.unit = newRegexNodeCh(ntOne, p.options, ch)
1966}
1967
1968// Sets the current unit to a single inverse-char node
1969func (p *parser) addUnitNotone(ch rune) {
1970	if p.useOptionI() {
1971		ch = unicode.ToLower(ch)
1972	}
1973
1974	p.unit = newRegexNodeCh(ntNotone, p.options, ch)
1975}
1976
1977// Sets the current unit to a single set node
1978func (p *parser) addUnitSet(set *CharSet) {
1979	p.unit = newRegexNodeSet(ntSet, p.options, set)
1980}
1981
1982// Sets the current unit to a subtree
1983func (p *parser) addUnitNode(node *regexNode) {
1984	p.unit = node
1985}
1986
1987// Sets the current unit to an assertion of the specified type
1988func (p *parser) addUnitType(t nodeType) {
1989	p.unit = newRegexNode(t, p.options)
1990}
1991
1992// Finish the current group (in response to a ')' or end)
1993func (p *parser) addGroup() error {
1994	if p.group.t == ntTestgroup || p.group.t == ntTestref {
1995		p.group.addChild(p.concatenation.reverseLeft())
1996		if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 {
1997			return p.getErr(ErrTooManyAlternates)
1998		}
1999	} else {
2000		p.alternation.addChild(p.concatenation.reverseLeft())
2001		p.group.addChild(p.alternation)
2002	}
2003
2004	p.unit = p.group
2005	return nil
2006}
2007
2008// Pops the option stack, but keeps the current options unchanged.
2009func (p *parser) popKeepOptions() {
2010	lastIdx := len(p.optionsStack) - 1
2011	p.optionsStack = p.optionsStack[:lastIdx]
2012}
2013
2014// Recalls options from the stack.
2015func (p *parser) popOptions() {
2016	lastIdx := len(p.optionsStack) - 1
2017	// get the last item on the stack and then remove it by reslicing
2018	p.options = p.optionsStack[lastIdx]
2019	p.optionsStack = p.optionsStack[:lastIdx]
2020}
2021
2022// Saves options on a stack.
2023func (p *parser) pushOptions() {
2024	p.optionsStack = append(p.optionsStack, p.options)
2025}
2026
2027// Add a string to the last concatenate.
2028func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
2029	var node *regexNode
2030
2031	if cch == 0 {
2032		return
2033	}
2034
2035	if cch > 1 {
2036		str := p.pattern[pos : pos+cch]
2037
2038		if p.useOptionI() && !isReplacement {
2039			// We do the ToLower character by character for consistency.  With surrogate chars, doing
2040			// a ToLower on the entire string could actually change the surrogate pair.  This is more correct
2041			// linguistically, but since Regex doesn't support surrogates, it's more important to be
2042			// consistent.
2043			for i := 0; i < len(str); i++ {
2044				str[i] = unicode.ToLower(str[i])
2045			}
2046		}
2047
2048		node = newRegexNodeStr(ntMulti, p.options, str)
2049	} else {
2050		ch := p.charAt(pos)
2051
2052		if p.useOptionI() && !isReplacement {
2053			ch = unicode.ToLower(ch)
2054		}
2055
2056		node = newRegexNodeCh(ntOne, p.options, ch)
2057	}
2058
2059	p.concatenation.addChild(node)
2060}
2061
2062// Push the parser state (in response to an open paren)
2063func (p *parser) pushGroup() {
2064	p.group.next = p.stack
2065	p.alternation.next = p.group
2066	p.concatenation.next = p.alternation
2067	p.stack = p.concatenation
2068}
2069
2070// Remember the pushed state (in response to a ')')
2071func (p *parser) popGroup() error {
2072	p.concatenation = p.stack
2073	p.alternation = p.concatenation.next
2074	p.group = p.alternation.next
2075	p.stack = p.group.next
2076
2077	// The first () inside a Testgroup group goes directly to the group
2078	if p.group.t == ntTestgroup && len(p.group.children) == 0 {
2079		if p.unit == nil {
2080			return p.getErr(ErrConditionalExpression)
2081		}
2082
2083		p.group.addChild(p.unit)
2084		p.unit = nil
2085	}
2086	return nil
2087}
2088
2089// True if the group stack is empty.
2090func (p *parser) emptyStack() bool {
2091	return p.stack == nil
2092}
2093
2094// Start a new round for the parser state (in response to an open paren or string start)
2095func (p *parser) startGroup(openGroup *regexNode) {
2096	p.group = openGroup
2097	p.alternation = newRegexNode(ntAlternate, p.options)
2098	p.concatenation = newRegexNode(ntConcatenate, p.options)
2099}
2100
2101// Finish the current concatenation (in response to a |)
2102func (p *parser) addAlternate() {
2103	// The | parts inside a Testgroup group go directly to the group
2104
2105	if p.group.t == ntTestgroup || p.group.t == ntTestref {
2106		p.group.addChild(p.concatenation.reverseLeft())
2107	} else {
2108		p.alternation.addChild(p.concatenation.reverseLeft())
2109	}
2110
2111	p.concatenation = newRegexNode(ntConcatenate, p.options)
2112}
2113
2114// For categorizing ascii characters.
2115
2116const (
2117	Q byte = 5 // quantifier
2118	S      = 4 // ordinary stopper
2119	Z      = 3 // ScanBlank stopper
2120	X      = 2 // whitespace
2121	E      = 1 // should be escaped
2122)
2123
2124var _category = []byte{
2125	//01  2  3  4  5  6  7  8  9  A  B  C  D  E  F  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
2126	0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2127	// !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /  0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
2128	X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
2129	//@A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
2130	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
2131	//'a  b  c  d  e  f  g  h  i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~
2132	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
2133}
2134
2135func isSpace(ch rune) bool {
2136	return (ch <= ' ' && _category[ch] == X)
2137}
2138
2139// Returns true for those characters that terminate a string of ordinary chars.
2140func isSpecial(ch rune) bool {
2141	return (ch <= '|' && _category[ch] >= S)
2142}
2143
2144// Returns true for those characters that terminate a string of ordinary chars.
2145func isStopperX(ch rune) bool {
2146	return (ch <= '|' && _category[ch] >= X)
2147}
2148
2149// Returns true for those characters that begin a quantifier.
2150func isQuantifier(ch rune) bool {
2151	return (ch <= '{' && _category[ch] >= Q)
2152}
2153
2154func (p *parser) isTrueQuantifier() bool {
2155	nChars := p.charsRight()
2156	if nChars == 0 {
2157		return false
2158	}
2159
2160	startpos := p.textpos()
2161	ch := p.charAt(startpos)
2162	if ch != '{' {
2163		return ch <= '{' && _category[ch] >= Q
2164	}
2165
2166	//UGLY: this is ugly -- the original code was ugly too
2167	pos := startpos
2168	for {
2169		nChars--
2170		if nChars <= 0 {
2171			break
2172		}
2173		pos++
2174		ch = p.charAt(pos)
2175		if ch < '0' || ch > '9' {
2176			break
2177		}
2178	}
2179
2180	if nChars == 0 || pos-startpos == 1 {
2181		return false
2182	}
2183	if ch == '}' {
2184		return true
2185	}
2186	if ch != ',' {
2187		return false
2188	}
2189	for {
2190		nChars--
2191		if nChars <= 0 {
2192			break
2193		}
2194		pos++
2195		ch = p.charAt(pos)
2196		if ch < '0' || ch > '9' {
2197			break
2198		}
2199	}
2200
2201	return nChars > 0 && ch == '}'
2202}