parser.go

   1package syntax
   2
   3import (
   4	"fmt"
   5	"math"
   6	"os"
   7	"sort"
   8	"strconv"
   9	"unicode"
  10)
  11
  12type RegexOptions int32
  13
  14const (
  15	IgnoreCase              RegexOptions = 0x0001 // "i"
  16	Multiline                            = 0x0002 // "m"
  17	ExplicitCapture                      = 0x0004 // "n"
  18	Compiled                             = 0x0008 // "c"
  19	Singleline                           = 0x0010 // "s"
  20	IgnorePatternWhitespace              = 0x0020 // "x"
  21	RightToLeft                          = 0x0040 // "r"
  22	Debug                                = 0x0080 // "d"
  23	ECMAScript                           = 0x0100 // "e"
  24	RE2                                  = 0x0200 // RE2 compat mode
  25	Unicode                              = 0x0400 // "u"
  26)
  27
  28func optionFromCode(ch rune) RegexOptions {
  29	// case-insensitive
  30	switch ch {
  31	case 'i', 'I':
  32		return IgnoreCase
  33	case 'r', 'R':
  34		return RightToLeft
  35	case 'm', 'M':
  36		return Multiline
  37	case 'n', 'N':
  38		return ExplicitCapture
  39	case 's', 'S':
  40		return Singleline
  41	case 'x', 'X':
  42		return IgnorePatternWhitespace
  43	case 'd', 'D':
  44		return Debug
  45	case 'e', 'E':
  46		return ECMAScript
  47	case 'u', 'U':
  48		return Unicode
  49	default:
  50		return 0
  51	}
  52}
  53
  54// An Error describes a failure to parse a regular expression
  55// and gives the offending expression.
  56type Error struct {
  57	Code ErrorCode
  58	Expr string
  59	Args []interface{}
  60}
  61
  62func (e *Error) Error() string {
  63	if len(e.Args) == 0 {
  64		return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`"
  65	}
  66	return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`"
  67}
  68
  69// An ErrorCode describes a failure to parse a regular expression.
  70type ErrorCode string
  71
  72const (
  73	// internal issue
  74	ErrInternalError ErrorCode = "regexp/syntax: internal error"
  75	// Parser errors
  76	ErrUnterminatedComment        = "unterminated comment"
  77	ErrInvalidCharRange           = "invalid character class range"
  78	ErrInvalidRepeatSize          = "invalid repeat count"
  79	ErrInvalidUTF8                = "invalid UTF-8"
  80	ErrCaptureGroupOutOfRange     = "capture group number out of range"
  81	ErrUnexpectedParen            = "unexpected )"
  82	ErrMissingParen               = "missing closing )"
  83	ErrMissingBrace               = "missing closing }"
  84	ErrInvalidRepeatOp            = "invalid nested repetition operator"
  85	ErrMissingRepeatArgument      = "missing argument to repetition operator"
  86	ErrConditionalExpression      = "illegal conditional (?(...)) expression"
  87	ErrTooManyAlternates          = "too many | in (?()|)"
  88	ErrUnrecognizedGrouping       = "unrecognized grouping construct: (%v"
  89	ErrInvalidGroupName           = "invalid group name: group names must begin with a word character and have a matching terminator"
  90	ErrCapNumNotZero              = "capture number cannot be zero"
  91	ErrUndefinedBackRef           = "reference to undefined group number %v"
  92	ErrUndefinedNameRef           = "reference to undefined group name %v"
  93	ErrAlternationCantCapture     = "alternation conditions do not capture and cannot be named"
  94	ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
  95	ErrMalformedReference         = "(?(%v) ) malformed"
  96	ErrUndefinedReference         = "(?(%v) ) reference to undefined group"
  97	ErrIllegalEndEscape           = "illegal \\ at end of pattern"
  98	ErrMalformedSlashP            = "malformed \\p{X} character escape"
  99	ErrIncompleteSlashP           = "incomplete \\p{X} character escape"
 100	ErrUnknownSlashP              = "unknown unicode category, script, or property '%v'"
 101	ErrUnrecognizedEscape         = "unrecognized escape sequence \\%v"
 102	ErrMissingControl             = "missing control character"
 103	ErrUnrecognizedControl        = "unrecognized control character"
 104	ErrTooFewHex                  = "insufficient hexadecimal digits"
 105	ErrInvalidHex                 = "hex values may not be larger than 0x10FFFF"
 106	ErrMalformedNameRef           = "malformed \\k<...> named back reference"
 107	ErrBadClassInCharRange        = "cannot include class \\%v in character range"
 108	ErrUnterminatedBracket        = "unterminated [] set"
 109	ErrSubtractionMustBeLast      = "a subtraction must be the last element in a character class"
 110	ErrReversedCharRange          = "[%c-%c] range in reverse order"
 111)
 112
 113func (e ErrorCode) String() string {
 114	return string(e)
 115}
 116
 117type parser struct {
 118	stack         *regexNode
 119	group         *regexNode
 120	alternation   *regexNode
 121	concatenation *regexNode
 122	unit          *regexNode
 123
 124	patternRaw string
 125	pattern    []rune
 126
 127	currentPos  int
 128	specialCase *unicode.SpecialCase
 129
 130	autocap  int
 131	capcount int
 132	captop   int
 133	capsize  int
 134
 135	caps     map[int]int
 136	capnames map[string]int
 137
 138	capnumlist  []int
 139	capnamelist []string
 140
 141	options         RegexOptions
 142	optionsStack    []RegexOptions
 143	ignoreNextParen bool
 144}
 145
 146const (
 147	maxValueDiv10 int = math.MaxInt32 / 10
 148	maxValueMod10     = math.MaxInt32 % 10
 149)
 150
 151// Parse converts a regex string into a parse tree
 152func Parse(re string, op RegexOptions) (*RegexTree, error) {
 153	p := parser{
 154		options: op,
 155		caps:    make(map[int]int),
 156	}
 157	p.setPattern(re)
 158
 159	if err := p.countCaptures(); err != nil {
 160		return nil, err
 161	}
 162
 163	p.reset(op)
 164	root, err := p.scanRegex()
 165
 166	if err != nil {
 167		return nil, err
 168	}
 169	tree := &RegexTree{
 170		root:       root,
 171		caps:       p.caps,
 172		capnumlist: p.capnumlist,
 173		captop:     p.captop,
 174		Capnames:   p.capnames,
 175		Caplist:    p.capnamelist,
 176		options:    op,
 177	}
 178
 179	if tree.options&Debug > 0 {
 180		os.Stdout.WriteString(tree.Dump())
 181	}
 182
 183	return tree, nil
 184}
 185
 186func (p *parser) setPattern(pattern string) {
 187	p.patternRaw = pattern
 188	p.pattern = make([]rune, 0, len(pattern))
 189
 190	//populate our rune array to handle utf8 encoding
 191	for _, r := range pattern {
 192		p.pattern = append(p.pattern, r)
 193	}
 194}
 195func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
 196	return &Error{Code: code, Expr: p.patternRaw, Args: args}
 197}
 198
 199func (p *parser) noteCaptureSlot(i, pos int) {
 200	if _, ok := p.caps[i]; !ok {
 201		// the rhs of the hashtable isn't used in the parser
 202		p.caps[i] = pos
 203		p.capcount++
 204
 205		if p.captop <= i {
 206			if i == math.MaxInt32 {
 207				p.captop = i
 208			} else {
 209				p.captop = i + 1
 210			}
 211		}
 212	}
 213}
 214
 215func (p *parser) noteCaptureName(name string, pos int) {
 216	if p.capnames == nil {
 217		p.capnames = make(map[string]int)
 218	}
 219
 220	if _, ok := p.capnames[name]; !ok {
 221		p.capnames[name] = pos
 222		p.capnamelist = append(p.capnamelist, name)
 223	}
 224}
 225
 226func (p *parser) assignNameSlots() {
 227	if p.capnames != nil {
 228		for _, name := range p.capnamelist {
 229			for p.isCaptureSlot(p.autocap) {
 230				p.autocap++
 231			}
 232			pos := p.capnames[name]
 233			p.capnames[name] = p.autocap
 234			p.noteCaptureSlot(p.autocap, pos)
 235
 236			p.autocap++
 237		}
 238	}
 239
 240	// if the caps array has at least one gap, construct the list of used slots
 241	if p.capcount < p.captop {
 242		p.capnumlist = make([]int, p.capcount)
 243		i := 0
 244
 245		for k := range p.caps {
 246			p.capnumlist[i] = k
 247			i++
 248		}
 249
 250		sort.Ints(p.capnumlist)
 251	}
 252
 253	// merge capsnumlist into capnamelist
 254	if p.capnames != nil || p.capnumlist != nil {
 255		var oldcapnamelist []string
 256		var next int
 257		var k int
 258
 259		if p.capnames == nil {
 260			oldcapnamelist = nil
 261			p.capnames = make(map[string]int)
 262			p.capnamelist = []string{}
 263			next = -1
 264		} else {
 265			oldcapnamelist = p.capnamelist
 266			p.capnamelist = []string{}
 267			next = p.capnames[oldcapnamelist[0]]
 268		}
 269
 270		for i := 0; i < p.capcount; i++ {
 271			j := i
 272			if p.capnumlist != nil {
 273				j = p.capnumlist[i]
 274			}
 275
 276			if next == j {
 277				p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
 278				k++
 279
 280				if k == len(oldcapnamelist) {
 281					next = -1
 282				} else {
 283					next = p.capnames[oldcapnamelist[k]]
 284				}
 285
 286			} else {
 287				//feature: culture?
 288				str := strconv.Itoa(j)
 289				p.capnamelist = append(p.capnamelist, str)
 290				p.capnames[str] = j
 291			}
 292		}
 293	}
 294}
 295
 296func (p *parser) consumeAutocap() int {
 297	r := p.autocap
 298	p.autocap++
 299	return r
 300}
 301
 302// CountCaptures is a prescanner for deducing the slots used for
 303// captures by doing a partial tokenization of the pattern.
 304func (p *parser) countCaptures() error {
 305	var ch rune
 306
 307	p.noteCaptureSlot(0, 0)
 308
 309	p.autocap = 1
 310
 311	for p.charsRight() > 0 {
 312		pos := p.textpos()
 313		ch = p.moveRightGetChar()
 314		switch ch {
 315		case '\\':
 316			if p.charsRight() > 0 {
 317				p.scanBackslash(true)
 318			}
 319
 320		case '#':
 321			if p.useOptionX() {
 322				p.moveLeft()
 323				p.scanBlank()
 324			}
 325
 326		case '[':
 327			p.scanCharSet(false, true)
 328
 329		case ')':
 330			if !p.emptyOptionsStack() {
 331				p.popOptions()
 332			}
 333
 334		case '(':
 335			if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
 336				p.moveLeft()
 337				p.scanBlank()
 338			} else {
 339				p.pushOptions()
 340				if p.charsRight() > 0 && p.rightChar(0) == '?' {
 341					// we have (?...
 342					p.moveRight(1)
 343
 344					if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
 345						// named group: (?<... or (?'...
 346
 347						p.moveRight(1)
 348						ch = p.rightChar(0)
 349
 350						if ch != '0' && IsWordChar(ch) {
 351							if ch >= '1' && ch <= '9' {
 352								dec, err := p.scanDecimal()
 353								if err != nil {
 354									return err
 355								}
 356								p.noteCaptureSlot(dec, pos)
 357							} else {
 358								p.noteCaptureName(p.scanCapname(), pos)
 359							}
 360						}
 361					} else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
 362						// RE2-compat (?P<)
 363						p.moveRight(2)
 364						ch = p.rightChar(0)
 365						if IsWordChar(ch) {
 366							p.noteCaptureName(p.scanCapname(), pos)
 367						}
 368
 369					} else {
 370						// (?...
 371
 372						// get the options if it's an option construct (?cimsx-cimsx...)
 373						p.scanOptions()
 374
 375						if p.charsRight() > 0 {
 376							if p.rightChar(0) == ')' {
 377								// (?cimsx-cimsx)
 378								p.moveRight(1)
 379								p.popKeepOptions()
 380							} else if p.rightChar(0) == '(' {
 381								// alternation construct: (?(foo)yes|no)
 382								// ignore the next paren so we don't capture the condition
 383								p.ignoreNextParen = true
 384
 385								// break from here so we don't reset ignoreNextParen
 386								continue
 387							}
 388						}
 389					}
 390				} else {
 391					if !p.useOptionN() && !p.ignoreNextParen {
 392						p.noteCaptureSlot(p.consumeAutocap(), pos)
 393					}
 394				}
 395			}
 396
 397			p.ignoreNextParen = false
 398
 399		}
 400	}
 401
 402	p.assignNameSlots()
 403	return nil
 404}
 405
 406func (p *parser) reset(topopts RegexOptions) {
 407	p.currentPos = 0
 408	p.autocap = 1
 409	p.ignoreNextParen = false
 410
 411	if len(p.optionsStack) > 0 {
 412		p.optionsStack = p.optionsStack[:0]
 413	}
 414
 415	p.options = topopts
 416	p.stack = nil
 417}
 418
 419func (p *parser) scanRegex() (*regexNode, error) {
 420	ch := '@' // nonspecial ch, means at beginning
 421	isQuant := false
 422
 423	p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))
 424
 425	for p.charsRight() > 0 {
 426		wasPrevQuantifier := isQuant
 427		isQuant = false
 428
 429		if err := p.scanBlank(); err != nil {
 430			return nil, err
 431		}
 432
 433		startpos := p.textpos()
 434
 435		// move past all of the normal characters.  We'll stop when we hit some kind of control character,
 436		// or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
 437		if p.useOptionX() {
 438			for p.charsRight() > 0 {
 439				ch = p.rightChar(0)
 440				//UGLY: clean up, this is ugly
 441				if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
 442					break
 443				}
 444				p.moveRight(1)
 445			}
 446		} else {
 447			for p.charsRight() > 0 {
 448				ch = p.rightChar(0)
 449				if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
 450					break
 451				}
 452				p.moveRight(1)
 453			}
 454		}
 455
 456		endpos := p.textpos()
 457
 458		p.scanBlank()
 459
 460		if p.charsRight() == 0 {
 461			ch = '!' // nonspecial, means at end
 462		} else if ch = p.rightChar(0); isSpecial(ch) {
 463			isQuant = isQuantifier(ch)
 464			p.moveRight(1)
 465		} else {
 466			ch = ' ' // nonspecial, means at ordinary char
 467		}
 468
 469		if startpos < endpos {
 470			cchUnquantified := endpos - startpos
 471			if isQuant {
 472				cchUnquantified--
 473			}
 474			wasPrevQuantifier = false
 475
 476			if cchUnquantified > 0 {
 477				p.addToConcatenate(startpos, cchUnquantified, false)
 478			}
 479
 480			if isQuant {
 481				p.addUnitOne(p.charAt(endpos - 1))
 482			}
 483		}
 484
 485		switch ch {
 486		case '!':
 487			goto BreakOuterScan
 488
 489		case ' ':
 490			goto ContinueOuterScan
 491
 492		case '[':
 493			cc, err := p.scanCharSet(p.useOptionI(), false)
 494			if err != nil {
 495				return nil, err
 496			}
 497			p.addUnitSet(cc)
 498
 499		case '(':
 500			p.pushOptions()
 501
 502			if grouper, err := p.scanGroupOpen(); err != nil {
 503				return nil, err
 504			} else if grouper == nil {
 505				p.popKeepOptions()
 506			} else {
 507				p.pushGroup()
 508				p.startGroup(grouper)
 509			}
 510
 511			continue
 512
 513		case '|':
 514			p.addAlternate()
 515			goto ContinueOuterScan
 516
 517		case ')':
 518			if p.emptyStack() {
 519				return nil, p.getErr(ErrUnexpectedParen)
 520			}
 521
 522			if err := p.addGroup(); err != nil {
 523				return nil, err
 524			}
 525			if err := p.popGroup(); err != nil {
 526				return nil, err
 527			}
 528			p.popOptions()
 529
 530			if p.unit == nil {
 531				goto ContinueOuterScan
 532			}
 533
 534		case '\\':
 535			n, err := p.scanBackslash(false)
 536			if err != nil {
 537				return nil, err
 538			}
 539			p.addUnitNode(n)
 540
 541		case '^':
 542			if p.useOptionM() {
 543				p.addUnitType(ntBol)
 544			} else {
 545				p.addUnitType(ntBeginning)
 546			}
 547
 548		case '$':
 549			if p.useOptionM() {
 550				p.addUnitType(ntEol)
 551			} else {
 552				p.addUnitType(ntEndZ)
 553			}
 554
 555		case '.':
 556			if p.useOptionS() {
 557				p.addUnitSet(AnyClass())
 558			} else if p.useOptionE() {
 559				p.addUnitSet(ECMAAnyClass())
 560			} else {
 561				p.addUnitNotone('\n')
 562			}
 563
 564		case '{', '*', '+', '?':
 565			if p.unit == nil {
 566				if wasPrevQuantifier {
 567					return nil, p.getErr(ErrInvalidRepeatOp)
 568				} else {
 569					return nil, p.getErr(ErrMissingRepeatArgument)
 570				}
 571			}
 572			p.moveLeft()
 573
 574		default:
 575			return nil, p.getErr(ErrInternalError)
 576		}
 577
 578		if err := p.scanBlank(); err != nil {
 579			return nil, err
 580		}
 581
 582		if p.charsRight() > 0 {
 583			isQuant = p.isTrueQuantifier()
 584		}
 585		if p.charsRight() == 0 || !isQuant {
 586			//maintain odd C# assignment order -- not sure if required, could clean up?
 587			p.addConcatenate()
 588			goto ContinueOuterScan
 589		}
 590
 591		ch = p.moveRightGetChar()
 592
 593		// Handle quantifiers
 594		for p.unit != nil {
 595			var min, max int
 596			var lazy bool
 597
 598			switch ch {
 599			case '*':
 600				min = 0
 601				max = math.MaxInt32
 602
 603			case '?':
 604				min = 0
 605				max = 1
 606
 607			case '+':
 608				min = 1
 609				max = math.MaxInt32
 610
 611			case '{':
 612				{
 613					var err error
 614					startpos = p.textpos()
 615					if min, err = p.scanDecimal(); err != nil {
 616						return nil, err
 617					}
 618					max = min
 619					if startpos < p.textpos() {
 620						if p.charsRight() > 0 && p.rightChar(0) == ',' {
 621							p.moveRight(1)
 622							if p.charsRight() == 0 || p.rightChar(0) == '}' {
 623								max = math.MaxInt32
 624							} else {
 625								if max, err = p.scanDecimal(); err != nil {
 626									return nil, err
 627								}
 628							}
 629						}
 630					}
 631
 632					if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
 633						p.addConcatenate()
 634						p.textto(startpos - 1)
 635						goto ContinueOuterScan
 636					}
 637				}
 638
 639			default:
 640				return nil, p.getErr(ErrInternalError)
 641			}
 642
 643			if err := p.scanBlank(); err != nil {
 644				return nil, err
 645			}
 646
 647			if p.charsRight() == 0 || p.rightChar(0) != '?' {
 648				lazy = false
 649			} else {
 650				p.moveRight(1)
 651				lazy = true
 652			}
 653
 654			if min > max {
 655				return nil, p.getErr(ErrInvalidRepeatSize)
 656			}
 657
 658			p.addConcatenate3(lazy, min, max)
 659		}
 660
 661	ContinueOuterScan:
 662	}
 663
 664BreakOuterScan:
 665	;
 666
 667	if !p.emptyStack() {
 668		return nil, p.getErr(ErrMissingParen)
 669	}
 670
 671	if err := p.addGroup(); err != nil {
 672		return nil, err
 673	}
 674
 675	return p.unit, nil
 676
 677}
 678
 679/*
 680 * Simple parsing for replacement patterns
 681 */
 682func (p *parser) scanReplacement() (*regexNode, error) {
 683	var c, startpos int
 684
 685	p.concatenation = newRegexNode(ntConcatenate, p.options)
 686
 687	for {
 688		c = p.charsRight()
 689		if c == 0 {
 690			break
 691		}
 692
 693		startpos = p.textpos()
 694
 695		for c > 0 && p.rightChar(0) != '$' {
 696			p.moveRight(1)
 697			c--
 698		}
 699
 700		p.addToConcatenate(startpos, p.textpos()-startpos, true)
 701
 702		if c > 0 {
 703			if p.moveRightGetChar() == '$' {
 704				n, err := p.scanDollar()
 705				if err != nil {
 706					return nil, err
 707				}
 708				p.addUnitNode(n)
 709			}
 710			p.addConcatenate()
 711		}
 712	}
 713
 714	return p.concatenation, nil
 715}
 716
 717/*
 718 * Scans $ patterns recognized within replacement patterns
 719 */
 720func (p *parser) scanDollar() (*regexNode, error) {
 721	if p.charsRight() == 0 {
 722		return newRegexNodeCh(ntOne, p.options, '$'), nil
 723	}
 724
 725	ch := p.rightChar(0)
 726	angled := false
 727	backpos := p.textpos()
 728	lastEndPos := backpos
 729
 730	// Note angle
 731
 732	if ch == '{' && p.charsRight() > 1 {
 733		angled = true
 734		p.moveRight(1)
 735		ch = p.rightChar(0)
 736	}
 737
 738	// Try to parse backreference: \1 or \{1} or \{cap}
 739
 740	if ch >= '0' && ch <= '9' {
 741		if !angled && p.useOptionE() {
 742			capnum := -1
 743			newcapnum := int(ch - '0')
 744			p.moveRight(1)
 745			if p.isCaptureSlot(newcapnum) {
 746				capnum = newcapnum
 747				lastEndPos = p.textpos()
 748			}
 749
 750			for p.charsRight() > 0 {
 751				ch = p.rightChar(0)
 752				if ch < '0' || ch > '9' {
 753					break
 754				}
 755				digit := int(ch - '0')
 756				if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
 757					return nil, p.getErr(ErrCaptureGroupOutOfRange)
 758				}
 759
 760				newcapnum = newcapnum*10 + digit
 761
 762				p.moveRight(1)
 763				if p.isCaptureSlot(newcapnum) {
 764					capnum = newcapnum
 765					lastEndPos = p.textpos()
 766				}
 767			}
 768			p.textto(lastEndPos)
 769			if capnum >= 0 {
 770				return newRegexNodeM(ntRef, p.options, capnum), nil
 771			}
 772		} else {
 773			capnum, err := p.scanDecimal()
 774			if err != nil {
 775				return nil, err
 776			}
 777			if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
 778				if p.isCaptureSlot(capnum) {
 779					return newRegexNodeM(ntRef, p.options, capnum), nil
 780				}
 781			}
 782		}
 783	} else if angled && IsWordChar(ch) {
 784		capname := p.scanCapname()
 785
 786		if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
 787			if p.isCaptureName(capname) {
 788				return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
 789			}
 790		}
 791	} else if !angled {
 792		capnum := 1
 793
 794		switch ch {
 795		case '$':
 796			p.moveRight(1)
 797			return newRegexNodeCh(ntOne, p.options, '$'), nil
 798		case '&':
 799			capnum = 0
 800		case '`':
 801			capnum = replaceLeftPortion
 802		case '\'':
 803			capnum = replaceRightPortion
 804		case '+':
 805			capnum = replaceLastGroup
 806		case '_':
 807			capnum = replaceWholeString
 808		}
 809
 810		if capnum != 1 {
 811			p.moveRight(1)
 812			return newRegexNodeM(ntRef, p.options, capnum), nil
 813		}
 814	}
 815
 816	// unrecognized $: literalize
 817
 818	p.textto(backpos)
 819	return newRegexNodeCh(ntOne, p.options, '$'), nil
 820}
 821
 822// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
 823// a RegexNode for the type of group scanned, or nil if the group
 824// simply changed options (?cimsx-cimsx) or was a comment (#...).
 825func (p *parser) scanGroupOpen() (*regexNode, error) {
 826	var ch rune
 827	var nt nodeType
 828	var err error
 829	close := '>'
 830	start := p.textpos()
 831
 832	// just return a RegexNode if we have:
 833	// 1. "(" followed by nothing
 834	// 2. "(x" where x != ?
 835	// 3. "(?)"
 836	if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
 837		if p.useOptionN() || p.ignoreNextParen {
 838			p.ignoreNextParen = false
 839			return newRegexNode(ntGroup, p.options), nil
 840		}
 841		return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
 842	}
 843
 844	p.moveRight(1)
 845
 846	for {
 847		if p.charsRight() == 0 {
 848			break
 849		}
 850
 851		switch ch = p.moveRightGetChar(); ch {
 852		case ':':
 853			nt = ntGroup
 854
 855		case '=':
 856			p.options &= ^RightToLeft
 857			nt = ntRequire
 858
 859		case '!':
 860			p.options &= ^RightToLeft
 861			nt = ntPrevent
 862
 863		case '>':
 864			nt = ntGreedy
 865
 866		case '\'':
 867			close = '\''
 868			fallthrough
 869
 870		case '<':
 871			if p.charsRight() == 0 {
 872				goto BreakRecognize
 873			}
 874
 875			switch ch = p.moveRightGetChar(); ch {
 876			case '=':
 877				if close == '\'' {
 878					goto BreakRecognize
 879				}
 880
 881				p.options |= RightToLeft
 882				nt = ntRequire
 883
 884			case '!':
 885				if close == '\'' {
 886					goto BreakRecognize
 887				}
 888
 889				p.options |= RightToLeft
 890				nt = ntPrevent
 891
 892			default:
 893				p.moveLeft()
 894				capnum := -1
 895				uncapnum := -1
 896				proceed := false
 897
 898				// grab part before -
 899
 900				if ch >= '0' && ch <= '9' {
 901					if capnum, err = p.scanDecimal(); err != nil {
 902						return nil, err
 903					}
 904
 905					if !p.isCaptureSlot(capnum) {
 906						capnum = -1
 907					}
 908
 909					// check if we have bogus characters after the number
 910					if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
 911						return nil, p.getErr(ErrInvalidGroupName)
 912					}
 913					if capnum == 0 {
 914						return nil, p.getErr(ErrCapNumNotZero)
 915					}
 916				} else if IsWordChar(ch) {
 917					capname := p.scanCapname()
 918
 919					if p.isCaptureName(capname) {
 920						capnum = p.captureSlotFromName(capname)
 921					}
 922
 923					// check if we have bogus character after the name
 924					if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
 925						return nil, p.getErr(ErrInvalidGroupName)
 926					}
 927				} else if ch == '-' {
 928					proceed = true
 929				} else {
 930					// bad group name - starts with something other than a word character and isn't a number
 931					return nil, p.getErr(ErrInvalidGroupName)
 932				}
 933
 934				// grab part after - if any
 935
 936				if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
 937					p.moveRight(1)
 938
 939					//no more chars left, no closing char, etc
 940					if p.charsRight() == 0 {
 941						return nil, p.getErr(ErrInvalidGroupName)
 942					}
 943
 944					ch = p.rightChar(0)
 945					if ch >= '0' && ch <= '9' {
 946						if uncapnum, err = p.scanDecimal(); err != nil {
 947							return nil, err
 948						}
 949
 950						if !p.isCaptureSlot(uncapnum) {
 951							return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
 952						}
 953
 954						// check if we have bogus characters after the number
 955						if p.charsRight() > 0 && p.rightChar(0) != close {
 956							return nil, p.getErr(ErrInvalidGroupName)
 957						}
 958					} else if IsWordChar(ch) {
 959						uncapname := p.scanCapname()
 960
 961						if !p.isCaptureName(uncapname) {
 962							return nil, p.getErr(ErrUndefinedNameRef, uncapname)
 963						}
 964						uncapnum = p.captureSlotFromName(uncapname)
 965
 966						// check if we have bogus character after the name
 967						if p.charsRight() > 0 && p.rightChar(0) != close {
 968							return nil, p.getErr(ErrInvalidGroupName)
 969						}
 970					} else {
 971						// bad group name - starts with something other than a word character and isn't a number
 972						return nil, p.getErr(ErrInvalidGroupName)
 973					}
 974				}
 975
 976				// actually make the node
 977
 978				if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
 979					return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
 980				}
 981				goto BreakRecognize
 982			}
 983
 984		case '(':
 985			// alternation construct (?(...) | )
 986
 987			parenPos := p.textpos()
 988			if p.charsRight() > 0 {
 989				ch = p.rightChar(0)
 990
 991				// check if the alternation condition is a backref
 992				if ch >= '0' && ch <= '9' {
 993					var capnum int
 994					if capnum, err = p.scanDecimal(); err != nil {
 995						return nil, err
 996					}
 997					if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
 998						if p.isCaptureSlot(capnum) {
 999							return newRegexNodeM(ntTestref, p.options, capnum), nil
1000						}
1001						return nil, p.getErr(ErrUndefinedReference, capnum)
1002					}
1003
1004					return nil, p.getErr(ErrMalformedReference, capnum)
1005
1006				} else if IsWordChar(ch) {
1007					capname := p.scanCapname()
1008
1009					if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
1010						return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
1011					}
1012				}
1013			}
1014			// not a backref
1015			nt = ntTestgroup
1016			p.textto(parenPos - 1)   // jump to the start of the parentheses
1017			p.ignoreNextParen = true // but make sure we don't try to capture the insides
1018
1019			charsRight := p.charsRight()
1020			if charsRight >= 3 && p.rightChar(1) == '?' {
1021				rightchar2 := p.rightChar(2)
1022				// disallow comments in the condition
1023				if rightchar2 == '#' {
1024					return nil, p.getErr(ErrAlternationCantHaveComment)
1025				}
1026
1027				// disallow named capture group (?<..>..) in the condition
1028				if rightchar2 == '\'' {
1029					return nil, p.getErr(ErrAlternationCantCapture)
1030				}
1031
1032				if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
1033					return nil, p.getErr(ErrAlternationCantCapture)
1034				}
1035			}
1036
1037		case 'P':
1038			if p.useRE2() {
1039				// support for P<name> syntax
1040				if p.charsRight() < 3 {
1041					goto BreakRecognize
1042				}
1043
1044				ch = p.moveRightGetChar()
1045				if ch != '<' {
1046					goto BreakRecognize
1047				}
1048
1049				ch = p.moveRightGetChar()
1050				p.moveLeft()
1051
1052				if IsWordChar(ch) {
1053					capnum := -1
1054					capname := p.scanCapname()
1055
1056					if p.isCaptureName(capname) {
1057						capnum = p.captureSlotFromName(capname)
1058					}
1059
1060					// check if we have bogus character after the name
1061					if p.charsRight() > 0 && p.rightChar(0) != '>' {
1062						return nil, p.getErr(ErrInvalidGroupName)
1063					}
1064
1065					// actually make the node
1066
1067					if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
1068						return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
1069					}
1070					goto BreakRecognize
1071
1072				} else {
1073					// bad group name - starts with something other than a word character and isn't a number
1074					return nil, p.getErr(ErrInvalidGroupName)
1075				}
1076			}
1077			// if we're not using RE2 compat mode then
1078			// we just behave like normal
1079			fallthrough
1080
1081		default:
1082			p.moveLeft()
1083
1084			nt = ntGroup
1085			// disallow options in the children of a testgroup node
1086			if p.group.t != ntTestgroup {
1087				p.scanOptions()
1088			}
1089			if p.charsRight() == 0 {
1090				goto BreakRecognize
1091			}
1092
1093			if ch = p.moveRightGetChar(); ch == ')' {
1094				return nil, nil
1095			}
1096
1097			if ch != ':' {
1098				goto BreakRecognize
1099			}
1100
1101		}
1102
1103		return newRegexNode(nt, p.options), nil
1104	}
1105
1106BreakRecognize:
1107
1108	// break Recognize comes here
1109
1110	return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
1111}
1112
1113// scans backslash specials and basics
1114func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
1115
1116	if p.charsRight() == 0 {
1117		return nil, p.getErr(ErrIllegalEndEscape)
1118	}
1119
1120	switch ch := p.rightChar(0); ch {
1121	case 'b', 'B', 'A', 'G', 'Z', 'z':
1122		p.moveRight(1)
1123		return newRegexNode(p.typeFromCode(ch), p.options), nil
1124
1125	case 'w':
1126		p.moveRight(1)
1127		if p.useOptionE() || p.useRE2() {
1128			return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
1129		}
1130		return newRegexNodeSet(ntSet, p.options, WordClass()), nil
1131
1132	case 'W':
1133		p.moveRight(1)
1134		if p.useOptionE() || p.useRE2() {
1135			return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
1136		}
1137		return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
1138
1139	case 's':
1140		p.moveRight(1)
1141		if p.useOptionE() {
1142			return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
1143		} else if p.useRE2() {
1144			return newRegexNodeSet(ntSet, p.options, RE2SpaceClass()), nil
1145		}
1146		return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
1147
1148	case 'S':
1149		p.moveRight(1)
1150		if p.useOptionE() {
1151			return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
1152		} else if p.useRE2() {
1153			return newRegexNodeSet(ntSet, p.options, NotRE2SpaceClass()), nil
1154		}
1155		return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
1156
1157	case 'd':
1158		p.moveRight(1)
1159		if p.useOptionE() || p.useRE2() {
1160			return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
1161		}
1162		return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
1163
1164	case 'D':
1165		p.moveRight(1)
1166		if p.useOptionE() || p.useRE2() {
1167			return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
1168		}
1169		return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
1170
1171	case 'p', 'P':
1172		p.moveRight(1)
1173		prop, err := p.parseProperty()
1174		if err != nil {
1175			return nil, err
1176		}
1177		cc := &CharSet{}
1178		cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
1179		if p.useOptionI() {
1180			cc.addLowercase()
1181		}
1182
1183		return newRegexNodeSet(ntSet, p.options, cc), nil
1184
1185	default:
1186		return p.scanBasicBackslash(scanOnly)
1187	}
1188}
1189
1190// Scans \-style backreferences and character escapes
1191func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
1192	if p.charsRight() == 0 {
1193		return nil, p.getErr(ErrIllegalEndEscape)
1194	}
1195	angled := false
1196	k := false
1197	close := '\x00'
1198
1199	backpos := p.textpos()
1200	ch := p.rightChar(0)
1201
1202	// Allow \k<foo> instead of \<foo>, which is now deprecated.
1203
1204	// According to ECMAScript specification, \k<name> is only parsed as a named group reference if
1205	// there is at least one group name in the regexp.
1206	// See https://www.ecma-international.org/ecma-262/#sec-isvalidregularexpressionliteral, step 7.
1207	// Note, during the first (scanOnly) run we may not have all group names scanned, but that's ok.
1208	if ch == 'k' && (!p.useOptionE() || len(p.capnames) > 0) {
1209		if p.charsRight() >= 2 {
1210			p.moveRight(1)
1211			ch = p.moveRightGetChar()
1212
1213			if ch == '<' || (!p.useOptionE() && ch == '\'') { // No support for \k'name' in ECMAScript
1214				angled = true
1215				if ch == '\'' {
1216					close = '\''
1217				} else {
1218					close = '>'
1219				}
1220			}
1221		}
1222
1223		if !angled || p.charsRight() <= 0 {
1224			return nil, p.getErr(ErrMalformedNameRef)
1225		}
1226
1227		ch = p.rightChar(0)
1228		k = true
1229
1230	} else if !p.useOptionE() && (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
1231		angled = true
1232		if ch == '\'' {
1233			close = '\''
1234		} else {
1235			close = '>'
1236		}
1237
1238		p.moveRight(1)
1239		ch = p.rightChar(0)
1240	}
1241
1242	// Try to parse backreference: \<1> or \<cap>
1243
1244	if angled && ch >= '0' && ch <= '9' {
1245		capnum, err := p.scanDecimal()
1246		if err != nil {
1247			return nil, err
1248		}
1249
1250		if p.charsRight() > 0 && p.moveRightGetChar() == close {
1251			if p.isCaptureSlot(capnum) {
1252				return newRegexNodeM(ntRef, p.options, capnum), nil
1253			}
1254			return nil, p.getErr(ErrUndefinedBackRef, capnum)
1255		}
1256	} else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
1257		capnum, err := p.scanDecimal()
1258		if err != nil {
1259			return nil, err
1260		}
1261
1262		if scanOnly {
1263			return nil, nil
1264		}
1265
1266		if p.isCaptureSlot(capnum) {
1267			return newRegexNodeM(ntRef, p.options, capnum), nil
1268		}
1269		if capnum <= 9 && !p.useOptionE() {
1270			return nil, p.getErr(ErrUndefinedBackRef, capnum)
1271		}
1272
1273	} else if angled {
1274		capname := p.scanCapname()
1275
1276		if capname != "" && p.charsRight() > 0 && p.moveRightGetChar() == close {
1277
1278			if scanOnly {
1279				return nil, nil
1280			}
1281
1282			if p.isCaptureName(capname) {
1283				return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
1284			}
1285			return nil, p.getErr(ErrUndefinedNameRef, capname)
1286		} else {
1287			if k {
1288				return nil, p.getErr(ErrMalformedNameRef)
1289			}
1290		}
1291	}
1292
1293	// Not backreference: must be char code
1294
1295	p.textto(backpos)
1296	ch, err := p.scanCharEscape()
1297	if err != nil {
1298		return nil, err
1299	}
1300
1301	if scanOnly {
1302		return nil, nil
1303	}
1304
1305	if p.useOptionI() {
1306		ch = unicode.ToLower(ch)
1307	}
1308
1309	return newRegexNodeCh(ntOne, p.options, ch), nil
1310}
1311
1312// Scans X for \p{X} or \P{X}
1313func (p *parser) parseProperty() (string, error) {
1314	// RE2 and PCRE supports \pX syntax (no {} and only 1 letter unicode cats supported)
1315	// since this is purely additive syntax it's not behind a flag
1316	if p.charsRight() >= 1 && p.rightChar(0) != '{' {
1317		ch := string(p.moveRightGetChar())
1318		// check if it's a valid cat
1319		if !isValidUnicodeCat(ch) {
1320			return "", p.getErr(ErrUnknownSlashP, ch)
1321		}
1322		return ch, nil
1323	}
1324
1325	if p.charsRight() < 3 {
1326		return "", p.getErr(ErrIncompleteSlashP)
1327	}
1328	ch := p.moveRightGetChar()
1329	if ch != '{' {
1330		return "", p.getErr(ErrMalformedSlashP)
1331	}
1332
1333	startpos := p.textpos()
1334	for p.charsRight() > 0 {
1335		ch = p.moveRightGetChar()
1336		if !(IsWordChar(ch) || ch == '-') {
1337			p.moveLeft()
1338			break
1339		}
1340	}
1341	capname := string(p.pattern[startpos:p.textpos()])
1342
1343	if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
1344		return "", p.getErr(ErrIncompleteSlashP)
1345	}
1346
1347	if !isValidUnicodeCat(capname) {
1348		return "", p.getErr(ErrUnknownSlashP, capname)
1349	}
1350
1351	return capname, nil
1352}
1353
1354// Returns ReNode type for zero-length assertions with a \ code.
1355func (p *parser) typeFromCode(ch rune) nodeType {
1356	switch ch {
1357	case 'b':
1358		if p.useOptionE() {
1359			return ntECMABoundary
1360		}
1361		return ntBoundary
1362	case 'B':
1363		if p.useOptionE() {
1364			return ntNonECMABoundary
1365		}
1366		return ntNonboundary
1367	case 'A':
1368		return ntBeginning
1369	case 'G':
1370		return ntStart
1371	case 'Z':
1372		return ntEndZ
1373	case 'z':
1374		return ntEnd
1375	default:
1376		return ntNothing
1377	}
1378}
1379
1380// Scans whitespace or x-mode comments.
1381func (p *parser) scanBlank() error {
1382	if p.useOptionX() {
1383		for {
1384			for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
1385				p.moveRight(1)
1386			}
1387
1388			if p.charsRight() == 0 {
1389				break
1390			}
1391
1392			if p.rightChar(0) == '#' {
1393				for p.charsRight() > 0 && p.rightChar(0) != '\n' {
1394					p.moveRight(1)
1395				}
1396			} else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
1397				p.rightChar(1) == '?' && p.rightChar(0) == '(' {
1398				for p.charsRight() > 0 && p.rightChar(0) != ')' {
1399					p.moveRight(1)
1400				}
1401				if p.charsRight() == 0 {
1402					return p.getErr(ErrUnterminatedComment)
1403				}
1404				p.moveRight(1)
1405			} else {
1406				break
1407			}
1408		}
1409	} else {
1410		for {
1411			if p.charsRight() < 3 || p.rightChar(2) != '#' ||
1412				p.rightChar(1) != '?' || p.rightChar(0) != '(' {
1413				return nil
1414			}
1415
1416			for p.charsRight() > 0 && p.rightChar(0) != ')' {
1417				p.moveRight(1)
1418			}
1419			if p.charsRight() == 0 {
1420				return p.getErr(ErrUnterminatedComment)
1421			}
1422			p.moveRight(1)
1423		}
1424	}
1425	return nil
1426}
1427
1428func (p *parser) scanCapname() string {
1429	startpos := p.textpos()
1430
1431	for p.charsRight() > 0 {
1432		if !IsWordChar(p.moveRightGetChar()) {
1433			p.moveLeft()
1434			break
1435		}
1436	}
1437
1438	return string(p.pattern[startpos:p.textpos()])
1439}
1440
1441// Scans contents of [] (not including []'s), and converts to a set.
1442func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
1443	ch := '\x00'
1444	chPrev := '\x00'
1445	inRange := false
1446	firstChar := true
1447	closed := false
1448
1449	var cc *CharSet
1450	if !scanOnly {
1451		cc = &CharSet{}
1452	}
1453
1454	if p.charsRight() > 0 && p.rightChar(0) == '^' {
1455		p.moveRight(1)
1456		if !scanOnly {
1457			cc.negate = true
1458		}
1459	}
1460
1461	for ; p.charsRight() > 0; firstChar = false {
1462		fTranslatedChar := false
1463		ch = p.moveRightGetChar()
1464		if ch == ']' {
1465			if !firstChar {
1466				closed = true
1467				break
1468			} else if p.useOptionE() {
1469				if !scanOnly {
1470					cc.addRanges(NoneClass().ranges)
1471				}
1472				closed = true
1473				break
1474			}
1475
1476		} else if ch == '\\' && p.charsRight() > 0 {
1477			switch ch = p.moveRightGetChar(); ch {
1478			case 'D', 'd':
1479				if !scanOnly {
1480					if inRange {
1481						return nil, p.getErr(ErrBadClassInCharRange, ch)
1482					}
1483					cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw)
1484				}
1485				continue
1486
1487			case 'S', 's':
1488				if !scanOnly {
1489					if inRange {
1490						return nil, p.getErr(ErrBadClassInCharRange, ch)
1491					}
1492					cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')
1493				}
1494				continue
1495
1496			case 'W', 'w':
1497				if !scanOnly {
1498					if inRange {
1499						return nil, p.getErr(ErrBadClassInCharRange, ch)
1500					}
1501
1502					cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W')
1503				}
1504				continue
1505
1506			case 'p', 'P':
1507				if !scanOnly {
1508					if inRange {
1509						return nil, p.getErr(ErrBadClassInCharRange, ch)
1510					}
1511					prop, err := p.parseProperty()
1512					if err != nil {
1513						return nil, err
1514					}
1515					cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
1516				} else {
1517					p.parseProperty()
1518				}
1519
1520				continue
1521
1522			case '-':
1523				if !scanOnly {
1524					cc.addRange(ch, ch)
1525				}
1526				continue
1527
1528			default:
1529				p.moveLeft()
1530				var err error
1531				ch, err = p.scanCharEscape() // non-literal character
1532				if err != nil {
1533					return nil, err
1534				}
1535				fTranslatedChar = true
1536				break // this break will only break out of the switch
1537			}
1538		} else if ch == '[' {
1539			// This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
1540			// It currently doesn't do anything other than skip the whole thing!
1541			if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
1542				savePos := p.textpos()
1543
1544				p.moveRight(1)
1545				negate := false
1546				if p.charsRight() > 1 && p.rightChar(0) == '^' {
1547					negate = true
1548					p.moveRight(1)
1549				}
1550
1551				nm := p.scanCapname() // snag the name
1552				if !scanOnly && p.useRE2() {
1553					// look up the name since these are valid for RE2
1554					// add the group based on the name
1555					if ok := cc.addNamedASCII(nm, negate); !ok {
1556						return nil, p.getErr(ErrInvalidCharRange)
1557					}
1558				}
1559				if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
1560					p.textto(savePos)
1561				} else if p.useRE2() {
1562					// move on
1563					continue
1564				}
1565			}
1566		}
1567
1568		if inRange {
1569			inRange = false
1570			if !scanOnly {
1571				if ch == '[' && !fTranslatedChar && !firstChar {
1572					// We thought we were in a range, but we're actually starting a subtraction.
1573					// In that case, we'll add chPrev to our char class, skip the opening [, and
1574					// scan the new character class recursively.
1575					cc.addChar(chPrev)
1576					sub, err := p.scanCharSet(caseInsensitive, false)
1577					if err != nil {
1578						return nil, err
1579					}
1580					cc.addSubtraction(sub)
1581
1582					if p.charsRight() > 0 && p.rightChar(0) != ']' {
1583						return nil, p.getErr(ErrSubtractionMustBeLast)
1584					}
1585				} else {
1586					// a regular range, like a-z
1587					if chPrev > ch {
1588						return nil, p.getErr(ErrReversedCharRange, chPrev, ch)
1589					}
1590					cc.addRange(chPrev, ch)
1591				}
1592			}
1593		} else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
1594			// this could be the start of a range
1595			chPrev = ch
1596			inRange = true
1597			p.moveRight(1)
1598		} else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
1599			// we aren't in a range, and now there is a subtraction.  Usually this happens
1600			// only when a subtraction follows a range, like [a-z-[b]]
1601			if !scanOnly {
1602				p.moveRight(1)
1603				sub, err := p.scanCharSet(caseInsensitive, false)
1604				if err != nil {
1605					return nil, err
1606				}
1607				cc.addSubtraction(sub)
1608
1609				if p.charsRight() > 0 && p.rightChar(0) != ']' {
1610					return nil, p.getErr(ErrSubtractionMustBeLast)
1611				}
1612			} else {
1613				p.moveRight(1)
1614				p.scanCharSet(caseInsensitive, true)
1615			}
1616		} else {
1617			if !scanOnly {
1618				cc.addRange(ch, ch)
1619			}
1620		}
1621	}
1622
1623	if !closed {
1624		return nil, p.getErr(ErrUnterminatedBracket)
1625	}
1626
1627	if !scanOnly && caseInsensitive {
1628		cc.addLowercase()
1629	}
1630
1631	return cc, nil
1632}
1633
1634// Scans any number of decimal digits (pegs value at 2^31-1 if too large)
1635func (p *parser) scanDecimal() (int, error) {
1636	i := 0
1637	var d int
1638
1639	for p.charsRight() > 0 {
1640		d = int(p.rightChar(0) - '0')
1641		if d < 0 || d > 9 {
1642			break
1643		}
1644		p.moveRight(1)
1645
1646		if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) {
1647			return 0, p.getErr(ErrCaptureGroupOutOfRange)
1648		}
1649
1650		i *= 10
1651		i += d
1652	}
1653
1654	return int(i), nil
1655}
1656
1657// Returns true for options allowed only at the top level
1658func isOnlyTopOption(option RegexOptions) bool {
1659	return option == RightToLeft || option == ECMAScript || option == RE2
1660}
1661
1662// Scans cimsx-cimsx option string, stops at the first unrecognized char.
1663func (p *parser) scanOptions() {
1664
1665	for off := false; p.charsRight() > 0; p.moveRight(1) {
1666		ch := p.rightChar(0)
1667
1668		if ch == '-' {
1669			off = true
1670		} else if ch == '+' {
1671			off = false
1672		} else {
1673			option := optionFromCode(ch)
1674			if option == 0 || isOnlyTopOption(option) {
1675				return
1676			}
1677
1678			if off {
1679				p.options &= ^option
1680			} else {
1681				p.options |= option
1682			}
1683		}
1684	}
1685}
1686
1687// Scans \ code for escape codes that map to single unicode chars.
1688func (p *parser) scanCharEscape() (r rune, err error) {
1689
1690	ch := p.moveRightGetChar()
1691
1692	if ch >= '0' && ch <= '7' {
1693		p.moveLeft()
1694		return p.scanOctal(), nil
1695	}
1696
1697	pos := p.textpos()
1698
1699	switch ch {
1700	case 'x':
1701		// support for \x{HEX} syntax from Perl and PCRE
1702		if p.charsRight() > 0 && p.rightChar(0) == '{' {
1703			if p.useOptionE() {
1704				return ch, nil
1705			}
1706			p.moveRight(1)
1707			return p.scanHexUntilBrace()
1708		} else {
1709			r, err = p.scanHex(2)
1710		}
1711	case 'u':
1712		// ECMAscript suppot \u{HEX} only if `u` is also set
1713		if p.useOptionE() && p.useOptionU() && p.charsRight() > 0 && p.rightChar(0) == '{' {
1714			p.moveRight(1)
1715			return p.scanHexUntilBrace()
1716		} else {
1717			r, err = p.scanHex(4)
1718		}
1719	case 'a':
1720		return '\u0007', nil
1721	case 'b':
1722		return '\b', nil
1723	case 'e':
1724		return '\u001B', nil
1725	case 'f':
1726		return '\f', nil
1727	case 'n':
1728		return '\n', nil
1729	case 'r':
1730		return '\r', nil
1731	case 't':
1732		return '\t', nil
1733	case 'v':
1734		return '\u000B', nil
1735	case 'c':
1736		r, err = p.scanControl()
1737	default:
1738		if !p.useOptionE() && !p.useRE2() && IsWordChar(ch) {
1739			return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
1740		}
1741		return ch, nil
1742	}
1743	if err != nil && p.useOptionE() {
1744		p.textto(pos)
1745		return ch, nil
1746	}
1747	return
1748}
1749
1750// Grabs and converts an ascii control character
1751func (p *parser) scanControl() (rune, error) {
1752	if p.charsRight() <= 0 {
1753		return 0, p.getErr(ErrMissingControl)
1754	}
1755
1756	ch := p.moveRightGetChar()
1757
1758	// \ca interpreted as \cA
1759
1760	if ch >= 'a' && ch <= 'z' {
1761		ch = (ch - ('a' - 'A'))
1762	}
1763	ch = (ch - '@')
1764	if ch >= 0 && ch < ' ' {
1765		return ch, nil
1766	}
1767
1768	return 0, p.getErr(ErrUnrecognizedControl)
1769
1770}
1771
1772// Scan hex digits until we hit a closing brace.
1773// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors
1774func (p *parser) scanHexUntilBrace() (rune, error) {
1775	// PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit
1776	// so we can enforce that
1777	i := 0
1778	hasContent := false
1779
1780	for p.charsRight() > 0 {
1781		ch := p.moveRightGetChar()
1782		if ch == '}' {
1783			// hit our close brace, we're done here
1784			// prevent \x{}
1785			if !hasContent {
1786				return 0, p.getErr(ErrTooFewHex)
1787			}
1788			return rune(i), nil
1789		}
1790		hasContent = true
1791		// no brace needs to be hex digit
1792		d := hexDigit(ch)
1793		if d < 0 {
1794			return 0, p.getErr(ErrMissingBrace)
1795		}
1796
1797		i *= 0x10
1798		i += d
1799
1800		if i > unicode.MaxRune {
1801			return 0, p.getErr(ErrInvalidHex)
1802		}
1803	}
1804
1805	// we only make it here if we run out of digits without finding the brace
1806	return 0, p.getErr(ErrMissingBrace)
1807}
1808
1809// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
1810func (p *parser) scanHex(c int) (rune, error) {
1811
1812	i := 0
1813
1814	if p.charsRight() >= c {
1815		for c > 0 {
1816			d := hexDigit(p.moveRightGetChar())
1817			if d < 0 {
1818				break
1819			}
1820			i *= 0x10
1821			i += d
1822			c--
1823		}
1824	}
1825
1826	if c > 0 {
1827		return 0, p.getErr(ErrTooFewHex)
1828	}
1829
1830	return rune(i), nil
1831}
1832
1833// Returns n <= 0xF for a hex digit.
1834func hexDigit(ch rune) int {
1835
1836	if d := uint(ch - '0'); d <= 9 {
1837		return int(d)
1838	}
1839
1840	if d := uint(ch - 'a'); d <= 5 {
1841		return int(d + 0xa)
1842	}
1843
1844	if d := uint(ch - 'A'); d <= 5 {
1845		return int(d + 0xa)
1846	}
1847
1848	return -1
1849}
1850
1851// Scans up to three octal digits (stops before exceeding 0377).
1852func (p *parser) scanOctal() rune {
1853	// Consume octal chars only up to 3 digits and value 0377
1854
1855	c := 3
1856
1857	if c > p.charsRight() {
1858		c = p.charsRight()
1859	}
1860
1861	//we know the first char is good because the caller had to check
1862	i := 0
1863	d := int(p.rightChar(0) - '0')
1864	for c > 0 && d <= 7 && d >= 0 {
1865		if i >= 0x20 && p.useOptionE() {
1866			break
1867		}
1868		i *= 8
1869		i += d
1870		c--
1871
1872		p.moveRight(1)
1873		if !p.rightMost() {
1874			d = int(p.rightChar(0) - '0')
1875		}
1876	}
1877
1878	// Octal codes only go up to 255.  Any larger and the behavior that Perl follows
1879	// is simply to truncate the high bits.
1880	i &= 0xFF
1881
1882	return rune(i)
1883}
1884
1885// Returns the current parsing position.
1886func (p *parser) textpos() int {
1887	return p.currentPos
1888}
1889
1890// Zaps to a specific parsing position.
1891func (p *parser) textto(pos int) {
1892	p.currentPos = pos
1893}
1894
1895// Returns the char at the right of the current parsing position and advances to the right.
1896func (p *parser) moveRightGetChar() rune {
1897	ch := p.pattern[p.currentPos]
1898	p.currentPos++
1899	return ch
1900}
1901
1902// Moves the current position to the right.
1903func (p *parser) moveRight(i int) {
1904	// default would be 1
1905	p.currentPos += i
1906}
1907
1908// Moves the current parsing position one to the left.
1909func (p *parser) moveLeft() {
1910	p.currentPos--
1911}
1912
1913// Returns the char left of the current parsing position.
1914func (p *parser) charAt(i int) rune {
1915	return p.pattern[i]
1916}
1917
1918// Returns the char i chars right of the current parsing position.
1919func (p *parser) rightChar(i int) rune {
1920	// default would be 0
1921	return p.pattern[p.currentPos+i]
1922}
1923
1924// Number of characters to the right of the current parsing position.
1925func (p *parser) charsRight() int {
1926	return len(p.pattern) - p.currentPos
1927}
1928
1929func (p *parser) rightMost() bool {
1930	return p.currentPos == len(p.pattern)
1931}
1932
1933// Looks up the slot number for a given name
1934func (p *parser) captureSlotFromName(capname string) int {
1935	return p.capnames[capname]
1936}
1937
1938// True if the capture slot was noted
1939func (p *parser) isCaptureSlot(i int) bool {
1940	if p.caps != nil {
1941		_, ok := p.caps[i]
1942		return ok
1943	}
1944
1945	return (i >= 0 && i < p.capsize)
1946}
1947
1948// Looks up the slot number for a given name
1949func (p *parser) isCaptureName(capname string) bool {
1950	if p.capnames == nil {
1951		return false
1952	}
1953
1954	_, ok := p.capnames[capname]
1955	return ok
1956}
1957
1958// option shortcuts
1959
1960// True if N option disabling '(' autocapture is on.
1961func (p *parser) useOptionN() bool {
1962	return (p.options & ExplicitCapture) != 0
1963}
1964
1965// True if I option enabling case-insensitivity is on.
1966func (p *parser) useOptionI() bool {
1967	return (p.options & IgnoreCase) != 0
1968}
1969
1970// True if M option altering meaning of $ and ^ is on.
1971func (p *parser) useOptionM() bool {
1972	return (p.options & Multiline) != 0
1973}
1974
1975// True if S option altering meaning of . is on.
1976func (p *parser) useOptionS() bool {
1977	return (p.options & Singleline) != 0
1978}
1979
1980// True if X option enabling whitespace/comment mode is on.
1981func (p *parser) useOptionX() bool {
1982	return (p.options & IgnorePatternWhitespace) != 0
1983}
1984
1985// True if E option enabling ECMAScript behavior on.
1986func (p *parser) useOptionE() bool {
1987	return (p.options & ECMAScript) != 0
1988}
1989
1990// true to use RE2 compatibility parsing behavior.
1991func (p *parser) useRE2() bool {
1992	return (p.options & RE2) != 0
1993}
1994
1995// True if U option enabling ECMAScript's Unicode behavior on.
1996func (p *parser) useOptionU() bool {
1997	return (p.options & Unicode) != 0
1998}
1999
2000// True if options stack is empty.
2001func (p *parser) emptyOptionsStack() bool {
2002	return len(p.optionsStack) == 0
2003}
2004
2005// Finish the current quantifiable (when a quantifier is not found or is not possible)
2006func (p *parser) addConcatenate() {
2007	// The first (| inside a Testgroup group goes directly to the group
2008	p.concatenation.addChild(p.unit)
2009	p.unit = nil
2010}
2011
2012// Finish the current quantifiable (when a quantifier is found)
2013func (p *parser) addConcatenate3(lazy bool, min, max int) {
2014	p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
2015	p.unit = nil
2016}
2017
2018// Sets the current unit to a single char node
2019func (p *parser) addUnitOne(ch rune) {
2020	if p.useOptionI() {
2021		ch = unicode.ToLower(ch)
2022	}
2023
2024	p.unit = newRegexNodeCh(ntOne, p.options, ch)
2025}
2026
2027// Sets the current unit to a single inverse-char node
2028func (p *parser) addUnitNotone(ch rune) {
2029	if p.useOptionI() {
2030		ch = unicode.ToLower(ch)
2031	}
2032
2033	p.unit = newRegexNodeCh(ntNotone, p.options, ch)
2034}
2035
2036// Sets the current unit to a single set node
2037func (p *parser) addUnitSet(set *CharSet) {
2038	p.unit = newRegexNodeSet(ntSet, p.options, set)
2039}
2040
2041// Sets the current unit to a subtree
2042func (p *parser) addUnitNode(node *regexNode) {
2043	p.unit = node
2044}
2045
2046// Sets the current unit to an assertion of the specified type
2047func (p *parser) addUnitType(t nodeType) {
2048	p.unit = newRegexNode(t, p.options)
2049}
2050
2051// Finish the current group (in response to a ')' or end)
2052func (p *parser) addGroup() error {
2053	if p.group.t == ntTestgroup || p.group.t == ntTestref {
2054		p.group.addChild(p.concatenation.reverseLeft())
2055		if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 {
2056			return p.getErr(ErrTooManyAlternates)
2057		}
2058	} else {
2059		p.alternation.addChild(p.concatenation.reverseLeft())
2060		p.group.addChild(p.alternation)
2061	}
2062
2063	p.unit = p.group
2064	return nil
2065}
2066
2067// Pops the option stack, but keeps the current options unchanged.
2068func (p *parser) popKeepOptions() {
2069	lastIdx := len(p.optionsStack) - 1
2070	p.optionsStack = p.optionsStack[:lastIdx]
2071}
2072
2073// Recalls options from the stack.
2074func (p *parser) popOptions() {
2075	lastIdx := len(p.optionsStack) - 1
2076	// get the last item on the stack and then remove it by reslicing
2077	p.options = p.optionsStack[lastIdx]
2078	p.optionsStack = p.optionsStack[:lastIdx]
2079}
2080
2081// Saves options on a stack.
2082func (p *parser) pushOptions() {
2083	p.optionsStack = append(p.optionsStack, p.options)
2084}
2085
2086// Add a string to the last concatenate.
2087func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
2088	var node *regexNode
2089
2090	if cch == 0 {
2091		return
2092	}
2093
2094	if cch > 1 {
2095		str := make([]rune, cch)
2096		copy(str, p.pattern[pos:pos+cch])
2097
2098		if p.useOptionI() && !isReplacement {
2099			// We do the ToLower character by character for consistency.  With surrogate chars, doing
2100			// a ToLower on the entire string could actually change the surrogate pair.  This is more correct
2101			// linguistically, but since Regex doesn't support surrogates, it's more important to be
2102			// consistent.
2103			for i := 0; i < len(str); i++ {
2104				str[i] = unicode.ToLower(str[i])
2105			}
2106		}
2107
2108		node = newRegexNodeStr(ntMulti, p.options, str)
2109	} else {
2110		ch := p.charAt(pos)
2111
2112		if p.useOptionI() && !isReplacement {
2113			ch = unicode.ToLower(ch)
2114		}
2115
2116		node = newRegexNodeCh(ntOne, p.options, ch)
2117	}
2118
2119	p.concatenation.addChild(node)
2120}
2121
2122// Push the parser state (in response to an open paren)
2123func (p *parser) pushGroup() {
2124	p.group.next = p.stack
2125	p.alternation.next = p.group
2126	p.concatenation.next = p.alternation
2127	p.stack = p.concatenation
2128}
2129
2130// Remember the pushed state (in response to a ')')
2131func (p *parser) popGroup() error {
2132	p.concatenation = p.stack
2133	p.alternation = p.concatenation.next
2134	p.group = p.alternation.next
2135	p.stack = p.group.next
2136
2137	// The first () inside a Testgroup group goes directly to the group
2138	if p.group.t == ntTestgroup && len(p.group.children) == 0 {
2139		if p.unit == nil {
2140			return p.getErr(ErrConditionalExpression)
2141		}
2142
2143		p.group.addChild(p.unit)
2144		p.unit = nil
2145	}
2146	return nil
2147}
2148
2149// True if the group stack is empty.
2150func (p *parser) emptyStack() bool {
2151	return p.stack == nil
2152}
2153
2154// Start a new round for the parser state (in response to an open paren or string start)
2155func (p *parser) startGroup(openGroup *regexNode) {
2156	p.group = openGroup
2157	p.alternation = newRegexNode(ntAlternate, p.options)
2158	p.concatenation = newRegexNode(ntConcatenate, p.options)
2159}
2160
2161// Finish the current concatenation (in response to a |)
2162func (p *parser) addAlternate() {
2163	// The | parts inside a Testgroup group go directly to the group
2164
2165	if p.group.t == ntTestgroup || p.group.t == ntTestref {
2166		p.group.addChild(p.concatenation.reverseLeft())
2167	} else {
2168		p.alternation.addChild(p.concatenation.reverseLeft())
2169	}
2170
2171	p.concatenation = newRegexNode(ntConcatenate, p.options)
2172}
2173
2174// For categorizing ascii characters.
2175
2176const (
2177	Q byte = 5 // quantifier
2178	S      = 4 // ordinary stopper
2179	Z      = 3 // ScanBlank stopper
2180	X      = 2 // whitespace
2181	E      = 1 // should be escaped
2182)
2183
2184var _category = []byte{
2185	//01  2  3  4  5  6  7  8  9  A  B  C  D  E  F  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
2186	0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2187	// !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /  0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
2188	X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
2189	//@A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
2190	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
2191	//'a  b  c  d  e  f  g  h  i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~
2192	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
2193}
2194
2195func isSpace(ch rune) bool {
2196	return (ch <= ' ' && _category[ch] == X)
2197}
2198
2199// Returns true for those characters that terminate a string of ordinary chars.
2200func isSpecial(ch rune) bool {
2201	return (ch <= '|' && _category[ch] >= S)
2202}
2203
2204// Returns true for those characters that terminate a string of ordinary chars.
2205func isStopperX(ch rune) bool {
2206	return (ch <= '|' && _category[ch] >= X)
2207}
2208
2209// Returns true for those characters that begin a quantifier.
2210func isQuantifier(ch rune) bool {
2211	return (ch <= '{' && _category[ch] >= Q)
2212}
2213
2214func (p *parser) isTrueQuantifier() bool {
2215	nChars := p.charsRight()
2216	if nChars == 0 {
2217		return false
2218	}
2219
2220	startpos := p.textpos()
2221	ch := p.charAt(startpos)
2222	if ch != '{' {
2223		return ch <= '{' && _category[ch] >= Q
2224	}
2225
2226	//UGLY: this is ugly -- the original code was ugly too
2227	pos := startpos
2228	for {
2229		nChars--
2230		if nChars <= 0 {
2231			break
2232		}
2233		pos++
2234		ch = p.charAt(pos)
2235		if ch < '0' || ch > '9' {
2236			break
2237		}
2238	}
2239
2240	if nChars == 0 || pos-startpos == 1 {
2241		return false
2242	}
2243	if ch == '}' {
2244		return true
2245	}
2246	if ch != ',' {
2247		return false
2248	}
2249	for {
2250		nChars--
2251		if nChars <= 0 {
2252			break
2253		}
2254		pos++
2255		ch = p.charAt(pos)
2256		if ch < '0' || ch > '9' {
2257			break
2258		}
2259	}
2260
2261	return nChars > 0 && ch == '}'
2262}