1package syntax
2
3import (
4 "fmt"
5 "math"
6 "os"
7 "sort"
8 "strconv"
9 "unicode"
10)
11
12type RegexOptions int32
13
14const (
15 IgnoreCase RegexOptions = 0x0001 // "i"
16 Multiline = 0x0002 // "m"
17 ExplicitCapture = 0x0004 // "n"
18 Compiled = 0x0008 // "c"
19 Singleline = 0x0010 // "s"
20 IgnorePatternWhitespace = 0x0020 // "x"
21 RightToLeft = 0x0040 // "r"
22 Debug = 0x0080 // "d"
23 ECMAScript = 0x0100 // "e"
24 RE2 = 0x0200 // RE2 compat mode
25 Unicode = 0x0400 // "u"
26)
27
28func optionFromCode(ch rune) RegexOptions {
29 // case-insensitive
30 switch ch {
31 case 'i', 'I':
32 return IgnoreCase
33 case 'r', 'R':
34 return RightToLeft
35 case 'm', 'M':
36 return Multiline
37 case 'n', 'N':
38 return ExplicitCapture
39 case 's', 'S':
40 return Singleline
41 case 'x', 'X':
42 return IgnorePatternWhitespace
43 case 'd', 'D':
44 return Debug
45 case 'e', 'E':
46 return ECMAScript
47 case 'u', 'U':
48 return Unicode
49 default:
50 return 0
51 }
52}
53
54// An Error describes a failure to parse a regular expression
55// and gives the offending expression.
56type Error struct {
57 Code ErrorCode
58 Expr string
59 Args []interface{}
60}
61
62func (e *Error) Error() string {
63 if len(e.Args) == 0 {
64 return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`"
65 }
66 return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`"
67}
68
69// An ErrorCode describes a failure to parse a regular expression.
70type ErrorCode string
71
72const (
73 // internal issue
74 ErrInternalError ErrorCode = "regexp/syntax: internal error"
75 // Parser errors
76 ErrUnterminatedComment = "unterminated comment"
77 ErrInvalidCharRange = "invalid character class range"
78 ErrInvalidRepeatSize = "invalid repeat count"
79 ErrInvalidUTF8 = "invalid UTF-8"
80 ErrCaptureGroupOutOfRange = "capture group number out of range"
81 ErrUnexpectedParen = "unexpected )"
82 ErrMissingParen = "missing closing )"
83 ErrMissingBrace = "missing closing }"
84 ErrInvalidRepeatOp = "invalid nested repetition operator"
85 ErrMissingRepeatArgument = "missing argument to repetition operator"
86 ErrConditionalExpression = "illegal conditional (?(...)) expression"
87 ErrTooManyAlternates = "too many | in (?()|)"
88 ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v"
89 ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator"
90 ErrCapNumNotZero = "capture number cannot be zero"
91 ErrUndefinedBackRef = "reference to undefined group number %v"
92 ErrUndefinedNameRef = "reference to undefined group name %v"
93 ErrAlternationCantCapture = "alternation conditions do not capture and cannot be named"
94 ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
95 ErrMalformedReference = "(?(%v) ) malformed"
96 ErrUndefinedReference = "(?(%v) ) reference to undefined group"
97 ErrIllegalEndEscape = "illegal \\ at end of pattern"
98 ErrMalformedSlashP = "malformed \\p{X} character escape"
99 ErrIncompleteSlashP = "incomplete \\p{X} character escape"
100 ErrUnknownSlashP = "unknown unicode category, script, or property '%v'"
101 ErrUnrecognizedEscape = "unrecognized escape sequence \\%v"
102 ErrMissingControl = "missing control character"
103 ErrUnrecognizedControl = "unrecognized control character"
104 ErrTooFewHex = "insufficient hexadecimal digits"
105 ErrInvalidHex = "hex values may not be larger than 0x10FFFF"
106 ErrMalformedNameRef = "malformed \\k<...> named back reference"
107 ErrBadClassInCharRange = "cannot include class \\%v in character range"
108 ErrUnterminatedBracket = "unterminated [] set"
109 ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class"
110 ErrReversedCharRange = "[%c-%c] range in reverse order"
111)
112
113func (e ErrorCode) String() string {
114 return string(e)
115}
116
117type parser struct {
118 stack *regexNode
119 group *regexNode
120 alternation *regexNode
121 concatenation *regexNode
122 unit *regexNode
123
124 patternRaw string
125 pattern []rune
126
127 currentPos int
128 specialCase *unicode.SpecialCase
129
130 autocap int
131 capcount int
132 captop int
133 capsize int
134
135 caps map[int]int
136 capnames map[string]int
137
138 capnumlist []int
139 capnamelist []string
140
141 options RegexOptions
142 optionsStack []RegexOptions
143 ignoreNextParen bool
144}
145
146const (
147 maxValueDiv10 int = math.MaxInt32 / 10
148 maxValueMod10 = math.MaxInt32 % 10
149)
150
151// Parse converts a regex string into a parse tree
152func Parse(re string, op RegexOptions) (*RegexTree, error) {
153 p := parser{
154 options: op,
155 caps: make(map[int]int),
156 }
157 p.setPattern(re)
158
159 if err := p.countCaptures(); err != nil {
160 return nil, err
161 }
162
163 p.reset(op)
164 root, err := p.scanRegex()
165
166 if err != nil {
167 return nil, err
168 }
169 tree := &RegexTree{
170 root: root,
171 caps: p.caps,
172 capnumlist: p.capnumlist,
173 captop: p.captop,
174 Capnames: p.capnames,
175 Caplist: p.capnamelist,
176 options: op,
177 }
178
179 if tree.options&Debug > 0 {
180 os.Stdout.WriteString(tree.Dump())
181 }
182
183 return tree, nil
184}
185
186func (p *parser) setPattern(pattern string) {
187 p.patternRaw = pattern
188 p.pattern = make([]rune, 0, len(pattern))
189
190 //populate our rune array to handle utf8 encoding
191 for _, r := range pattern {
192 p.pattern = append(p.pattern, r)
193 }
194}
195func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
196 return &Error{Code: code, Expr: p.patternRaw, Args: args}
197}
198
199func (p *parser) noteCaptureSlot(i, pos int) {
200 if _, ok := p.caps[i]; !ok {
201 // the rhs of the hashtable isn't used in the parser
202 p.caps[i] = pos
203 p.capcount++
204
205 if p.captop <= i {
206 if i == math.MaxInt32 {
207 p.captop = i
208 } else {
209 p.captop = i + 1
210 }
211 }
212 }
213}
214
215func (p *parser) noteCaptureName(name string, pos int) {
216 if p.capnames == nil {
217 p.capnames = make(map[string]int)
218 }
219
220 if _, ok := p.capnames[name]; !ok {
221 p.capnames[name] = pos
222 p.capnamelist = append(p.capnamelist, name)
223 }
224}
225
226func (p *parser) assignNameSlots() {
227 if p.capnames != nil {
228 for _, name := range p.capnamelist {
229 for p.isCaptureSlot(p.autocap) {
230 p.autocap++
231 }
232 pos := p.capnames[name]
233 p.capnames[name] = p.autocap
234 p.noteCaptureSlot(p.autocap, pos)
235
236 p.autocap++
237 }
238 }
239
240 // if the caps array has at least one gap, construct the list of used slots
241 if p.capcount < p.captop {
242 p.capnumlist = make([]int, p.capcount)
243 i := 0
244
245 for k := range p.caps {
246 p.capnumlist[i] = k
247 i++
248 }
249
250 sort.Ints(p.capnumlist)
251 }
252
253 // merge capsnumlist into capnamelist
254 if p.capnames != nil || p.capnumlist != nil {
255 var oldcapnamelist []string
256 var next int
257 var k int
258
259 if p.capnames == nil {
260 oldcapnamelist = nil
261 p.capnames = make(map[string]int)
262 p.capnamelist = []string{}
263 next = -1
264 } else {
265 oldcapnamelist = p.capnamelist
266 p.capnamelist = []string{}
267 next = p.capnames[oldcapnamelist[0]]
268 }
269
270 for i := 0; i < p.capcount; i++ {
271 j := i
272 if p.capnumlist != nil {
273 j = p.capnumlist[i]
274 }
275
276 if next == j {
277 p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
278 k++
279
280 if k == len(oldcapnamelist) {
281 next = -1
282 } else {
283 next = p.capnames[oldcapnamelist[k]]
284 }
285
286 } else {
287 //feature: culture?
288 str := strconv.Itoa(j)
289 p.capnamelist = append(p.capnamelist, str)
290 p.capnames[str] = j
291 }
292 }
293 }
294}
295
296func (p *parser) consumeAutocap() int {
297 r := p.autocap
298 p.autocap++
299 return r
300}
301
302// CountCaptures is a prescanner for deducing the slots used for
303// captures by doing a partial tokenization of the pattern.
304func (p *parser) countCaptures() error {
305 var ch rune
306
307 p.noteCaptureSlot(0, 0)
308
309 p.autocap = 1
310
311 for p.charsRight() > 0 {
312 pos := p.textpos()
313 ch = p.moveRightGetChar()
314 switch ch {
315 case '\\':
316 if p.charsRight() > 0 {
317 p.scanBackslash(true)
318 }
319
320 case '#':
321 if p.useOptionX() {
322 p.moveLeft()
323 p.scanBlank()
324 }
325
326 case '[':
327 p.scanCharSet(false, true)
328
329 case ')':
330 if !p.emptyOptionsStack() {
331 p.popOptions()
332 }
333
334 case '(':
335 if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
336 p.moveLeft()
337 p.scanBlank()
338 } else {
339 p.pushOptions()
340 if p.charsRight() > 0 && p.rightChar(0) == '?' {
341 // we have (?...
342 p.moveRight(1)
343
344 if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
345 // named group: (?<... or (?'...
346
347 p.moveRight(1)
348 ch = p.rightChar(0)
349
350 if ch != '0' && IsWordChar(ch) {
351 if ch >= '1' && ch <= '9' {
352 dec, err := p.scanDecimal()
353 if err != nil {
354 return err
355 }
356 p.noteCaptureSlot(dec, pos)
357 } else {
358 p.noteCaptureName(p.scanCapname(), pos)
359 }
360 }
361 } else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
362 // RE2-compat (?P<)
363 p.moveRight(2)
364 ch = p.rightChar(0)
365 if IsWordChar(ch) {
366 p.noteCaptureName(p.scanCapname(), pos)
367 }
368
369 } else {
370 // (?...
371
372 // get the options if it's an option construct (?cimsx-cimsx...)
373 p.scanOptions()
374
375 if p.charsRight() > 0 {
376 if p.rightChar(0) == ')' {
377 // (?cimsx-cimsx)
378 p.moveRight(1)
379 p.popKeepOptions()
380 } else if p.rightChar(0) == '(' {
381 // alternation construct: (?(foo)yes|no)
382 // ignore the next paren so we don't capture the condition
383 p.ignoreNextParen = true
384
385 // break from here so we don't reset ignoreNextParen
386 continue
387 }
388 }
389 }
390 } else {
391 if !p.useOptionN() && !p.ignoreNextParen {
392 p.noteCaptureSlot(p.consumeAutocap(), pos)
393 }
394 }
395 }
396
397 p.ignoreNextParen = false
398
399 }
400 }
401
402 p.assignNameSlots()
403 return nil
404}
405
406func (p *parser) reset(topopts RegexOptions) {
407 p.currentPos = 0
408 p.autocap = 1
409 p.ignoreNextParen = false
410
411 if len(p.optionsStack) > 0 {
412 p.optionsStack = p.optionsStack[:0]
413 }
414
415 p.options = topopts
416 p.stack = nil
417}
418
419func (p *parser) scanRegex() (*regexNode, error) {
420 ch := '@' // nonspecial ch, means at beginning
421 isQuant := false
422
423 p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))
424
425 for p.charsRight() > 0 {
426 wasPrevQuantifier := isQuant
427 isQuant = false
428
429 if err := p.scanBlank(); err != nil {
430 return nil, err
431 }
432
433 startpos := p.textpos()
434
435 // move past all of the normal characters. We'll stop when we hit some kind of control character,
436 // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
437 if p.useOptionX() {
438 for p.charsRight() > 0 {
439 ch = p.rightChar(0)
440 //UGLY: clean up, this is ugly
441 if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
442 break
443 }
444 p.moveRight(1)
445 }
446 } else {
447 for p.charsRight() > 0 {
448 ch = p.rightChar(0)
449 if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
450 break
451 }
452 p.moveRight(1)
453 }
454 }
455
456 endpos := p.textpos()
457
458 p.scanBlank()
459
460 if p.charsRight() == 0 {
461 ch = '!' // nonspecial, means at end
462 } else if ch = p.rightChar(0); isSpecial(ch) {
463 isQuant = isQuantifier(ch)
464 p.moveRight(1)
465 } else {
466 ch = ' ' // nonspecial, means at ordinary char
467 }
468
469 if startpos < endpos {
470 cchUnquantified := endpos - startpos
471 if isQuant {
472 cchUnquantified--
473 }
474 wasPrevQuantifier = false
475
476 if cchUnquantified > 0 {
477 p.addToConcatenate(startpos, cchUnquantified, false)
478 }
479
480 if isQuant {
481 p.addUnitOne(p.charAt(endpos - 1))
482 }
483 }
484
485 switch ch {
486 case '!':
487 goto BreakOuterScan
488
489 case ' ':
490 goto ContinueOuterScan
491
492 case '[':
493 cc, err := p.scanCharSet(p.useOptionI(), false)
494 if err != nil {
495 return nil, err
496 }
497 p.addUnitSet(cc)
498
499 case '(':
500 p.pushOptions()
501
502 if grouper, err := p.scanGroupOpen(); err != nil {
503 return nil, err
504 } else if grouper == nil {
505 p.popKeepOptions()
506 } else {
507 p.pushGroup()
508 p.startGroup(grouper)
509 }
510
511 continue
512
513 case '|':
514 p.addAlternate()
515 goto ContinueOuterScan
516
517 case ')':
518 if p.emptyStack() {
519 return nil, p.getErr(ErrUnexpectedParen)
520 }
521
522 if err := p.addGroup(); err != nil {
523 return nil, err
524 }
525 if err := p.popGroup(); err != nil {
526 return nil, err
527 }
528 p.popOptions()
529
530 if p.unit == nil {
531 goto ContinueOuterScan
532 }
533
534 case '\\':
535 n, err := p.scanBackslash(false)
536 if err != nil {
537 return nil, err
538 }
539 p.addUnitNode(n)
540
541 case '^':
542 if p.useOptionM() {
543 p.addUnitType(ntBol)
544 } else {
545 p.addUnitType(ntBeginning)
546 }
547
548 case '$':
549 if p.useOptionM() {
550 p.addUnitType(ntEol)
551 } else {
552 p.addUnitType(ntEndZ)
553 }
554
555 case '.':
556 if p.useOptionS() {
557 p.addUnitSet(AnyClass())
558 } else if p.useOptionE() {
559 p.addUnitSet(ECMAAnyClass())
560 } else {
561 p.addUnitNotone('\n')
562 }
563
564 case '{', '*', '+', '?':
565 if p.unit == nil {
566 if wasPrevQuantifier {
567 return nil, p.getErr(ErrInvalidRepeatOp)
568 } else {
569 return nil, p.getErr(ErrMissingRepeatArgument)
570 }
571 }
572 p.moveLeft()
573
574 default:
575 return nil, p.getErr(ErrInternalError)
576 }
577
578 if err := p.scanBlank(); err != nil {
579 return nil, err
580 }
581
582 if p.charsRight() > 0 {
583 isQuant = p.isTrueQuantifier()
584 }
585 if p.charsRight() == 0 || !isQuant {
586 //maintain odd C# assignment order -- not sure if required, could clean up?
587 p.addConcatenate()
588 goto ContinueOuterScan
589 }
590
591 ch = p.moveRightGetChar()
592
593 // Handle quantifiers
594 for p.unit != nil {
595 var min, max int
596 var lazy bool
597
598 switch ch {
599 case '*':
600 min = 0
601 max = math.MaxInt32
602
603 case '?':
604 min = 0
605 max = 1
606
607 case '+':
608 min = 1
609 max = math.MaxInt32
610
611 case '{':
612 {
613 var err error
614 startpos = p.textpos()
615 if min, err = p.scanDecimal(); err != nil {
616 return nil, err
617 }
618 max = min
619 if startpos < p.textpos() {
620 if p.charsRight() > 0 && p.rightChar(0) == ',' {
621 p.moveRight(1)
622 if p.charsRight() == 0 || p.rightChar(0) == '}' {
623 max = math.MaxInt32
624 } else {
625 if max, err = p.scanDecimal(); err != nil {
626 return nil, err
627 }
628 }
629 }
630 }
631
632 if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
633 p.addConcatenate()
634 p.textto(startpos - 1)
635 goto ContinueOuterScan
636 }
637 }
638
639 default:
640 return nil, p.getErr(ErrInternalError)
641 }
642
643 if err := p.scanBlank(); err != nil {
644 return nil, err
645 }
646
647 if p.charsRight() == 0 || p.rightChar(0) != '?' {
648 lazy = false
649 } else {
650 p.moveRight(1)
651 lazy = true
652 }
653
654 if min > max {
655 return nil, p.getErr(ErrInvalidRepeatSize)
656 }
657
658 p.addConcatenate3(lazy, min, max)
659 }
660
661 ContinueOuterScan:
662 }
663
664BreakOuterScan:
665 ;
666
667 if !p.emptyStack() {
668 return nil, p.getErr(ErrMissingParen)
669 }
670
671 if err := p.addGroup(); err != nil {
672 return nil, err
673 }
674
675 return p.unit, nil
676
677}
678
679/*
680 * Simple parsing for replacement patterns
681 */
682func (p *parser) scanReplacement() (*regexNode, error) {
683 var c, startpos int
684
685 p.concatenation = newRegexNode(ntConcatenate, p.options)
686
687 for {
688 c = p.charsRight()
689 if c == 0 {
690 break
691 }
692
693 startpos = p.textpos()
694
695 for c > 0 && p.rightChar(0) != '$' {
696 p.moveRight(1)
697 c--
698 }
699
700 p.addToConcatenate(startpos, p.textpos()-startpos, true)
701
702 if c > 0 {
703 if p.moveRightGetChar() == '$' {
704 n, err := p.scanDollar()
705 if err != nil {
706 return nil, err
707 }
708 p.addUnitNode(n)
709 }
710 p.addConcatenate()
711 }
712 }
713
714 return p.concatenation, nil
715}
716
717/*
718 * Scans $ patterns recognized within replacement patterns
719 */
720func (p *parser) scanDollar() (*regexNode, error) {
721 if p.charsRight() == 0 {
722 return newRegexNodeCh(ntOne, p.options, '$'), nil
723 }
724
725 ch := p.rightChar(0)
726 angled := false
727 backpos := p.textpos()
728 lastEndPos := backpos
729
730 // Note angle
731
732 if ch == '{' && p.charsRight() > 1 {
733 angled = true
734 p.moveRight(1)
735 ch = p.rightChar(0)
736 }
737
738 // Try to parse backreference: \1 or \{1} or \{cap}
739
740 if ch >= '0' && ch <= '9' {
741 if !angled && p.useOptionE() {
742 capnum := -1
743 newcapnum := int(ch - '0')
744 p.moveRight(1)
745 if p.isCaptureSlot(newcapnum) {
746 capnum = newcapnum
747 lastEndPos = p.textpos()
748 }
749
750 for p.charsRight() > 0 {
751 ch = p.rightChar(0)
752 if ch < '0' || ch > '9' {
753 break
754 }
755 digit := int(ch - '0')
756 if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
757 return nil, p.getErr(ErrCaptureGroupOutOfRange)
758 }
759
760 newcapnum = newcapnum*10 + digit
761
762 p.moveRight(1)
763 if p.isCaptureSlot(newcapnum) {
764 capnum = newcapnum
765 lastEndPos = p.textpos()
766 }
767 }
768 p.textto(lastEndPos)
769 if capnum >= 0 {
770 return newRegexNodeM(ntRef, p.options, capnum), nil
771 }
772 } else {
773 capnum, err := p.scanDecimal()
774 if err != nil {
775 return nil, err
776 }
777 if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
778 if p.isCaptureSlot(capnum) {
779 return newRegexNodeM(ntRef, p.options, capnum), nil
780 }
781 }
782 }
783 } else if angled && IsWordChar(ch) {
784 capname := p.scanCapname()
785
786 if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
787 if p.isCaptureName(capname) {
788 return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
789 }
790 }
791 } else if !angled {
792 capnum := 1
793
794 switch ch {
795 case '$':
796 p.moveRight(1)
797 return newRegexNodeCh(ntOne, p.options, '$'), nil
798 case '&':
799 capnum = 0
800 case '`':
801 capnum = replaceLeftPortion
802 case '\'':
803 capnum = replaceRightPortion
804 case '+':
805 capnum = replaceLastGroup
806 case '_':
807 capnum = replaceWholeString
808 }
809
810 if capnum != 1 {
811 p.moveRight(1)
812 return newRegexNodeM(ntRef, p.options, capnum), nil
813 }
814 }
815
816 // unrecognized $: literalize
817
818 p.textto(backpos)
819 return newRegexNodeCh(ntOne, p.options, '$'), nil
820}
821
822// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
823// a RegexNode for the type of group scanned, or nil if the group
824// simply changed options (?cimsx-cimsx) or was a comment (#...).
825func (p *parser) scanGroupOpen() (*regexNode, error) {
826 var ch rune
827 var nt nodeType
828 var err error
829 close := '>'
830 start := p.textpos()
831
832 // just return a RegexNode if we have:
833 // 1. "(" followed by nothing
834 // 2. "(x" where x != ?
835 // 3. "(?)"
836 if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
837 if p.useOptionN() || p.ignoreNextParen {
838 p.ignoreNextParen = false
839 return newRegexNode(ntGroup, p.options), nil
840 }
841 return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
842 }
843
844 p.moveRight(1)
845
846 for {
847 if p.charsRight() == 0 {
848 break
849 }
850
851 switch ch = p.moveRightGetChar(); ch {
852 case ':':
853 nt = ntGroup
854
855 case '=':
856 p.options &= ^RightToLeft
857 nt = ntRequire
858
859 case '!':
860 p.options &= ^RightToLeft
861 nt = ntPrevent
862
863 case '>':
864 nt = ntGreedy
865
866 case '\'':
867 close = '\''
868 fallthrough
869
870 case '<':
871 if p.charsRight() == 0 {
872 goto BreakRecognize
873 }
874
875 switch ch = p.moveRightGetChar(); ch {
876 case '=':
877 if close == '\'' {
878 goto BreakRecognize
879 }
880
881 p.options |= RightToLeft
882 nt = ntRequire
883
884 case '!':
885 if close == '\'' {
886 goto BreakRecognize
887 }
888
889 p.options |= RightToLeft
890 nt = ntPrevent
891
892 default:
893 p.moveLeft()
894 capnum := -1
895 uncapnum := -1
896 proceed := false
897
898 // grab part before -
899
900 if ch >= '0' && ch <= '9' {
901 if capnum, err = p.scanDecimal(); err != nil {
902 return nil, err
903 }
904
905 if !p.isCaptureSlot(capnum) {
906 capnum = -1
907 }
908
909 // check if we have bogus characters after the number
910 if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
911 return nil, p.getErr(ErrInvalidGroupName)
912 }
913 if capnum == 0 {
914 return nil, p.getErr(ErrCapNumNotZero)
915 }
916 } else if IsWordChar(ch) {
917 capname := p.scanCapname()
918
919 if p.isCaptureName(capname) {
920 capnum = p.captureSlotFromName(capname)
921 }
922
923 // check if we have bogus character after the name
924 if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
925 return nil, p.getErr(ErrInvalidGroupName)
926 }
927 } else if ch == '-' {
928 proceed = true
929 } else {
930 // bad group name - starts with something other than a word character and isn't a number
931 return nil, p.getErr(ErrInvalidGroupName)
932 }
933
934 // grab part after - if any
935
936 if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
937 p.moveRight(1)
938
939 //no more chars left, no closing char, etc
940 if p.charsRight() == 0 {
941 return nil, p.getErr(ErrInvalidGroupName)
942 }
943
944 ch = p.rightChar(0)
945 if ch >= '0' && ch <= '9' {
946 if uncapnum, err = p.scanDecimal(); err != nil {
947 return nil, err
948 }
949
950 if !p.isCaptureSlot(uncapnum) {
951 return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
952 }
953
954 // check if we have bogus characters after the number
955 if p.charsRight() > 0 && p.rightChar(0) != close {
956 return nil, p.getErr(ErrInvalidGroupName)
957 }
958 } else if IsWordChar(ch) {
959 uncapname := p.scanCapname()
960
961 if !p.isCaptureName(uncapname) {
962 return nil, p.getErr(ErrUndefinedNameRef, uncapname)
963 }
964 uncapnum = p.captureSlotFromName(uncapname)
965
966 // check if we have bogus character after the name
967 if p.charsRight() > 0 && p.rightChar(0) != close {
968 return nil, p.getErr(ErrInvalidGroupName)
969 }
970 } else {
971 // bad group name - starts with something other than a word character and isn't a number
972 return nil, p.getErr(ErrInvalidGroupName)
973 }
974 }
975
976 // actually make the node
977
978 if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
979 return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
980 }
981 goto BreakRecognize
982 }
983
984 case '(':
985 // alternation construct (?(...) | )
986
987 parenPos := p.textpos()
988 if p.charsRight() > 0 {
989 ch = p.rightChar(0)
990
991 // check if the alternation condition is a backref
992 if ch >= '0' && ch <= '9' {
993 var capnum int
994 if capnum, err = p.scanDecimal(); err != nil {
995 return nil, err
996 }
997 if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
998 if p.isCaptureSlot(capnum) {
999 return newRegexNodeM(ntTestref, p.options, capnum), nil
1000 }
1001 return nil, p.getErr(ErrUndefinedReference, capnum)
1002 }
1003
1004 return nil, p.getErr(ErrMalformedReference, capnum)
1005
1006 } else if IsWordChar(ch) {
1007 capname := p.scanCapname()
1008
1009 if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
1010 return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
1011 }
1012 }
1013 }
1014 // not a backref
1015 nt = ntTestgroup
1016 p.textto(parenPos - 1) // jump to the start of the parentheses
1017 p.ignoreNextParen = true // but make sure we don't try to capture the insides
1018
1019 charsRight := p.charsRight()
1020 if charsRight >= 3 && p.rightChar(1) == '?' {
1021 rightchar2 := p.rightChar(2)
1022 // disallow comments in the condition
1023 if rightchar2 == '#' {
1024 return nil, p.getErr(ErrAlternationCantHaveComment)
1025 }
1026
1027 // disallow named capture group (?<..>..) in the condition
1028 if rightchar2 == '\'' {
1029 return nil, p.getErr(ErrAlternationCantCapture)
1030 }
1031
1032 if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
1033 return nil, p.getErr(ErrAlternationCantCapture)
1034 }
1035 }
1036
1037 case 'P':
1038 if p.useRE2() {
1039 // support for P<name> syntax
1040 if p.charsRight() < 3 {
1041 goto BreakRecognize
1042 }
1043
1044 ch = p.moveRightGetChar()
1045 if ch != '<' {
1046 goto BreakRecognize
1047 }
1048
1049 ch = p.moveRightGetChar()
1050 p.moveLeft()
1051
1052 if IsWordChar(ch) {
1053 capnum := -1
1054 capname := p.scanCapname()
1055
1056 if p.isCaptureName(capname) {
1057 capnum = p.captureSlotFromName(capname)
1058 }
1059
1060 // check if we have bogus character after the name
1061 if p.charsRight() > 0 && p.rightChar(0) != '>' {
1062 return nil, p.getErr(ErrInvalidGroupName)
1063 }
1064
1065 // actually make the node
1066
1067 if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
1068 return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
1069 }
1070 goto BreakRecognize
1071
1072 } else {
1073 // bad group name - starts with something other than a word character and isn't a number
1074 return nil, p.getErr(ErrInvalidGroupName)
1075 }
1076 }
1077 // if we're not using RE2 compat mode then
1078 // we just behave like normal
1079 fallthrough
1080
1081 default:
1082 p.moveLeft()
1083
1084 nt = ntGroup
1085 // disallow options in the children of a testgroup node
1086 if p.group.t != ntTestgroup {
1087 p.scanOptions()
1088 }
1089 if p.charsRight() == 0 {
1090 goto BreakRecognize
1091 }
1092
1093 if ch = p.moveRightGetChar(); ch == ')' {
1094 return nil, nil
1095 }
1096
1097 if ch != ':' {
1098 goto BreakRecognize
1099 }
1100
1101 }
1102
1103 return newRegexNode(nt, p.options), nil
1104 }
1105
1106BreakRecognize:
1107
1108 // break Recognize comes here
1109
1110 return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
1111}
1112
1113// scans backslash specials and basics
1114func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
1115
1116 if p.charsRight() == 0 {
1117 return nil, p.getErr(ErrIllegalEndEscape)
1118 }
1119
1120 switch ch := p.rightChar(0); ch {
1121 case 'b', 'B', 'A', 'G', 'Z', 'z':
1122 p.moveRight(1)
1123 return newRegexNode(p.typeFromCode(ch), p.options), nil
1124
1125 case 'w':
1126 p.moveRight(1)
1127 if p.useOptionE() || p.useRE2() {
1128 return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
1129 }
1130 return newRegexNodeSet(ntSet, p.options, WordClass()), nil
1131
1132 case 'W':
1133 p.moveRight(1)
1134 if p.useOptionE() || p.useRE2() {
1135 return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
1136 }
1137 return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
1138
1139 case 's':
1140 p.moveRight(1)
1141 if p.useOptionE() {
1142 return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
1143 } else if p.useRE2() {
1144 return newRegexNodeSet(ntSet, p.options, RE2SpaceClass()), nil
1145 }
1146 return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
1147
1148 case 'S':
1149 p.moveRight(1)
1150 if p.useOptionE() {
1151 return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
1152 } else if p.useRE2() {
1153 return newRegexNodeSet(ntSet, p.options, NotRE2SpaceClass()), nil
1154 }
1155 return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
1156
1157 case 'd':
1158 p.moveRight(1)
1159 if p.useOptionE() || p.useRE2() {
1160 return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
1161 }
1162 return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
1163
1164 case 'D':
1165 p.moveRight(1)
1166 if p.useOptionE() || p.useRE2() {
1167 return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
1168 }
1169 return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
1170
1171 case 'p', 'P':
1172 p.moveRight(1)
1173 prop, err := p.parseProperty()
1174 if err != nil {
1175 return nil, err
1176 }
1177 cc := &CharSet{}
1178 cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
1179 if p.useOptionI() {
1180 cc.addLowercase()
1181 }
1182
1183 return newRegexNodeSet(ntSet, p.options, cc), nil
1184
1185 default:
1186 return p.scanBasicBackslash(scanOnly)
1187 }
1188}
1189
1190// Scans \-style backreferences and character escapes
1191func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
1192 if p.charsRight() == 0 {
1193 return nil, p.getErr(ErrIllegalEndEscape)
1194 }
1195 angled := false
1196 k := false
1197 close := '\x00'
1198
1199 backpos := p.textpos()
1200 ch := p.rightChar(0)
1201
1202 // Allow \k<foo> instead of \<foo>, which is now deprecated.
1203
1204 // According to ECMAScript specification, \k<name> is only parsed as a named group reference if
1205 // there is at least one group name in the regexp.
1206 // See https://www.ecma-international.org/ecma-262/#sec-isvalidregularexpressionliteral, step 7.
1207 // Note, during the first (scanOnly) run we may not have all group names scanned, but that's ok.
1208 if ch == 'k' && (!p.useOptionE() || len(p.capnames) > 0) {
1209 if p.charsRight() >= 2 {
1210 p.moveRight(1)
1211 ch = p.moveRightGetChar()
1212
1213 if ch == '<' || (!p.useOptionE() && ch == '\'') { // No support for \k'name' in ECMAScript
1214 angled = true
1215 if ch == '\'' {
1216 close = '\''
1217 } else {
1218 close = '>'
1219 }
1220 }
1221 }
1222
1223 if !angled || p.charsRight() <= 0 {
1224 return nil, p.getErr(ErrMalformedNameRef)
1225 }
1226
1227 ch = p.rightChar(0)
1228 k = true
1229
1230 } else if !p.useOptionE() && (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
1231 angled = true
1232 if ch == '\'' {
1233 close = '\''
1234 } else {
1235 close = '>'
1236 }
1237
1238 p.moveRight(1)
1239 ch = p.rightChar(0)
1240 }
1241
1242 // Try to parse backreference: \<1> or \<cap>
1243
1244 if angled && ch >= '0' && ch <= '9' {
1245 capnum, err := p.scanDecimal()
1246 if err != nil {
1247 return nil, err
1248 }
1249
1250 if p.charsRight() > 0 && p.moveRightGetChar() == close {
1251 if p.isCaptureSlot(capnum) {
1252 return newRegexNodeM(ntRef, p.options, capnum), nil
1253 }
1254 return nil, p.getErr(ErrUndefinedBackRef, capnum)
1255 }
1256 } else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
1257 capnum, err := p.scanDecimal()
1258 if err != nil {
1259 return nil, err
1260 }
1261
1262 if scanOnly {
1263 return nil, nil
1264 }
1265
1266 if p.isCaptureSlot(capnum) {
1267 return newRegexNodeM(ntRef, p.options, capnum), nil
1268 }
1269 if capnum <= 9 && !p.useOptionE() {
1270 return nil, p.getErr(ErrUndefinedBackRef, capnum)
1271 }
1272
1273 } else if angled {
1274 capname := p.scanCapname()
1275
1276 if capname != "" && p.charsRight() > 0 && p.moveRightGetChar() == close {
1277
1278 if scanOnly {
1279 return nil, nil
1280 }
1281
1282 if p.isCaptureName(capname) {
1283 return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
1284 }
1285 return nil, p.getErr(ErrUndefinedNameRef, capname)
1286 } else {
1287 if k {
1288 return nil, p.getErr(ErrMalformedNameRef)
1289 }
1290 }
1291 }
1292
1293 // Not backreference: must be char code
1294
1295 p.textto(backpos)
1296 ch, err := p.scanCharEscape()
1297 if err != nil {
1298 return nil, err
1299 }
1300
1301 if scanOnly {
1302 return nil, nil
1303 }
1304
1305 if p.useOptionI() {
1306 ch = unicode.ToLower(ch)
1307 }
1308
1309 return newRegexNodeCh(ntOne, p.options, ch), nil
1310}
1311
1312// Scans X for \p{X} or \P{X}
1313func (p *parser) parseProperty() (string, error) {
1314 // RE2 and PCRE supports \pX syntax (no {} and only 1 letter unicode cats supported)
1315 // since this is purely additive syntax it's not behind a flag
1316 if p.charsRight() >= 1 && p.rightChar(0) != '{' {
1317 ch := string(p.moveRightGetChar())
1318 // check if it's a valid cat
1319 if !isValidUnicodeCat(ch) {
1320 return "", p.getErr(ErrUnknownSlashP, ch)
1321 }
1322 return ch, nil
1323 }
1324
1325 if p.charsRight() < 3 {
1326 return "", p.getErr(ErrIncompleteSlashP)
1327 }
1328 ch := p.moveRightGetChar()
1329 if ch != '{' {
1330 return "", p.getErr(ErrMalformedSlashP)
1331 }
1332
1333 startpos := p.textpos()
1334 for p.charsRight() > 0 {
1335 ch = p.moveRightGetChar()
1336 if !(IsWordChar(ch) || ch == '-') {
1337 p.moveLeft()
1338 break
1339 }
1340 }
1341 capname := string(p.pattern[startpos:p.textpos()])
1342
1343 if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
1344 return "", p.getErr(ErrIncompleteSlashP)
1345 }
1346
1347 if !isValidUnicodeCat(capname) {
1348 return "", p.getErr(ErrUnknownSlashP, capname)
1349 }
1350
1351 return capname, nil
1352}
1353
1354// Returns ReNode type for zero-length assertions with a \ code.
1355func (p *parser) typeFromCode(ch rune) nodeType {
1356 switch ch {
1357 case 'b':
1358 if p.useOptionE() {
1359 return ntECMABoundary
1360 }
1361 return ntBoundary
1362 case 'B':
1363 if p.useOptionE() {
1364 return ntNonECMABoundary
1365 }
1366 return ntNonboundary
1367 case 'A':
1368 return ntBeginning
1369 case 'G':
1370 return ntStart
1371 case 'Z':
1372 return ntEndZ
1373 case 'z':
1374 return ntEnd
1375 default:
1376 return ntNothing
1377 }
1378}
1379
1380// Scans whitespace or x-mode comments.
1381func (p *parser) scanBlank() error {
1382 if p.useOptionX() {
1383 for {
1384 for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
1385 p.moveRight(1)
1386 }
1387
1388 if p.charsRight() == 0 {
1389 break
1390 }
1391
1392 if p.rightChar(0) == '#' {
1393 for p.charsRight() > 0 && p.rightChar(0) != '\n' {
1394 p.moveRight(1)
1395 }
1396 } else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
1397 p.rightChar(1) == '?' && p.rightChar(0) == '(' {
1398 for p.charsRight() > 0 && p.rightChar(0) != ')' {
1399 p.moveRight(1)
1400 }
1401 if p.charsRight() == 0 {
1402 return p.getErr(ErrUnterminatedComment)
1403 }
1404 p.moveRight(1)
1405 } else {
1406 break
1407 }
1408 }
1409 } else {
1410 for {
1411 if p.charsRight() < 3 || p.rightChar(2) != '#' ||
1412 p.rightChar(1) != '?' || p.rightChar(0) != '(' {
1413 return nil
1414 }
1415
1416 for p.charsRight() > 0 && p.rightChar(0) != ')' {
1417 p.moveRight(1)
1418 }
1419 if p.charsRight() == 0 {
1420 return p.getErr(ErrUnterminatedComment)
1421 }
1422 p.moveRight(1)
1423 }
1424 }
1425 return nil
1426}
1427
1428func (p *parser) scanCapname() string {
1429 startpos := p.textpos()
1430
1431 for p.charsRight() > 0 {
1432 if !IsWordChar(p.moveRightGetChar()) {
1433 p.moveLeft()
1434 break
1435 }
1436 }
1437
1438 return string(p.pattern[startpos:p.textpos()])
1439}
1440
1441// Scans contents of [] (not including []'s), and converts to a set.
1442func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
1443 ch := '\x00'
1444 chPrev := '\x00'
1445 inRange := false
1446 firstChar := true
1447 closed := false
1448
1449 var cc *CharSet
1450 if !scanOnly {
1451 cc = &CharSet{}
1452 }
1453
1454 if p.charsRight() > 0 && p.rightChar(0) == '^' {
1455 p.moveRight(1)
1456 if !scanOnly {
1457 cc.negate = true
1458 }
1459 }
1460
1461 for ; p.charsRight() > 0; firstChar = false {
1462 fTranslatedChar := false
1463 ch = p.moveRightGetChar()
1464 if ch == ']' {
1465 if !firstChar {
1466 closed = true
1467 break
1468 } else if p.useOptionE() {
1469 if !scanOnly {
1470 cc.addRanges(NoneClass().ranges)
1471 }
1472 closed = true
1473 break
1474 }
1475
1476 } else if ch == '\\' && p.charsRight() > 0 {
1477 switch ch = p.moveRightGetChar(); ch {
1478 case 'D', 'd':
1479 if !scanOnly {
1480 if inRange {
1481 return nil, p.getErr(ErrBadClassInCharRange, ch)
1482 }
1483 cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw)
1484 }
1485 continue
1486
1487 case 'S', 's':
1488 if !scanOnly {
1489 if inRange {
1490 return nil, p.getErr(ErrBadClassInCharRange, ch)
1491 }
1492 cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')
1493 }
1494 continue
1495
1496 case 'W', 'w':
1497 if !scanOnly {
1498 if inRange {
1499 return nil, p.getErr(ErrBadClassInCharRange, ch)
1500 }
1501
1502 cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W')
1503 }
1504 continue
1505
1506 case 'p', 'P':
1507 if !scanOnly {
1508 if inRange {
1509 return nil, p.getErr(ErrBadClassInCharRange, ch)
1510 }
1511 prop, err := p.parseProperty()
1512 if err != nil {
1513 return nil, err
1514 }
1515 cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
1516 } else {
1517 p.parseProperty()
1518 }
1519
1520 continue
1521
1522 case '-':
1523 if !scanOnly {
1524 cc.addRange(ch, ch)
1525 }
1526 continue
1527
1528 default:
1529 p.moveLeft()
1530 var err error
1531 ch, err = p.scanCharEscape() // non-literal character
1532 if err != nil {
1533 return nil, err
1534 }
1535 fTranslatedChar = true
1536 break // this break will only break out of the switch
1537 }
1538 } else if ch == '[' {
1539 // This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
1540 // It currently doesn't do anything other than skip the whole thing!
1541 if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
1542 savePos := p.textpos()
1543
1544 p.moveRight(1)
1545 negate := false
1546 if p.charsRight() > 1 && p.rightChar(0) == '^' {
1547 negate = true
1548 p.moveRight(1)
1549 }
1550
1551 nm := p.scanCapname() // snag the name
1552 if !scanOnly && p.useRE2() {
1553 // look up the name since these are valid for RE2
1554 // add the group based on the name
1555 if ok := cc.addNamedASCII(nm, negate); !ok {
1556 return nil, p.getErr(ErrInvalidCharRange)
1557 }
1558 }
1559 if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
1560 p.textto(savePos)
1561 } else if p.useRE2() {
1562 // move on
1563 continue
1564 }
1565 }
1566 }
1567
1568 if inRange {
1569 inRange = false
1570 if !scanOnly {
1571 if ch == '[' && !fTranslatedChar && !firstChar {
1572 // We thought we were in a range, but we're actually starting a subtraction.
1573 // In that case, we'll add chPrev to our char class, skip the opening [, and
1574 // scan the new character class recursively.
1575 cc.addChar(chPrev)
1576 sub, err := p.scanCharSet(caseInsensitive, false)
1577 if err != nil {
1578 return nil, err
1579 }
1580 cc.addSubtraction(sub)
1581
1582 if p.charsRight() > 0 && p.rightChar(0) != ']' {
1583 return nil, p.getErr(ErrSubtractionMustBeLast)
1584 }
1585 } else {
1586 // a regular range, like a-z
1587 if chPrev > ch {
1588 return nil, p.getErr(ErrReversedCharRange, chPrev, ch)
1589 }
1590 cc.addRange(chPrev, ch)
1591 }
1592 }
1593 } else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
1594 // this could be the start of a range
1595 chPrev = ch
1596 inRange = true
1597 p.moveRight(1)
1598 } else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
1599 // we aren't in a range, and now there is a subtraction. Usually this happens
1600 // only when a subtraction follows a range, like [a-z-[b]]
1601 if !scanOnly {
1602 p.moveRight(1)
1603 sub, err := p.scanCharSet(caseInsensitive, false)
1604 if err != nil {
1605 return nil, err
1606 }
1607 cc.addSubtraction(sub)
1608
1609 if p.charsRight() > 0 && p.rightChar(0) != ']' {
1610 return nil, p.getErr(ErrSubtractionMustBeLast)
1611 }
1612 } else {
1613 p.moveRight(1)
1614 p.scanCharSet(caseInsensitive, true)
1615 }
1616 } else {
1617 if !scanOnly {
1618 cc.addRange(ch, ch)
1619 }
1620 }
1621 }
1622
1623 if !closed {
1624 return nil, p.getErr(ErrUnterminatedBracket)
1625 }
1626
1627 if !scanOnly && caseInsensitive {
1628 cc.addLowercase()
1629 }
1630
1631 return cc, nil
1632}
1633
1634// Scans any number of decimal digits (pegs value at 2^31-1 if too large)
1635func (p *parser) scanDecimal() (int, error) {
1636 i := 0
1637 var d int
1638
1639 for p.charsRight() > 0 {
1640 d = int(p.rightChar(0) - '0')
1641 if d < 0 || d > 9 {
1642 break
1643 }
1644 p.moveRight(1)
1645
1646 if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) {
1647 return 0, p.getErr(ErrCaptureGroupOutOfRange)
1648 }
1649
1650 i *= 10
1651 i += d
1652 }
1653
1654 return int(i), nil
1655}
1656
1657// Returns true for options allowed only at the top level
1658func isOnlyTopOption(option RegexOptions) bool {
1659 return option == RightToLeft || option == ECMAScript || option == RE2
1660}
1661
1662// Scans cimsx-cimsx option string, stops at the first unrecognized char.
1663func (p *parser) scanOptions() {
1664
1665 for off := false; p.charsRight() > 0; p.moveRight(1) {
1666 ch := p.rightChar(0)
1667
1668 if ch == '-' {
1669 off = true
1670 } else if ch == '+' {
1671 off = false
1672 } else {
1673 option := optionFromCode(ch)
1674 if option == 0 || isOnlyTopOption(option) {
1675 return
1676 }
1677
1678 if off {
1679 p.options &= ^option
1680 } else {
1681 p.options |= option
1682 }
1683 }
1684 }
1685}
1686
1687// Scans \ code for escape codes that map to single unicode chars.
1688func (p *parser) scanCharEscape() (r rune, err error) {
1689
1690 ch := p.moveRightGetChar()
1691
1692 if ch >= '0' && ch <= '7' {
1693 p.moveLeft()
1694 return p.scanOctal(), nil
1695 }
1696
1697 pos := p.textpos()
1698
1699 switch ch {
1700 case 'x':
1701 // support for \x{HEX} syntax from Perl and PCRE
1702 if p.charsRight() > 0 && p.rightChar(0) == '{' {
1703 if p.useOptionE() {
1704 return ch, nil
1705 }
1706 p.moveRight(1)
1707 return p.scanHexUntilBrace()
1708 } else {
1709 r, err = p.scanHex(2)
1710 }
1711 case 'u':
1712 // ECMAscript suppot \u{HEX} only if `u` is also set
1713 if p.useOptionE() && p.useOptionU() && p.charsRight() > 0 && p.rightChar(0) == '{' {
1714 p.moveRight(1)
1715 return p.scanHexUntilBrace()
1716 } else {
1717 r, err = p.scanHex(4)
1718 }
1719 case 'a':
1720 return '\u0007', nil
1721 case 'b':
1722 return '\b', nil
1723 case 'e':
1724 return '\u001B', nil
1725 case 'f':
1726 return '\f', nil
1727 case 'n':
1728 return '\n', nil
1729 case 'r':
1730 return '\r', nil
1731 case 't':
1732 return '\t', nil
1733 case 'v':
1734 return '\u000B', nil
1735 case 'c':
1736 r, err = p.scanControl()
1737 default:
1738 if !p.useOptionE() && !p.useRE2() && IsWordChar(ch) {
1739 return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
1740 }
1741 return ch, nil
1742 }
1743 if err != nil && p.useOptionE() {
1744 p.textto(pos)
1745 return ch, nil
1746 }
1747 return
1748}
1749
1750// Grabs and converts an ascii control character
1751func (p *parser) scanControl() (rune, error) {
1752 if p.charsRight() <= 0 {
1753 return 0, p.getErr(ErrMissingControl)
1754 }
1755
1756 ch := p.moveRightGetChar()
1757
1758 // \ca interpreted as \cA
1759
1760 if ch >= 'a' && ch <= 'z' {
1761 ch = (ch - ('a' - 'A'))
1762 }
1763 ch = (ch - '@')
1764 if ch >= 0 && ch < ' ' {
1765 return ch, nil
1766 }
1767
1768 return 0, p.getErr(ErrUnrecognizedControl)
1769
1770}
1771
1772// Scan hex digits until we hit a closing brace.
1773// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors
1774func (p *parser) scanHexUntilBrace() (rune, error) {
1775 // PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit
1776 // so we can enforce that
1777 i := 0
1778 hasContent := false
1779
1780 for p.charsRight() > 0 {
1781 ch := p.moveRightGetChar()
1782 if ch == '}' {
1783 // hit our close brace, we're done here
1784 // prevent \x{}
1785 if !hasContent {
1786 return 0, p.getErr(ErrTooFewHex)
1787 }
1788 return rune(i), nil
1789 }
1790 hasContent = true
1791 // no brace needs to be hex digit
1792 d := hexDigit(ch)
1793 if d < 0 {
1794 return 0, p.getErr(ErrMissingBrace)
1795 }
1796
1797 i *= 0x10
1798 i += d
1799
1800 if i > unicode.MaxRune {
1801 return 0, p.getErr(ErrInvalidHex)
1802 }
1803 }
1804
1805 // we only make it here if we run out of digits without finding the brace
1806 return 0, p.getErr(ErrMissingBrace)
1807}
1808
1809// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
1810func (p *parser) scanHex(c int) (rune, error) {
1811
1812 i := 0
1813
1814 if p.charsRight() >= c {
1815 for c > 0 {
1816 d := hexDigit(p.moveRightGetChar())
1817 if d < 0 {
1818 break
1819 }
1820 i *= 0x10
1821 i += d
1822 c--
1823 }
1824 }
1825
1826 if c > 0 {
1827 return 0, p.getErr(ErrTooFewHex)
1828 }
1829
1830 return rune(i), nil
1831}
1832
1833// Returns n <= 0xF for a hex digit.
1834func hexDigit(ch rune) int {
1835
1836 if d := uint(ch - '0'); d <= 9 {
1837 return int(d)
1838 }
1839
1840 if d := uint(ch - 'a'); d <= 5 {
1841 return int(d + 0xa)
1842 }
1843
1844 if d := uint(ch - 'A'); d <= 5 {
1845 return int(d + 0xa)
1846 }
1847
1848 return -1
1849}
1850
1851// Scans up to three octal digits (stops before exceeding 0377).
1852func (p *parser) scanOctal() rune {
1853 // Consume octal chars only up to 3 digits and value 0377
1854
1855 c := 3
1856
1857 if c > p.charsRight() {
1858 c = p.charsRight()
1859 }
1860
1861 //we know the first char is good because the caller had to check
1862 i := 0
1863 d := int(p.rightChar(0) - '0')
1864 for c > 0 && d <= 7 && d >= 0 {
1865 if i >= 0x20 && p.useOptionE() {
1866 break
1867 }
1868 i *= 8
1869 i += d
1870 c--
1871
1872 p.moveRight(1)
1873 if !p.rightMost() {
1874 d = int(p.rightChar(0) - '0')
1875 }
1876 }
1877
1878 // Octal codes only go up to 255. Any larger and the behavior that Perl follows
1879 // is simply to truncate the high bits.
1880 i &= 0xFF
1881
1882 return rune(i)
1883}
1884
1885// Returns the current parsing position.
1886func (p *parser) textpos() int {
1887 return p.currentPos
1888}
1889
1890// Zaps to a specific parsing position.
1891func (p *parser) textto(pos int) {
1892 p.currentPos = pos
1893}
1894
1895// Returns the char at the right of the current parsing position and advances to the right.
1896func (p *parser) moveRightGetChar() rune {
1897 ch := p.pattern[p.currentPos]
1898 p.currentPos++
1899 return ch
1900}
1901
1902// Moves the current position to the right.
1903func (p *parser) moveRight(i int) {
1904 // default would be 1
1905 p.currentPos += i
1906}
1907
1908// Moves the current parsing position one to the left.
1909func (p *parser) moveLeft() {
1910 p.currentPos--
1911}
1912
1913// Returns the char left of the current parsing position.
1914func (p *parser) charAt(i int) rune {
1915 return p.pattern[i]
1916}
1917
1918// Returns the char i chars right of the current parsing position.
1919func (p *parser) rightChar(i int) rune {
1920 // default would be 0
1921 return p.pattern[p.currentPos+i]
1922}
1923
1924// Number of characters to the right of the current parsing position.
1925func (p *parser) charsRight() int {
1926 return len(p.pattern) - p.currentPos
1927}
1928
1929func (p *parser) rightMost() bool {
1930 return p.currentPos == len(p.pattern)
1931}
1932
1933// Looks up the slot number for a given name
1934func (p *parser) captureSlotFromName(capname string) int {
1935 return p.capnames[capname]
1936}
1937
1938// True if the capture slot was noted
1939func (p *parser) isCaptureSlot(i int) bool {
1940 if p.caps != nil {
1941 _, ok := p.caps[i]
1942 return ok
1943 }
1944
1945 return (i >= 0 && i < p.capsize)
1946}
1947
1948// Looks up the slot number for a given name
1949func (p *parser) isCaptureName(capname string) bool {
1950 if p.capnames == nil {
1951 return false
1952 }
1953
1954 _, ok := p.capnames[capname]
1955 return ok
1956}
1957
1958// option shortcuts
1959
1960// True if N option disabling '(' autocapture is on.
1961func (p *parser) useOptionN() bool {
1962 return (p.options & ExplicitCapture) != 0
1963}
1964
1965// True if I option enabling case-insensitivity is on.
1966func (p *parser) useOptionI() bool {
1967 return (p.options & IgnoreCase) != 0
1968}
1969
1970// True if M option altering meaning of $ and ^ is on.
1971func (p *parser) useOptionM() bool {
1972 return (p.options & Multiline) != 0
1973}
1974
1975// True if S option altering meaning of . is on.
1976func (p *parser) useOptionS() bool {
1977 return (p.options & Singleline) != 0
1978}
1979
1980// True if X option enabling whitespace/comment mode is on.
1981func (p *parser) useOptionX() bool {
1982 return (p.options & IgnorePatternWhitespace) != 0
1983}
1984
1985// True if E option enabling ECMAScript behavior on.
1986func (p *parser) useOptionE() bool {
1987 return (p.options & ECMAScript) != 0
1988}
1989
1990// true to use RE2 compatibility parsing behavior.
1991func (p *parser) useRE2() bool {
1992 return (p.options & RE2) != 0
1993}
1994
1995// True if U option enabling ECMAScript's Unicode behavior on.
1996func (p *parser) useOptionU() bool {
1997 return (p.options & Unicode) != 0
1998}
1999
2000// True if options stack is empty.
2001func (p *parser) emptyOptionsStack() bool {
2002 return len(p.optionsStack) == 0
2003}
2004
2005// Finish the current quantifiable (when a quantifier is not found or is not possible)
2006func (p *parser) addConcatenate() {
2007 // The first (| inside a Testgroup group goes directly to the group
2008 p.concatenation.addChild(p.unit)
2009 p.unit = nil
2010}
2011
2012// Finish the current quantifiable (when a quantifier is found)
2013func (p *parser) addConcatenate3(lazy bool, min, max int) {
2014 p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
2015 p.unit = nil
2016}
2017
2018// Sets the current unit to a single char node
2019func (p *parser) addUnitOne(ch rune) {
2020 if p.useOptionI() {
2021 ch = unicode.ToLower(ch)
2022 }
2023
2024 p.unit = newRegexNodeCh(ntOne, p.options, ch)
2025}
2026
2027// Sets the current unit to a single inverse-char node
2028func (p *parser) addUnitNotone(ch rune) {
2029 if p.useOptionI() {
2030 ch = unicode.ToLower(ch)
2031 }
2032
2033 p.unit = newRegexNodeCh(ntNotone, p.options, ch)
2034}
2035
2036// Sets the current unit to a single set node
2037func (p *parser) addUnitSet(set *CharSet) {
2038 p.unit = newRegexNodeSet(ntSet, p.options, set)
2039}
2040
2041// Sets the current unit to a subtree
2042func (p *parser) addUnitNode(node *regexNode) {
2043 p.unit = node
2044}
2045
2046// Sets the current unit to an assertion of the specified type
2047func (p *parser) addUnitType(t nodeType) {
2048 p.unit = newRegexNode(t, p.options)
2049}
2050
2051// Finish the current group (in response to a ')' or end)
2052func (p *parser) addGroup() error {
2053 if p.group.t == ntTestgroup || p.group.t == ntTestref {
2054 p.group.addChild(p.concatenation.reverseLeft())
2055 if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 {
2056 return p.getErr(ErrTooManyAlternates)
2057 }
2058 } else {
2059 p.alternation.addChild(p.concatenation.reverseLeft())
2060 p.group.addChild(p.alternation)
2061 }
2062
2063 p.unit = p.group
2064 return nil
2065}
2066
2067// Pops the option stack, but keeps the current options unchanged.
2068func (p *parser) popKeepOptions() {
2069 lastIdx := len(p.optionsStack) - 1
2070 p.optionsStack = p.optionsStack[:lastIdx]
2071}
2072
2073// Recalls options from the stack.
2074func (p *parser) popOptions() {
2075 lastIdx := len(p.optionsStack) - 1
2076 // get the last item on the stack and then remove it by reslicing
2077 p.options = p.optionsStack[lastIdx]
2078 p.optionsStack = p.optionsStack[:lastIdx]
2079}
2080
2081// Saves options on a stack.
2082func (p *parser) pushOptions() {
2083 p.optionsStack = append(p.optionsStack, p.options)
2084}
2085
2086// Add a string to the last concatenate.
2087func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
2088 var node *regexNode
2089
2090 if cch == 0 {
2091 return
2092 }
2093
2094 if cch > 1 {
2095 str := make([]rune, cch)
2096 copy(str, p.pattern[pos:pos+cch])
2097
2098 if p.useOptionI() && !isReplacement {
2099 // We do the ToLower character by character for consistency. With surrogate chars, doing
2100 // a ToLower on the entire string could actually change the surrogate pair. This is more correct
2101 // linguistically, but since Regex doesn't support surrogates, it's more important to be
2102 // consistent.
2103 for i := 0; i < len(str); i++ {
2104 str[i] = unicode.ToLower(str[i])
2105 }
2106 }
2107
2108 node = newRegexNodeStr(ntMulti, p.options, str)
2109 } else {
2110 ch := p.charAt(pos)
2111
2112 if p.useOptionI() && !isReplacement {
2113 ch = unicode.ToLower(ch)
2114 }
2115
2116 node = newRegexNodeCh(ntOne, p.options, ch)
2117 }
2118
2119 p.concatenation.addChild(node)
2120}
2121
2122// Push the parser state (in response to an open paren)
2123func (p *parser) pushGroup() {
2124 p.group.next = p.stack
2125 p.alternation.next = p.group
2126 p.concatenation.next = p.alternation
2127 p.stack = p.concatenation
2128}
2129
2130// Remember the pushed state (in response to a ')')
2131func (p *parser) popGroup() error {
2132 p.concatenation = p.stack
2133 p.alternation = p.concatenation.next
2134 p.group = p.alternation.next
2135 p.stack = p.group.next
2136
2137 // The first () inside a Testgroup group goes directly to the group
2138 if p.group.t == ntTestgroup && len(p.group.children) == 0 {
2139 if p.unit == nil {
2140 return p.getErr(ErrConditionalExpression)
2141 }
2142
2143 p.group.addChild(p.unit)
2144 p.unit = nil
2145 }
2146 return nil
2147}
2148
2149// True if the group stack is empty.
2150func (p *parser) emptyStack() bool {
2151 return p.stack == nil
2152}
2153
2154// Start a new round for the parser state (in response to an open paren or string start)
2155func (p *parser) startGroup(openGroup *regexNode) {
2156 p.group = openGroup
2157 p.alternation = newRegexNode(ntAlternate, p.options)
2158 p.concatenation = newRegexNode(ntConcatenate, p.options)
2159}
2160
2161// Finish the current concatenation (in response to a |)
2162func (p *parser) addAlternate() {
2163 // The | parts inside a Testgroup group go directly to the group
2164
2165 if p.group.t == ntTestgroup || p.group.t == ntTestref {
2166 p.group.addChild(p.concatenation.reverseLeft())
2167 } else {
2168 p.alternation.addChild(p.concatenation.reverseLeft())
2169 }
2170
2171 p.concatenation = newRegexNode(ntConcatenate, p.options)
2172}
2173
2174// For categorizing ascii characters.
2175
2176const (
2177 Q byte = 5 // quantifier
2178 S = 4 // ordinary stopper
2179 Z = 3 // ScanBlank stopper
2180 X = 2 // whitespace
2181 E = 1 // should be escaped
2182)
2183
2184var _category = []byte{
2185 //01 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
2186 0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2187 // ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
2188 X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
2189 //@A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
2190 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
2191 //'a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
2192 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
2193}
2194
2195func isSpace(ch rune) bool {
2196 return (ch <= ' ' && _category[ch] == X)
2197}
2198
2199// Returns true for those characters that terminate a string of ordinary chars.
2200func isSpecial(ch rune) bool {
2201 return (ch <= '|' && _category[ch] >= S)
2202}
2203
2204// Returns true for those characters that terminate a string of ordinary chars.
2205func isStopperX(ch rune) bool {
2206 return (ch <= '|' && _category[ch] >= X)
2207}
2208
2209// Returns true for those characters that begin a quantifier.
2210func isQuantifier(ch rune) bool {
2211 return (ch <= '{' && _category[ch] >= Q)
2212}
2213
2214func (p *parser) isTrueQuantifier() bool {
2215 nChars := p.charsRight()
2216 if nChars == 0 {
2217 return false
2218 }
2219
2220 startpos := p.textpos()
2221 ch := p.charAt(startpos)
2222 if ch != '{' {
2223 return ch <= '{' && _category[ch] >= Q
2224 }
2225
2226 //UGLY: this is ugly -- the original code was ugly too
2227 pos := startpos
2228 for {
2229 nChars--
2230 if nChars <= 0 {
2231 break
2232 }
2233 pos++
2234 ch = p.charAt(pos)
2235 if ch < '0' || ch > '9' {
2236 break
2237 }
2238 }
2239
2240 if nChars == 0 || pos-startpos == 1 {
2241 return false
2242 }
2243 if ch == '}' {
2244 return true
2245 }
2246 if ch != ',' {
2247 return false
2248 }
2249 for {
2250 nChars--
2251 if nChars <= 0 {
2252 break
2253 }
2254 pos++
2255 ch = p.charAt(pos)
2256 if ch < '0' || ch > '9' {
2257 break
2258 }
2259 }
2260
2261 return nChars > 0 && ch == '}'
2262}