1package syntax
2
3import (
4 "fmt"
5 "math"
6 "os"
7 "sort"
8 "strconv"
9 "unicode"
10)
11
12type RegexOptions int32
13
14const (
15 IgnoreCase RegexOptions = 0x0001 // "i"
16 Multiline = 0x0002 // "m"
17 ExplicitCapture = 0x0004 // "n"
18 Compiled = 0x0008 // "c"
19 Singleline = 0x0010 // "s"
20 IgnorePatternWhitespace = 0x0020 // "x"
21 RightToLeft = 0x0040 // "r"
22 Debug = 0x0080 // "d"
23 ECMAScript = 0x0100 // "e"
24 RE2 = 0x0200 // RE2 compat mode
25)
26
27func optionFromCode(ch rune) RegexOptions {
28 // case-insensitive
29 switch ch {
30 case 'i', 'I':
31 return IgnoreCase
32 case 'r', 'R':
33 return RightToLeft
34 case 'm', 'M':
35 return Multiline
36 case 'n', 'N':
37 return ExplicitCapture
38 case 's', 'S':
39 return Singleline
40 case 'x', 'X':
41 return IgnorePatternWhitespace
42 case 'd', 'D':
43 return Debug
44 case 'e', 'E':
45 return ECMAScript
46 default:
47 return 0
48 }
49}
50
51// An Error describes a failure to parse a regular expression
52// and gives the offending expression.
53type Error struct {
54 Code ErrorCode
55 Expr string
56 Args []interface{}
57}
58
59func (e *Error) Error() string {
60 if len(e.Args) == 0 {
61 return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`"
62 }
63 return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`"
64}
65
66// An ErrorCode describes a failure to parse a regular expression.
67type ErrorCode string
68
69const (
70 // internal issue
71 ErrInternalError ErrorCode = "regexp/syntax: internal error"
72 // Parser errors
73 ErrUnterminatedComment = "unterminated comment"
74 ErrInvalidCharRange = "invalid character class range"
75 ErrInvalidRepeatSize = "invalid repeat count"
76 ErrInvalidUTF8 = "invalid UTF-8"
77 ErrCaptureGroupOutOfRange = "capture group number out of range"
78 ErrUnexpectedParen = "unexpected )"
79 ErrMissingParen = "missing closing )"
80 ErrMissingBrace = "missing closing }"
81 ErrInvalidRepeatOp = "invalid nested repetition operator"
82 ErrMissingRepeatArgument = "missing argument to repetition operator"
83 ErrConditionalExpression = "illegal conditional (?(...)) expression"
84 ErrTooManyAlternates = "too many | in (?()|)"
85 ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v"
86 ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator"
87 ErrCapNumNotZero = "capture number cannot be zero"
88 ErrUndefinedBackRef = "reference to undefined group number %v"
89 ErrUndefinedNameRef = "reference to undefined group name %v"
90 ErrAlternationCantCapture = "alternation conditions do not capture and cannot be named"
91 ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
92 ErrMalformedReference = "(?(%v) ) malformed"
93 ErrUndefinedReference = "(?(%v) ) reference to undefined group"
94 ErrIllegalEndEscape = "illegal \\ at end of pattern"
95 ErrMalformedSlashP = "malformed \\p{X} character escape"
96 ErrIncompleteSlashP = "incomplete \\p{X} character escape"
97 ErrUnknownSlashP = "unknown unicode category, script, or property '%v'"
98 ErrUnrecognizedEscape = "unrecognized escape sequence \\%v"
99 ErrMissingControl = "missing control character"
100 ErrUnrecognizedControl = "unrecognized control character"
101 ErrTooFewHex = "insufficient hexadecimal digits"
102 ErrInvalidHex = "hex values may not be larger than 0x10FFFF"
103 ErrMalformedNameRef = "malformed \\k<...> named back reference"
104 ErrBadClassInCharRange = "cannot include class \\%v in character range"
105 ErrUnterminatedBracket = "unterminated [] set"
106 ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class"
107 ErrReversedCharRange = "[x-y] range in reverse order"
108)
109
110func (e ErrorCode) String() string {
111 return string(e)
112}
113
114type parser struct {
115 stack *regexNode
116 group *regexNode
117 alternation *regexNode
118 concatenation *regexNode
119 unit *regexNode
120
121 patternRaw string
122 pattern []rune
123
124 currentPos int
125 specialCase *unicode.SpecialCase
126
127 autocap int
128 capcount int
129 captop int
130 capsize int
131
132 caps map[int]int
133 capnames map[string]int
134
135 capnumlist []int
136 capnamelist []string
137
138 options RegexOptions
139 optionsStack []RegexOptions
140 ignoreNextParen bool
141}
142
143const (
144 maxValueDiv10 int = math.MaxInt32 / 10
145 maxValueMod10 = math.MaxInt32 % 10
146)
147
148// Parse converts a regex string into a parse tree
149func Parse(re string, op RegexOptions) (*RegexTree, error) {
150 p := parser{
151 options: op,
152 caps: make(map[int]int),
153 }
154 p.setPattern(re)
155
156 if err := p.countCaptures(); err != nil {
157 return nil, err
158 }
159
160 p.reset(op)
161 root, err := p.scanRegex()
162
163 if err != nil {
164 return nil, err
165 }
166 tree := &RegexTree{
167 root: root,
168 caps: p.caps,
169 capnumlist: p.capnumlist,
170 captop: p.captop,
171 Capnames: p.capnames,
172 Caplist: p.capnamelist,
173 options: op,
174 }
175
176 if tree.options&Debug > 0 {
177 os.Stdout.WriteString(tree.Dump())
178 }
179
180 return tree, nil
181}
182
183func (p *parser) setPattern(pattern string) {
184 p.patternRaw = pattern
185 p.pattern = make([]rune, 0, len(pattern))
186
187 //populate our rune array to handle utf8 encoding
188 for _, r := range pattern {
189 p.pattern = append(p.pattern, r)
190 }
191}
192func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
193 return &Error{Code: code, Expr: p.patternRaw, Args: args}
194}
195
196func (p *parser) noteCaptureSlot(i, pos int) {
197 if _, ok := p.caps[i]; !ok {
198 // the rhs of the hashtable isn't used in the parser
199 p.caps[i] = pos
200 p.capcount++
201
202 if p.captop <= i {
203 if i == math.MaxInt32 {
204 p.captop = i
205 } else {
206 p.captop = i + 1
207 }
208 }
209 }
210}
211
212func (p *parser) noteCaptureName(name string, pos int) {
213 if p.capnames == nil {
214 p.capnames = make(map[string]int)
215 }
216
217 if _, ok := p.capnames[name]; !ok {
218 p.capnames[name] = pos
219 p.capnamelist = append(p.capnamelist, name)
220 }
221}
222
223func (p *parser) assignNameSlots() {
224 if p.capnames != nil {
225 for _, name := range p.capnamelist {
226 for p.isCaptureSlot(p.autocap) {
227 p.autocap++
228 }
229 pos := p.capnames[name]
230 p.capnames[name] = p.autocap
231 p.noteCaptureSlot(p.autocap, pos)
232
233 p.autocap++
234 }
235 }
236
237 // if the caps array has at least one gap, construct the list of used slots
238 if p.capcount < p.captop {
239 p.capnumlist = make([]int, p.capcount)
240 i := 0
241
242 for k := range p.caps {
243 p.capnumlist[i] = k
244 i++
245 }
246
247 sort.Ints(p.capnumlist)
248 }
249
250 // merge capsnumlist into capnamelist
251 if p.capnames != nil || p.capnumlist != nil {
252 var oldcapnamelist []string
253 var next int
254 var k int
255
256 if p.capnames == nil {
257 oldcapnamelist = nil
258 p.capnames = make(map[string]int)
259 p.capnamelist = []string{}
260 next = -1
261 } else {
262 oldcapnamelist = p.capnamelist
263 p.capnamelist = []string{}
264 next = p.capnames[oldcapnamelist[0]]
265 }
266
267 for i := 0; i < p.capcount; i++ {
268 j := i
269 if p.capnumlist != nil {
270 j = p.capnumlist[i]
271 }
272
273 if next == j {
274 p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
275 k++
276
277 if k == len(oldcapnamelist) {
278 next = -1
279 } else {
280 next = p.capnames[oldcapnamelist[k]]
281 }
282
283 } else {
284 //feature: culture?
285 str := strconv.Itoa(j)
286 p.capnamelist = append(p.capnamelist, str)
287 p.capnames[str] = j
288 }
289 }
290 }
291}
292
293func (p *parser) consumeAutocap() int {
294 r := p.autocap
295 p.autocap++
296 return r
297}
298
299// CountCaptures is a prescanner for deducing the slots used for
300// captures by doing a partial tokenization of the pattern.
301func (p *parser) countCaptures() error {
302 var ch rune
303
304 p.noteCaptureSlot(0, 0)
305
306 p.autocap = 1
307
308 for p.charsRight() > 0 {
309 pos := p.textpos()
310 ch = p.moveRightGetChar()
311 switch ch {
312 case '\\':
313 if p.charsRight() > 0 {
314 p.scanBackslash(true)
315 }
316
317 case '#':
318 if p.useOptionX() {
319 p.moveLeft()
320 p.scanBlank()
321 }
322
323 case '[':
324 p.scanCharSet(false, true)
325
326 case ')':
327 if !p.emptyOptionsStack() {
328 p.popOptions()
329 }
330
331 case '(':
332 if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
333 p.moveLeft()
334 p.scanBlank()
335 } else {
336 p.pushOptions()
337 if p.charsRight() > 0 && p.rightChar(0) == '?' {
338 // we have (?...
339 p.moveRight(1)
340
341 if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
342 // named group: (?<... or (?'...
343
344 p.moveRight(1)
345 ch = p.rightChar(0)
346
347 if ch != '0' && IsWordChar(ch) {
348 if ch >= '1' && ch <= '9' {
349 dec, err := p.scanDecimal()
350 if err != nil {
351 return err
352 }
353 p.noteCaptureSlot(dec, pos)
354 } else {
355 p.noteCaptureName(p.scanCapname(), pos)
356 }
357 }
358 } else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
359 // RE2-compat (?P<)
360 p.moveRight(2)
361 ch = p.rightChar(0)
362 if IsWordChar(ch) {
363 p.noteCaptureName(p.scanCapname(), pos)
364 }
365
366 } else {
367 // (?...
368
369 // get the options if it's an option construct (?cimsx-cimsx...)
370 p.scanOptions()
371
372 if p.charsRight() > 0 {
373 if p.rightChar(0) == ')' {
374 // (?cimsx-cimsx)
375 p.moveRight(1)
376 p.popKeepOptions()
377 } else if p.rightChar(0) == '(' {
378 // alternation construct: (?(foo)yes|no)
379 // ignore the next paren so we don't capture the condition
380 p.ignoreNextParen = true
381
382 // break from here so we don't reset ignoreNextParen
383 continue
384 }
385 }
386 }
387 } else {
388 if !p.useOptionN() && !p.ignoreNextParen {
389 p.noteCaptureSlot(p.consumeAutocap(), pos)
390 }
391 }
392 }
393
394 p.ignoreNextParen = false
395
396 }
397 }
398
399 p.assignNameSlots()
400 return nil
401}
402
403func (p *parser) reset(topopts RegexOptions) {
404 p.currentPos = 0
405 p.autocap = 1
406 p.ignoreNextParen = false
407
408 if len(p.optionsStack) > 0 {
409 p.optionsStack = p.optionsStack[:0]
410 }
411
412 p.options = topopts
413 p.stack = nil
414}
415
416func (p *parser) scanRegex() (*regexNode, error) {
417 ch := '@' // nonspecial ch, means at beginning
418 isQuant := false
419
420 p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))
421
422 for p.charsRight() > 0 {
423 wasPrevQuantifier := isQuant
424 isQuant = false
425
426 if err := p.scanBlank(); err != nil {
427 return nil, err
428 }
429
430 startpos := p.textpos()
431
432 // move past all of the normal characters. We'll stop when we hit some kind of control character,
433 // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
434 if p.useOptionX() {
435 for p.charsRight() > 0 {
436 ch = p.rightChar(0)
437 //UGLY: clean up, this is ugly
438 if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
439 break
440 }
441 p.moveRight(1)
442 }
443 } else {
444 for p.charsRight() > 0 {
445 ch = p.rightChar(0)
446 if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
447 break
448 }
449 p.moveRight(1)
450 }
451 }
452
453 endpos := p.textpos()
454
455 p.scanBlank()
456
457 if p.charsRight() == 0 {
458 ch = '!' // nonspecial, means at end
459 } else if ch = p.rightChar(0); isSpecial(ch) {
460 isQuant = isQuantifier(ch)
461 p.moveRight(1)
462 } else {
463 ch = ' ' // nonspecial, means at ordinary char
464 }
465
466 if startpos < endpos {
467 cchUnquantified := endpos - startpos
468 if isQuant {
469 cchUnquantified--
470 }
471 wasPrevQuantifier = false
472
473 if cchUnquantified > 0 {
474 p.addToConcatenate(startpos, cchUnquantified, false)
475 }
476
477 if isQuant {
478 p.addUnitOne(p.charAt(endpos - 1))
479 }
480 }
481
482 switch ch {
483 case '!':
484 goto BreakOuterScan
485
486 case ' ':
487 goto ContinueOuterScan
488
489 case '[':
490 cc, err := p.scanCharSet(p.useOptionI(), false)
491 if err != nil {
492 return nil, err
493 }
494 p.addUnitSet(cc)
495
496 case '(':
497 p.pushOptions()
498
499 if grouper, err := p.scanGroupOpen(); err != nil {
500 return nil, err
501 } else if grouper == nil {
502 p.popKeepOptions()
503 } else {
504 p.pushGroup()
505 p.startGroup(grouper)
506 }
507
508 continue
509
510 case '|':
511 p.addAlternate()
512 goto ContinueOuterScan
513
514 case ')':
515 if p.emptyStack() {
516 return nil, p.getErr(ErrUnexpectedParen)
517 }
518
519 if err := p.addGroup(); err != nil {
520 return nil, err
521 }
522 if err := p.popGroup(); err != nil {
523 return nil, err
524 }
525 p.popOptions()
526
527 if p.unit == nil {
528 goto ContinueOuterScan
529 }
530
531 case '\\':
532 n, err := p.scanBackslash(false)
533 if err != nil {
534 return nil, err
535 }
536 p.addUnitNode(n)
537
538 case '^':
539 if p.useOptionM() {
540 p.addUnitType(ntBol)
541 } else {
542 p.addUnitType(ntBeginning)
543 }
544
545 case '$':
546 if p.useOptionM() {
547 p.addUnitType(ntEol)
548 } else {
549 p.addUnitType(ntEndZ)
550 }
551
552 case '.':
553 if p.useOptionE() {
554 p.addUnitSet(ECMAAnyClass())
555 } else if p.useOptionS() {
556 p.addUnitSet(AnyClass())
557 } else {
558 p.addUnitNotone('\n')
559 }
560
561 case '{', '*', '+', '?':
562 if p.unit == nil {
563 if wasPrevQuantifier {
564 return nil, p.getErr(ErrInvalidRepeatOp)
565 } else {
566 return nil, p.getErr(ErrMissingRepeatArgument)
567 }
568 }
569 p.moveLeft()
570
571 default:
572 return nil, p.getErr(ErrInternalError)
573 }
574
575 if err := p.scanBlank(); err != nil {
576 return nil, err
577 }
578
579 if p.charsRight() > 0 {
580 isQuant = p.isTrueQuantifier()
581 }
582 if p.charsRight() == 0 || !isQuant {
583 //maintain odd C# assignment order -- not sure if required, could clean up?
584 p.addConcatenate()
585 goto ContinueOuterScan
586 }
587
588 ch = p.moveRightGetChar()
589
590 // Handle quantifiers
591 for p.unit != nil {
592 var min, max int
593 var lazy bool
594
595 switch ch {
596 case '*':
597 min = 0
598 max = math.MaxInt32
599
600 case '?':
601 min = 0
602 max = 1
603
604 case '+':
605 min = 1
606 max = math.MaxInt32
607
608 case '{':
609 {
610 var err error
611 startpos = p.textpos()
612 if min, err = p.scanDecimal(); err != nil {
613 return nil, err
614 }
615 max = min
616 if startpos < p.textpos() {
617 if p.charsRight() > 0 && p.rightChar(0) == ',' {
618 p.moveRight(1)
619 if p.charsRight() == 0 || p.rightChar(0) == '}' {
620 max = math.MaxInt32
621 } else {
622 if max, err = p.scanDecimal(); err != nil {
623 return nil, err
624 }
625 }
626 }
627 }
628
629 if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
630 p.addConcatenate()
631 p.textto(startpos - 1)
632 goto ContinueOuterScan
633 }
634 }
635
636 default:
637 return nil, p.getErr(ErrInternalError)
638 }
639
640 if err := p.scanBlank(); err != nil {
641 return nil, err
642 }
643
644 if p.charsRight() == 0 || p.rightChar(0) != '?' {
645 lazy = false
646 } else {
647 p.moveRight(1)
648 lazy = true
649 }
650
651 if min > max {
652 return nil, p.getErr(ErrInvalidRepeatSize)
653 }
654
655 p.addConcatenate3(lazy, min, max)
656 }
657
658 ContinueOuterScan:
659 }
660
661BreakOuterScan:
662 ;
663
664 if !p.emptyStack() {
665 return nil, p.getErr(ErrMissingParen)
666 }
667
668 if err := p.addGroup(); err != nil {
669 return nil, err
670 }
671
672 return p.unit, nil
673
674}
675
676/*
677 * Simple parsing for replacement patterns
678 */
679func (p *parser) scanReplacement() (*regexNode, error) {
680 var c, startpos int
681
682 p.concatenation = newRegexNode(ntConcatenate, p.options)
683
684 for {
685 c = p.charsRight()
686 if c == 0 {
687 break
688 }
689
690 startpos = p.textpos()
691
692 for c > 0 && p.rightChar(0) != '$' {
693 p.moveRight(1)
694 c--
695 }
696
697 p.addToConcatenate(startpos, p.textpos()-startpos, true)
698
699 if c > 0 {
700 if p.moveRightGetChar() == '$' {
701 n, err := p.scanDollar()
702 if err != nil {
703 return nil, err
704 }
705 p.addUnitNode(n)
706 }
707 p.addConcatenate()
708 }
709 }
710
711 return p.concatenation, nil
712}
713
714/*
715 * Scans $ patterns recognized within replacement patterns
716 */
717func (p *parser) scanDollar() (*regexNode, error) {
718 if p.charsRight() == 0 {
719 return newRegexNodeCh(ntOne, p.options, '$'), nil
720 }
721
722 ch := p.rightChar(0)
723 angled := false
724 backpos := p.textpos()
725 lastEndPos := backpos
726
727 // Note angle
728
729 if ch == '{' && p.charsRight() > 1 {
730 angled = true
731 p.moveRight(1)
732 ch = p.rightChar(0)
733 }
734
735 // Try to parse backreference: \1 or \{1} or \{cap}
736
737 if ch >= '0' && ch <= '9' {
738 if !angled && p.useOptionE() {
739 capnum := -1
740 newcapnum := int(ch - '0')
741 p.moveRight(1)
742 if p.isCaptureSlot(newcapnum) {
743 capnum = newcapnum
744 lastEndPos = p.textpos()
745 }
746
747 for p.charsRight() > 0 {
748 ch = p.rightChar(0)
749 if ch < '0' || ch > '9' {
750 break
751 }
752 digit := int(ch - '0')
753 if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
754 return nil, p.getErr(ErrCaptureGroupOutOfRange)
755 }
756
757 newcapnum = newcapnum*10 + digit
758
759 p.moveRight(1)
760 if p.isCaptureSlot(newcapnum) {
761 capnum = newcapnum
762 lastEndPos = p.textpos()
763 }
764 }
765 p.textto(lastEndPos)
766 if capnum >= 0 {
767 return newRegexNodeM(ntRef, p.options, capnum), nil
768 }
769 } else {
770 capnum, err := p.scanDecimal()
771 if err != nil {
772 return nil, err
773 }
774 if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
775 if p.isCaptureSlot(capnum) {
776 return newRegexNodeM(ntRef, p.options, capnum), nil
777 }
778 }
779 }
780 } else if angled && IsWordChar(ch) {
781 capname := p.scanCapname()
782
783 if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
784 if p.isCaptureName(capname) {
785 return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
786 }
787 }
788 } else if !angled {
789 capnum := 1
790
791 switch ch {
792 case '$':
793 p.moveRight(1)
794 return newRegexNodeCh(ntOne, p.options, '$'), nil
795 case '&':
796 capnum = 0
797 case '`':
798 capnum = replaceLeftPortion
799 case '\'':
800 capnum = replaceRightPortion
801 case '+':
802 capnum = replaceLastGroup
803 case '_':
804 capnum = replaceWholeString
805 }
806
807 if capnum != 1 {
808 p.moveRight(1)
809 return newRegexNodeM(ntRef, p.options, capnum), nil
810 }
811 }
812
813 // unrecognized $: literalize
814
815 p.textto(backpos)
816 return newRegexNodeCh(ntOne, p.options, '$'), nil
817}
818
819// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
820// a RegexNode for the type of group scanned, or nil if the group
821// simply changed options (?cimsx-cimsx) or was a comment (#...).
822func (p *parser) scanGroupOpen() (*regexNode, error) {
823 var ch rune
824 var nt nodeType
825 var err error
826 close := '>'
827 start := p.textpos()
828
829 // just return a RegexNode if we have:
830 // 1. "(" followed by nothing
831 // 2. "(x" where x != ?
832 // 3. "(?)"
833 if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
834 if p.useOptionN() || p.ignoreNextParen {
835 p.ignoreNextParen = false
836 return newRegexNode(ntGroup, p.options), nil
837 }
838 return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
839 }
840
841 p.moveRight(1)
842
843 for {
844 if p.charsRight() == 0 {
845 break
846 }
847
848 switch ch = p.moveRightGetChar(); ch {
849 case ':':
850 nt = ntGroup
851
852 case '=':
853 p.options &= ^RightToLeft
854 nt = ntRequire
855
856 case '!':
857 p.options &= ^RightToLeft
858 nt = ntPrevent
859
860 case '>':
861 nt = ntGreedy
862
863 case '\'':
864 close = '\''
865 fallthrough
866
867 case '<':
868 if p.charsRight() == 0 {
869 goto BreakRecognize
870 }
871
872 switch ch = p.moveRightGetChar(); ch {
873 case '=':
874 if close == '\'' {
875 goto BreakRecognize
876 }
877
878 p.options |= RightToLeft
879 nt = ntRequire
880
881 case '!':
882 if close == '\'' {
883 goto BreakRecognize
884 }
885
886 p.options |= RightToLeft
887 nt = ntPrevent
888
889 default:
890 p.moveLeft()
891 capnum := -1
892 uncapnum := -1
893 proceed := false
894
895 // grab part before -
896
897 if ch >= '0' && ch <= '9' {
898 if capnum, err = p.scanDecimal(); err != nil {
899 return nil, err
900 }
901
902 if !p.isCaptureSlot(capnum) {
903 capnum = -1
904 }
905
906 // check if we have bogus characters after the number
907 if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
908 return nil, p.getErr(ErrInvalidGroupName)
909 }
910 if capnum == 0 {
911 return nil, p.getErr(ErrCapNumNotZero)
912 }
913 } else if IsWordChar(ch) {
914 capname := p.scanCapname()
915
916 if p.isCaptureName(capname) {
917 capnum = p.captureSlotFromName(capname)
918 }
919
920 // check if we have bogus character after the name
921 if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
922 return nil, p.getErr(ErrInvalidGroupName)
923 }
924 } else if ch == '-' {
925 proceed = true
926 } else {
927 // bad group name - starts with something other than a word character and isn't a number
928 return nil, p.getErr(ErrInvalidGroupName)
929 }
930
931 // grab part after - if any
932
933 if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
934 p.moveRight(1)
935
936 //no more chars left, no closing char, etc
937 if p.charsRight() == 0 {
938 return nil, p.getErr(ErrInvalidGroupName)
939 }
940
941 ch = p.rightChar(0)
942 if ch >= '0' && ch <= '9' {
943 if uncapnum, err = p.scanDecimal(); err != nil {
944 return nil, err
945 }
946
947 if !p.isCaptureSlot(uncapnum) {
948 return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
949 }
950
951 // check if we have bogus characters after the number
952 if p.charsRight() > 0 && p.rightChar(0) != close {
953 return nil, p.getErr(ErrInvalidGroupName)
954 }
955 } else if IsWordChar(ch) {
956 uncapname := p.scanCapname()
957
958 if !p.isCaptureName(uncapname) {
959 return nil, p.getErr(ErrUndefinedNameRef, uncapname)
960 }
961 uncapnum = p.captureSlotFromName(uncapname)
962
963 // check if we have bogus character after the name
964 if p.charsRight() > 0 && p.rightChar(0) != close {
965 return nil, p.getErr(ErrInvalidGroupName)
966 }
967 } else {
968 // bad group name - starts with something other than a word character and isn't a number
969 return nil, p.getErr(ErrInvalidGroupName)
970 }
971 }
972
973 // actually make the node
974
975 if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
976 return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
977 }
978 goto BreakRecognize
979 }
980
981 case '(':
982 // alternation construct (?(...) | )
983
984 parenPos := p.textpos()
985 if p.charsRight() > 0 {
986 ch = p.rightChar(0)
987
988 // check if the alternation condition is a backref
989 if ch >= '0' && ch <= '9' {
990 var capnum int
991 if capnum, err = p.scanDecimal(); err != nil {
992 return nil, err
993 }
994 if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
995 if p.isCaptureSlot(capnum) {
996 return newRegexNodeM(ntTestref, p.options, capnum), nil
997 }
998 return nil, p.getErr(ErrUndefinedReference, capnum)
999 }
1000
1001 return nil, p.getErr(ErrMalformedReference, capnum)
1002
1003 } else if IsWordChar(ch) {
1004 capname := p.scanCapname()
1005
1006 if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
1007 return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
1008 }
1009 }
1010 }
1011 // not a backref
1012 nt = ntTestgroup
1013 p.textto(parenPos - 1) // jump to the start of the parentheses
1014 p.ignoreNextParen = true // but make sure we don't try to capture the insides
1015
1016 charsRight := p.charsRight()
1017 if charsRight >= 3 && p.rightChar(1) == '?' {
1018 rightchar2 := p.rightChar(2)
1019 // disallow comments in the condition
1020 if rightchar2 == '#' {
1021 return nil, p.getErr(ErrAlternationCantHaveComment)
1022 }
1023
1024 // disallow named capture group (?<..>..) in the condition
1025 if rightchar2 == '\'' {
1026 return nil, p.getErr(ErrAlternationCantCapture)
1027 }
1028
1029 if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
1030 return nil, p.getErr(ErrAlternationCantCapture)
1031 }
1032 }
1033
1034 case 'P':
1035 if p.useRE2() {
1036 // support for P<name> syntax
1037 if p.charsRight() < 3 {
1038 goto BreakRecognize
1039 }
1040
1041 ch = p.moveRightGetChar()
1042 if ch != '<' {
1043 goto BreakRecognize
1044 }
1045
1046 ch = p.moveRightGetChar()
1047 p.moveLeft()
1048
1049 if IsWordChar(ch) {
1050 capnum := -1
1051 capname := p.scanCapname()
1052
1053 if p.isCaptureName(capname) {
1054 capnum = p.captureSlotFromName(capname)
1055 }
1056
1057 // check if we have bogus character after the name
1058 if p.charsRight() > 0 && p.rightChar(0) != '>' {
1059 return nil, p.getErr(ErrInvalidGroupName)
1060 }
1061
1062 // actually make the node
1063
1064 if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
1065 return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
1066 }
1067 goto BreakRecognize
1068
1069 } else {
1070 // bad group name - starts with something other than a word character and isn't a number
1071 return nil, p.getErr(ErrInvalidGroupName)
1072 }
1073 }
1074 // if we're not using RE2 compat mode then
1075 // we just behave like normal
1076 fallthrough
1077
1078 default:
1079 p.moveLeft()
1080
1081 nt = ntGroup
1082 // disallow options in the children of a testgroup node
1083 if p.group.t != ntTestgroup {
1084 p.scanOptions()
1085 }
1086 if p.charsRight() == 0 {
1087 goto BreakRecognize
1088 }
1089
1090 if ch = p.moveRightGetChar(); ch == ')' {
1091 return nil, nil
1092 }
1093
1094 if ch != ':' {
1095 goto BreakRecognize
1096 }
1097
1098 }
1099
1100 return newRegexNode(nt, p.options), nil
1101 }
1102
1103BreakRecognize:
1104
1105 // break Recognize comes here
1106
1107 return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
1108}
1109
1110// scans backslash specials and basics
1111func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
1112
1113 if p.charsRight() == 0 {
1114 return nil, p.getErr(ErrIllegalEndEscape)
1115 }
1116
1117 switch ch := p.rightChar(0); ch {
1118 case 'b', 'B', 'A', 'G', 'Z', 'z':
1119 p.moveRight(1)
1120 return newRegexNode(p.typeFromCode(ch), p.options), nil
1121
1122 case 'w':
1123 p.moveRight(1)
1124 if p.useOptionE() {
1125 return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
1126 }
1127 return newRegexNodeSet(ntSet, p.options, WordClass()), nil
1128
1129 case 'W':
1130 p.moveRight(1)
1131 if p.useOptionE() {
1132 return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
1133 }
1134 return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
1135
1136 case 's':
1137 p.moveRight(1)
1138 if p.useOptionE() {
1139 return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
1140 }
1141 return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
1142
1143 case 'S':
1144 p.moveRight(1)
1145 if p.useOptionE() {
1146 return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
1147 }
1148 return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
1149
1150 case 'd':
1151 p.moveRight(1)
1152 if p.useOptionE() {
1153 return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
1154 }
1155 return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
1156
1157 case 'D':
1158 p.moveRight(1)
1159 if p.useOptionE() {
1160 return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
1161 }
1162 return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
1163
1164 case 'p', 'P':
1165 p.moveRight(1)
1166 prop, err := p.parseProperty()
1167 if err != nil {
1168 return nil, err
1169 }
1170 cc := &CharSet{}
1171 cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
1172 if p.useOptionI() {
1173 cc.addLowercase()
1174 }
1175
1176 return newRegexNodeSet(ntSet, p.options, cc), nil
1177
1178 default:
1179 return p.scanBasicBackslash(scanOnly)
1180 }
1181}
1182
1183// Scans \-style backreferences and character escapes
1184func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
1185 if p.charsRight() == 0 {
1186 return nil, p.getErr(ErrIllegalEndEscape)
1187 }
1188 angled := false
1189 close := '\x00'
1190
1191 backpos := p.textpos()
1192 ch := p.rightChar(0)
1193
1194 // allow \k<foo> instead of \<foo>, which is now deprecated
1195
1196 if ch == 'k' {
1197 if p.charsRight() >= 2 {
1198 p.moveRight(1)
1199 ch = p.moveRightGetChar()
1200
1201 if ch == '<' || ch == '\'' {
1202 angled = true
1203 if ch == '\'' {
1204 close = '\''
1205 } else {
1206 close = '>'
1207 }
1208 }
1209 }
1210
1211 if !angled || p.charsRight() <= 0 {
1212 return nil, p.getErr(ErrMalformedNameRef)
1213 }
1214
1215 ch = p.rightChar(0)
1216
1217 } else if (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
1218 angled = true
1219 if ch == '\'' {
1220 close = '\''
1221 } else {
1222 close = '>'
1223 }
1224
1225 p.moveRight(1)
1226 ch = p.rightChar(0)
1227 }
1228
1229 // Try to parse backreference: \<1> or \<cap>
1230
1231 if angled && ch >= '0' && ch <= '9' {
1232 capnum, err := p.scanDecimal()
1233 if err != nil {
1234 return nil, err
1235 }
1236
1237 if p.charsRight() > 0 && p.moveRightGetChar() == close {
1238 if p.isCaptureSlot(capnum) {
1239 return newRegexNodeM(ntRef, p.options, capnum), nil
1240 }
1241 return nil, p.getErr(ErrUndefinedBackRef, capnum)
1242 }
1243 } else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
1244 capnum, err := p.scanDecimal()
1245 if err != nil {
1246 return nil, err
1247 }
1248
1249 if scanOnly {
1250 return nil, nil
1251 }
1252
1253 if p.useOptionE() || p.isCaptureSlot(capnum) {
1254 return newRegexNodeM(ntRef, p.options, capnum), nil
1255 }
1256 if capnum <= 9 {
1257 return nil, p.getErr(ErrUndefinedBackRef, capnum)
1258 }
1259
1260 } else if angled && IsWordChar(ch) {
1261 capname := p.scanCapname()
1262
1263 if p.charsRight() > 0 && p.moveRightGetChar() == close {
1264 if p.isCaptureName(capname) {
1265 return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
1266 }
1267 return nil, p.getErr(ErrUndefinedNameRef, capname)
1268 }
1269 }
1270
1271 // Not backreference: must be char code
1272
1273 p.textto(backpos)
1274 ch, err := p.scanCharEscape()
1275 if err != nil {
1276 return nil, err
1277 }
1278
1279 if p.useOptionI() {
1280 ch = unicode.ToLower(ch)
1281 }
1282
1283 return newRegexNodeCh(ntOne, p.options, ch), nil
1284}
1285
1286// Scans X for \p{X} or \P{X}
1287func (p *parser) parseProperty() (string, error) {
1288 if p.charsRight() < 3 {
1289 return "", p.getErr(ErrIncompleteSlashP)
1290 }
1291 ch := p.moveRightGetChar()
1292 if ch != '{' {
1293 return "", p.getErr(ErrMalformedSlashP)
1294 }
1295
1296 startpos := p.textpos()
1297 for p.charsRight() > 0 {
1298 ch = p.moveRightGetChar()
1299 if !(IsWordChar(ch) || ch == '-') {
1300 p.moveLeft()
1301 break
1302 }
1303 }
1304 capname := string(p.pattern[startpos:p.textpos()])
1305
1306 if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
1307 return "", p.getErr(ErrIncompleteSlashP)
1308 }
1309
1310 if !isValidUnicodeCat(capname) {
1311 return "", p.getErr(ErrUnknownSlashP, capname)
1312 }
1313
1314 return capname, nil
1315}
1316
1317// Returns ReNode type for zero-length assertions with a \ code.
1318func (p *parser) typeFromCode(ch rune) nodeType {
1319 switch ch {
1320 case 'b':
1321 if p.useOptionE() {
1322 return ntECMABoundary
1323 }
1324 return ntBoundary
1325 case 'B':
1326 if p.useOptionE() {
1327 return ntNonECMABoundary
1328 }
1329 return ntNonboundary
1330 case 'A':
1331 return ntBeginning
1332 case 'G':
1333 return ntStart
1334 case 'Z':
1335 return ntEndZ
1336 case 'z':
1337 return ntEnd
1338 default:
1339 return ntNothing
1340 }
1341}
1342
1343// Scans whitespace or x-mode comments.
1344func (p *parser) scanBlank() error {
1345 if p.useOptionX() {
1346 for {
1347 for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
1348 p.moveRight(1)
1349 }
1350
1351 if p.charsRight() == 0 {
1352 break
1353 }
1354
1355 if p.rightChar(0) == '#' {
1356 for p.charsRight() > 0 && p.rightChar(0) != '\n' {
1357 p.moveRight(1)
1358 }
1359 } else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
1360 p.rightChar(1) == '?' && p.rightChar(0) == '(' {
1361 for p.charsRight() > 0 && p.rightChar(0) != ')' {
1362 p.moveRight(1)
1363 }
1364 if p.charsRight() == 0 {
1365 return p.getErr(ErrUnterminatedComment)
1366 }
1367 p.moveRight(1)
1368 } else {
1369 break
1370 }
1371 }
1372 } else {
1373 for {
1374 if p.charsRight() < 3 || p.rightChar(2) != '#' ||
1375 p.rightChar(1) != '?' || p.rightChar(0) != '(' {
1376 return nil
1377 }
1378
1379 for p.charsRight() > 0 && p.rightChar(0) != ')' {
1380 p.moveRight(1)
1381 }
1382 if p.charsRight() == 0 {
1383 return p.getErr(ErrUnterminatedComment)
1384 }
1385 p.moveRight(1)
1386 }
1387 }
1388 return nil
1389}
1390
1391func (p *parser) scanCapname() string {
1392 startpos := p.textpos()
1393
1394 for p.charsRight() > 0 {
1395 if !IsWordChar(p.moveRightGetChar()) {
1396 p.moveLeft()
1397 break
1398 }
1399 }
1400
1401 return string(p.pattern[startpos:p.textpos()])
1402}
1403
1404//Scans contents of [] (not including []'s), and converts to a set.
1405func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
1406 ch := '\x00'
1407 chPrev := '\x00'
1408 inRange := false
1409 firstChar := true
1410 closed := false
1411
1412 var cc *CharSet
1413 if !scanOnly {
1414 cc = &CharSet{}
1415 }
1416
1417 if p.charsRight() > 0 && p.rightChar(0) == '^' {
1418 p.moveRight(1)
1419 if !scanOnly {
1420 cc.negate = true
1421 }
1422 }
1423
1424 for ; p.charsRight() > 0; firstChar = false {
1425 fTranslatedChar := false
1426 ch = p.moveRightGetChar()
1427 if ch == ']' {
1428 if !firstChar {
1429 closed = true
1430 break
1431 } else if p.useOptionE() {
1432 if !scanOnly {
1433 cc.addRanges(NoneClass().ranges)
1434 }
1435 closed = true
1436 break
1437 }
1438
1439 } else if ch == '\\' && p.charsRight() > 0 {
1440 switch ch = p.moveRightGetChar(); ch {
1441 case 'D', 'd':
1442 if !scanOnly {
1443 if inRange {
1444 return nil, p.getErr(ErrBadClassInCharRange, ch)
1445 }
1446 cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)
1447 }
1448 continue
1449
1450 case 'S', 's':
1451 if !scanOnly {
1452 if inRange {
1453 return nil, p.getErr(ErrBadClassInCharRange, ch)
1454 }
1455 cc.addSpace(p.useOptionE(), ch == 'S')
1456 }
1457 continue
1458
1459 case 'W', 'w':
1460 if !scanOnly {
1461 if inRange {
1462 return nil, p.getErr(ErrBadClassInCharRange, ch)
1463 }
1464
1465 cc.addWord(p.useOptionE(), ch == 'W')
1466 }
1467 continue
1468
1469 case 'p', 'P':
1470 if !scanOnly {
1471 if inRange {
1472 return nil, p.getErr(ErrBadClassInCharRange, ch)
1473 }
1474 prop, err := p.parseProperty()
1475 if err != nil {
1476 return nil, err
1477 }
1478 cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
1479 } else {
1480 p.parseProperty()
1481 }
1482
1483 continue
1484
1485 case '-':
1486 if !scanOnly {
1487 cc.addRange(ch, ch)
1488 }
1489 continue
1490
1491 default:
1492 p.moveLeft()
1493 var err error
1494 ch, err = p.scanCharEscape() // non-literal character
1495 if err != nil {
1496 return nil, err
1497 }
1498 fTranslatedChar = true
1499 break // this break will only break out of the switch
1500 }
1501 } else if ch == '[' {
1502 // This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
1503 // It currently doesn't do anything other than skip the whole thing!
1504 if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
1505 savePos := p.textpos()
1506
1507 p.moveRight(1)
1508 negate := false
1509 if p.charsRight() > 1 && p.rightChar(0) == '^' {
1510 negate = true
1511 p.moveRight(1)
1512 }
1513
1514 nm := p.scanCapname() // snag the name
1515 if !scanOnly && p.useRE2() {
1516 // look up the name since these are valid for RE2
1517 // add the group based on the name
1518 if ok := cc.addNamedASCII(nm, negate); !ok {
1519 return nil, p.getErr(ErrInvalidCharRange)
1520 }
1521 }
1522 if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
1523 p.textto(savePos)
1524 } else if p.useRE2() {
1525 // move on
1526 continue
1527 }
1528 }
1529 }
1530
1531 if inRange {
1532 inRange = false
1533 if !scanOnly {
1534 if ch == '[' && !fTranslatedChar && !firstChar {
1535 // We thought we were in a range, but we're actually starting a subtraction.
1536 // In that case, we'll add chPrev to our char class, skip the opening [, and
1537 // scan the new character class recursively.
1538 cc.addChar(chPrev)
1539 sub, err := p.scanCharSet(caseInsensitive, false)
1540 if err != nil {
1541 return nil, err
1542 }
1543 cc.addSubtraction(sub)
1544
1545 if p.charsRight() > 0 && p.rightChar(0) != ']' {
1546 return nil, p.getErr(ErrSubtractionMustBeLast)
1547 }
1548 } else {
1549 // a regular range, like a-z
1550 if chPrev > ch {
1551 return nil, p.getErr(ErrReversedCharRange)
1552 }
1553 cc.addRange(chPrev, ch)
1554 }
1555 }
1556 } else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
1557 // this could be the start of a range
1558 chPrev = ch
1559 inRange = true
1560 p.moveRight(1)
1561 } else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
1562 // we aren't in a range, and now there is a subtraction. Usually this happens
1563 // only when a subtraction follows a range, like [a-z-[b]]
1564 if !scanOnly {
1565 p.moveRight(1)
1566 sub, err := p.scanCharSet(caseInsensitive, false)
1567 if err != nil {
1568 return nil, err
1569 }
1570 cc.addSubtraction(sub)
1571
1572 if p.charsRight() > 0 && p.rightChar(0) != ']' {
1573 return nil, p.getErr(ErrSubtractionMustBeLast)
1574 }
1575 } else {
1576 p.moveRight(1)
1577 p.scanCharSet(caseInsensitive, true)
1578 }
1579 } else {
1580 if !scanOnly {
1581 cc.addRange(ch, ch)
1582 }
1583 }
1584 }
1585
1586 if !closed {
1587 return nil, p.getErr(ErrUnterminatedBracket)
1588 }
1589
1590 if !scanOnly && caseInsensitive {
1591 cc.addLowercase()
1592 }
1593
1594 return cc, nil
1595}
1596
1597// Scans any number of decimal digits (pegs value at 2^31-1 if too large)
1598func (p *parser) scanDecimal() (int, error) {
1599 i := 0
1600 var d int
1601
1602 for p.charsRight() > 0 {
1603 d = int(p.rightChar(0) - '0')
1604 if d < 0 || d > 9 {
1605 break
1606 }
1607 p.moveRight(1)
1608
1609 if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) {
1610 return 0, p.getErr(ErrCaptureGroupOutOfRange)
1611 }
1612
1613 i *= 10
1614 i += d
1615 }
1616
1617 return int(i), nil
1618}
1619
1620// Returns true for options allowed only at the top level
1621func isOnlyTopOption(option RegexOptions) bool {
1622 return option == RightToLeft || option == ECMAScript || option == RE2
1623}
1624
1625// Scans cimsx-cimsx option string, stops at the first unrecognized char.
1626func (p *parser) scanOptions() {
1627
1628 for off := false; p.charsRight() > 0; p.moveRight(1) {
1629 ch := p.rightChar(0)
1630
1631 if ch == '-' {
1632 off = true
1633 } else if ch == '+' {
1634 off = false
1635 } else {
1636 option := optionFromCode(ch)
1637 if option == 0 || isOnlyTopOption(option) {
1638 return
1639 }
1640
1641 if off {
1642 p.options &= ^option
1643 } else {
1644 p.options |= option
1645 }
1646 }
1647 }
1648}
1649
1650// Scans \ code for escape codes that map to single unicode chars.
1651func (p *parser) scanCharEscape() (rune, error) {
1652
1653 ch := p.moveRightGetChar()
1654
1655 if ch >= '0' && ch <= '7' {
1656 p.moveLeft()
1657 return p.scanOctal(), nil
1658 }
1659
1660 switch ch {
1661 case 'x':
1662 // support for \x{HEX} syntax from Perl and PCRE
1663 if p.charsRight() > 0 && p.rightChar(0) == '{' {
1664 p.moveRight(1)
1665 return p.scanHexUntilBrace()
1666 }
1667 return p.scanHex(2)
1668 case 'u':
1669 return p.scanHex(4)
1670 case 'a':
1671 return '\u0007', nil
1672 case 'b':
1673 return '\b', nil
1674 case 'e':
1675 return '\u001B', nil
1676 case 'f':
1677 return '\f', nil
1678 case 'n':
1679 return '\n', nil
1680 case 'r':
1681 return '\r', nil
1682 case 't':
1683 return '\t', nil
1684 case 'v':
1685 return '\u000B', nil
1686 case 'c':
1687 return p.scanControl()
1688 default:
1689 if !p.useOptionE() && IsWordChar(ch) {
1690 return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
1691 }
1692 return ch, nil
1693 }
1694}
1695
1696// Grabs and converts an ascii control character
1697func (p *parser) scanControl() (rune, error) {
1698 if p.charsRight() <= 0 {
1699 return 0, p.getErr(ErrMissingControl)
1700 }
1701
1702 ch := p.moveRightGetChar()
1703
1704 // \ca interpreted as \cA
1705
1706 if ch >= 'a' && ch <= 'z' {
1707 ch = (ch - ('a' - 'A'))
1708 }
1709 ch = (ch - '@')
1710 if ch >= 0 && ch < ' ' {
1711 return ch, nil
1712 }
1713
1714 return 0, p.getErr(ErrUnrecognizedControl)
1715
1716}
1717
1718// Scan hex digits until we hit a closing brace.
1719// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors
1720func (p *parser) scanHexUntilBrace() (rune, error) {
1721 // PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit
1722 // so we can enforce that
1723 i := 0
1724 hasContent := false
1725
1726 for p.charsRight() > 0 {
1727 ch := p.moveRightGetChar()
1728 if ch == '}' {
1729 // hit our close brace, we're done here
1730 // prevent \x{}
1731 if !hasContent {
1732 return 0, p.getErr(ErrTooFewHex)
1733 }
1734 return rune(i), nil
1735 }
1736 hasContent = true
1737 // no brace needs to be hex digit
1738 d := hexDigit(ch)
1739 if d < 0 {
1740 return 0, p.getErr(ErrMissingBrace)
1741 }
1742
1743 i *= 0x10
1744 i += d
1745
1746 if i > unicode.MaxRune {
1747 return 0, p.getErr(ErrInvalidHex)
1748 }
1749 }
1750
1751 // we only make it here if we run out of digits without finding the brace
1752 return 0, p.getErr(ErrMissingBrace)
1753}
1754
1755// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
1756func (p *parser) scanHex(c int) (rune, error) {
1757
1758 i := 0
1759
1760 if p.charsRight() >= c {
1761 for c > 0 {
1762 d := hexDigit(p.moveRightGetChar())
1763 if d < 0 {
1764 break
1765 }
1766 i *= 0x10
1767 i += d
1768 c--
1769 }
1770 }
1771
1772 if c > 0 {
1773 return 0, p.getErr(ErrTooFewHex)
1774 }
1775
1776 return rune(i), nil
1777}
1778
1779// Returns n <= 0xF for a hex digit.
1780func hexDigit(ch rune) int {
1781
1782 if d := uint(ch - '0'); d <= 9 {
1783 return int(d)
1784 }
1785
1786 if d := uint(ch - 'a'); d <= 5 {
1787 return int(d + 0xa)
1788 }
1789
1790 if d := uint(ch - 'A'); d <= 5 {
1791 return int(d + 0xa)
1792 }
1793
1794 return -1
1795}
1796
1797// Scans up to three octal digits (stops before exceeding 0377).
1798func (p *parser) scanOctal() rune {
1799 // Consume octal chars only up to 3 digits and value 0377
1800
1801 c := 3
1802
1803 if c > p.charsRight() {
1804 c = p.charsRight()
1805 }
1806
1807 //we know the first char is good because the caller had to check
1808 i := 0
1809 d := int(p.rightChar(0) - '0')
1810 for c > 0 && d <= 7 {
1811 i *= 8
1812 i += d
1813 if p.useOptionE() && i >= 0x20 {
1814 break
1815 }
1816 c--
1817
1818 p.moveRight(1)
1819 if !p.rightMost() {
1820 d = int(p.rightChar(0) - '0')
1821 }
1822 }
1823
1824 // Octal codes only go up to 255. Any larger and the behavior that Perl follows
1825 // is simply to truncate the high bits.
1826 i &= 0xFF
1827
1828 return rune(i)
1829}
1830
1831// Returns the current parsing position.
1832func (p *parser) textpos() int {
1833 return p.currentPos
1834}
1835
1836// Zaps to a specific parsing position.
1837func (p *parser) textto(pos int) {
1838 p.currentPos = pos
1839}
1840
1841// Returns the char at the right of the current parsing position and advances to the right.
1842func (p *parser) moveRightGetChar() rune {
1843 ch := p.pattern[p.currentPos]
1844 p.currentPos++
1845 return ch
1846}
1847
1848// Moves the current position to the right.
1849func (p *parser) moveRight(i int) {
1850 // default would be 1
1851 p.currentPos += i
1852}
1853
1854// Moves the current parsing position one to the left.
1855func (p *parser) moveLeft() {
1856 p.currentPos--
1857}
1858
1859// Returns the char left of the current parsing position.
1860func (p *parser) charAt(i int) rune {
1861 return p.pattern[i]
1862}
1863
1864// Returns the char i chars right of the current parsing position.
1865func (p *parser) rightChar(i int) rune {
1866 // default would be 0
1867 return p.pattern[p.currentPos+i]
1868}
1869
1870// Number of characters to the right of the current parsing position.
1871func (p *parser) charsRight() int {
1872 return len(p.pattern) - p.currentPos
1873}
1874
1875func (p *parser) rightMost() bool {
1876 return p.currentPos == len(p.pattern)
1877}
1878
1879// Looks up the slot number for a given name
1880func (p *parser) captureSlotFromName(capname string) int {
1881 return p.capnames[capname]
1882}
1883
1884// True if the capture slot was noted
1885func (p *parser) isCaptureSlot(i int) bool {
1886 if p.caps != nil {
1887 _, ok := p.caps[i]
1888 return ok
1889 }
1890
1891 return (i >= 0 && i < p.capsize)
1892}
1893
1894// Looks up the slot number for a given name
1895func (p *parser) isCaptureName(capname string) bool {
1896 if p.capnames == nil {
1897 return false
1898 }
1899
1900 _, ok := p.capnames[capname]
1901 return ok
1902}
1903
1904// option shortcuts
1905
1906// True if N option disabling '(' autocapture is on.
1907func (p *parser) useOptionN() bool {
1908 return (p.options & ExplicitCapture) != 0
1909}
1910
1911// True if I option enabling case-insensitivity is on.
1912func (p *parser) useOptionI() bool {
1913 return (p.options & IgnoreCase) != 0
1914}
1915
1916// True if M option altering meaning of $ and ^ is on.
1917func (p *parser) useOptionM() bool {
1918 return (p.options & Multiline) != 0
1919}
1920
1921// True if S option altering meaning of . is on.
1922func (p *parser) useOptionS() bool {
1923 return (p.options & Singleline) != 0
1924}
1925
1926// True if X option enabling whitespace/comment mode is on.
1927func (p *parser) useOptionX() bool {
1928 return (p.options & IgnorePatternWhitespace) != 0
1929}
1930
1931// True if E option enabling ECMAScript behavior on.
1932func (p *parser) useOptionE() bool {
1933 return (p.options & ECMAScript) != 0
1934}
1935
1936// true to use RE2 compatibility parsing behavior.
1937func (p *parser) useRE2() bool {
1938 return (p.options & RE2) != 0
1939}
1940
1941// True if options stack is empty.
1942func (p *parser) emptyOptionsStack() bool {
1943 return len(p.optionsStack) == 0
1944}
1945
1946// Finish the current quantifiable (when a quantifier is not found or is not possible)
1947func (p *parser) addConcatenate() {
1948 // The first (| inside a Testgroup group goes directly to the group
1949 p.concatenation.addChild(p.unit)
1950 p.unit = nil
1951}
1952
1953// Finish the current quantifiable (when a quantifier is found)
1954func (p *parser) addConcatenate3(lazy bool, min, max int) {
1955 p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
1956 p.unit = nil
1957}
1958
1959// Sets the current unit to a single char node
1960func (p *parser) addUnitOne(ch rune) {
1961 if p.useOptionI() {
1962 ch = unicode.ToLower(ch)
1963 }
1964
1965 p.unit = newRegexNodeCh(ntOne, p.options, ch)
1966}
1967
1968// Sets the current unit to a single inverse-char node
1969func (p *parser) addUnitNotone(ch rune) {
1970 if p.useOptionI() {
1971 ch = unicode.ToLower(ch)
1972 }
1973
1974 p.unit = newRegexNodeCh(ntNotone, p.options, ch)
1975}
1976
1977// Sets the current unit to a single set node
1978func (p *parser) addUnitSet(set *CharSet) {
1979 p.unit = newRegexNodeSet(ntSet, p.options, set)
1980}
1981
1982// Sets the current unit to a subtree
1983func (p *parser) addUnitNode(node *regexNode) {
1984 p.unit = node
1985}
1986
1987// Sets the current unit to an assertion of the specified type
1988func (p *parser) addUnitType(t nodeType) {
1989 p.unit = newRegexNode(t, p.options)
1990}
1991
1992// Finish the current group (in response to a ')' or end)
1993func (p *parser) addGroup() error {
1994 if p.group.t == ntTestgroup || p.group.t == ntTestref {
1995 p.group.addChild(p.concatenation.reverseLeft())
1996 if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 {
1997 return p.getErr(ErrTooManyAlternates)
1998 }
1999 } else {
2000 p.alternation.addChild(p.concatenation.reverseLeft())
2001 p.group.addChild(p.alternation)
2002 }
2003
2004 p.unit = p.group
2005 return nil
2006}
2007
2008// Pops the option stack, but keeps the current options unchanged.
2009func (p *parser) popKeepOptions() {
2010 lastIdx := len(p.optionsStack) - 1
2011 p.optionsStack = p.optionsStack[:lastIdx]
2012}
2013
2014// Recalls options from the stack.
2015func (p *parser) popOptions() {
2016 lastIdx := len(p.optionsStack) - 1
2017 // get the last item on the stack and then remove it by reslicing
2018 p.options = p.optionsStack[lastIdx]
2019 p.optionsStack = p.optionsStack[:lastIdx]
2020}
2021
2022// Saves options on a stack.
2023func (p *parser) pushOptions() {
2024 p.optionsStack = append(p.optionsStack, p.options)
2025}
2026
2027// Add a string to the last concatenate.
2028func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
2029 var node *regexNode
2030
2031 if cch == 0 {
2032 return
2033 }
2034
2035 if cch > 1 {
2036 str := p.pattern[pos : pos+cch]
2037
2038 if p.useOptionI() && !isReplacement {
2039 // We do the ToLower character by character for consistency. With surrogate chars, doing
2040 // a ToLower on the entire string could actually change the surrogate pair. This is more correct
2041 // linguistically, but since Regex doesn't support surrogates, it's more important to be
2042 // consistent.
2043 for i := 0; i < len(str); i++ {
2044 str[i] = unicode.ToLower(str[i])
2045 }
2046 }
2047
2048 node = newRegexNodeStr(ntMulti, p.options, str)
2049 } else {
2050 ch := p.charAt(pos)
2051
2052 if p.useOptionI() && !isReplacement {
2053 ch = unicode.ToLower(ch)
2054 }
2055
2056 node = newRegexNodeCh(ntOne, p.options, ch)
2057 }
2058
2059 p.concatenation.addChild(node)
2060}
2061
2062// Push the parser state (in response to an open paren)
2063func (p *parser) pushGroup() {
2064 p.group.next = p.stack
2065 p.alternation.next = p.group
2066 p.concatenation.next = p.alternation
2067 p.stack = p.concatenation
2068}
2069
2070// Remember the pushed state (in response to a ')')
2071func (p *parser) popGroup() error {
2072 p.concatenation = p.stack
2073 p.alternation = p.concatenation.next
2074 p.group = p.alternation.next
2075 p.stack = p.group.next
2076
2077 // The first () inside a Testgroup group goes directly to the group
2078 if p.group.t == ntTestgroup && len(p.group.children) == 0 {
2079 if p.unit == nil {
2080 return p.getErr(ErrConditionalExpression)
2081 }
2082
2083 p.group.addChild(p.unit)
2084 p.unit = nil
2085 }
2086 return nil
2087}
2088
2089// True if the group stack is empty.
2090func (p *parser) emptyStack() bool {
2091 return p.stack == nil
2092}
2093
2094// Start a new round for the parser state (in response to an open paren or string start)
2095func (p *parser) startGroup(openGroup *regexNode) {
2096 p.group = openGroup
2097 p.alternation = newRegexNode(ntAlternate, p.options)
2098 p.concatenation = newRegexNode(ntConcatenate, p.options)
2099}
2100
2101// Finish the current concatenation (in response to a |)
2102func (p *parser) addAlternate() {
2103 // The | parts inside a Testgroup group go directly to the group
2104
2105 if p.group.t == ntTestgroup || p.group.t == ntTestref {
2106 p.group.addChild(p.concatenation.reverseLeft())
2107 } else {
2108 p.alternation.addChild(p.concatenation.reverseLeft())
2109 }
2110
2111 p.concatenation = newRegexNode(ntConcatenate, p.options)
2112}
2113
2114// For categorizing ascii characters.
2115
2116const (
2117 Q byte = 5 // quantifier
2118 S = 4 // ordinary stopper
2119 Z = 3 // ScanBlank stopper
2120 X = 2 // whitespace
2121 E = 1 // should be escaped
2122)
2123
2124var _category = []byte{
2125 //01 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
2126 0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2127 // ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
2128 X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
2129 //@A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
2130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
2131 //'a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
2132 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
2133}
2134
2135func isSpace(ch rune) bool {
2136 return (ch <= ' ' && _category[ch] == X)
2137}
2138
2139// Returns true for those characters that terminate a string of ordinary chars.
2140func isSpecial(ch rune) bool {
2141 return (ch <= '|' && _category[ch] >= S)
2142}
2143
2144// Returns true for those characters that terminate a string of ordinary chars.
2145func isStopperX(ch rune) bool {
2146 return (ch <= '|' && _category[ch] >= X)
2147}
2148
2149// Returns true for those characters that begin a quantifier.
2150func isQuantifier(ch rune) bool {
2151 return (ch <= '{' && _category[ch] >= Q)
2152}
2153
2154func (p *parser) isTrueQuantifier() bool {
2155 nChars := p.charsRight()
2156 if nChars == 0 {
2157 return false
2158 }
2159
2160 startpos := p.textpos()
2161 ch := p.charAt(startpos)
2162 if ch != '{' {
2163 return ch <= '{' && _category[ch] >= Q
2164 }
2165
2166 //UGLY: this is ugly -- the original code was ugly too
2167 pos := startpos
2168 for {
2169 nChars--
2170 if nChars <= 0 {
2171 break
2172 }
2173 pos++
2174 ch = p.charAt(pos)
2175 if ch < '0' || ch > '9' {
2176 break
2177 }
2178 }
2179
2180 if nChars == 0 || pos-startpos == 1 {
2181 return false
2182 }
2183 if ch == '}' {
2184 return true
2185 }
2186 if ch != ',' {
2187 return false
2188 }
2189 for {
2190 nChars--
2191 if nChars <= 0 {
2192 break
2193 }
2194 pos++
2195 ch = p.charAt(pos)
2196 if ch < '0' || ch > '9' {
2197 break
2198 }
2199 }
2200
2201 return nChars > 0 && ch == '}'
2202}