tree.go

  1package syntax
  2
  3import (
  4	"bytes"
  5	"fmt"
  6	"math"
  7	"strconv"
  8)
  9
 10type RegexTree struct {
 11	root       *regexNode
 12	caps       map[int]int
 13	capnumlist []int
 14	captop     int
 15	Capnames   map[string]int
 16	Caplist    []string
 17	options    RegexOptions
 18}
 19
 20// It is built into a parsed tree for a regular expression.
 21
 22// Implementation notes:
 23//
 24// Since the node tree is a temporary data structure only used
 25// during compilation of the regexp to integer codes, it's
 26// designed for clarity and convenience rather than
 27// space efficiency.
 28//
 29// RegexNodes are built into a tree, linked by the n.children list.
 30// Each node also has a n.parent and n.ichild member indicating
 31// its parent and which child # it is in its parent's list.
 32//
 33// RegexNodes come in as many types as there are constructs in
 34// a regular expression, for example, "concatenate", "alternate",
 35// "one", "rept", "group". There are also node types for basic
 36// peephole optimizations, e.g., "onerep", "notsetrep", etc.
 37//
 38// Because perl 5 allows "lookback" groups that scan backwards,
 39// each node also gets a "direction". Normally the value of
 40// boolean n.backward = false.
 41//
 42// During parsing, top-level nodes are also stacked onto a parse
 43// stack (a stack of trees). For this purpose we have a n.next
 44// pointer. [Note that to save a few bytes, we could overload the
 45// n.parent pointer instead.]
 46//
 47// On the parse stack, each tree has a "role" - basically, the
 48// nonterminal in the grammar that the parser has currently
 49// assigned to the tree. That code is stored in n.role.
 50//
 51// Finally, some of the different kinds of nodes have data.
 52// Two integers (for the looping constructs) are stored in
 53// n.operands, an an object (either a string or a set)
 54// is stored in n.data
 55type regexNode struct {
 56	t        nodeType
 57	children []*regexNode
 58	str      []rune
 59	set      *CharSet
 60	ch       rune
 61	m        int
 62	n        int
 63	options  RegexOptions
 64	next     *regexNode
 65}
 66
 67type nodeType int32
 68
 69const (
 70	// The following are leaves, and correspond to primitive operations
 71
 72	ntOnerep      nodeType = 0  // lef,back char,min,max    a {n}
 73	ntNotonerep            = 1  // lef,back char,min,max    .{n}
 74	ntSetrep               = 2  // lef,back set,min,max     [\d]{n}
 75	ntOneloop              = 3  // lef,back char,min,max    a {,n}
 76	ntNotoneloop           = 4  // lef,back char,min,max    .{,n}
 77	ntSetloop              = 5  // lef,back set,min,max     [\d]{,n}
 78	ntOnelazy              = 6  // lef,back char,min,max    a {,n}?
 79	ntNotonelazy           = 7  // lef,back char,min,max    .{,n}?
 80	ntSetlazy              = 8  // lef,back set,min,max     [\d]{,n}?
 81	ntOne                  = 9  // lef      char            a
 82	ntNotone               = 10 // lef      char            [^a]
 83	ntSet                  = 11 // lef      set             [a-z\s]  \w \s \d
 84	ntMulti                = 12 // lef      string          abcd
 85	ntRef                  = 13 // lef      group           \#
 86	ntBol                  = 14 //                          ^
 87	ntEol                  = 15 //                          $
 88	ntBoundary             = 16 //                          \b
 89	ntNonboundary          = 17 //                          \B
 90	ntBeginning            = 18 //                          \A
 91	ntStart                = 19 //                          \G
 92	ntEndZ                 = 20 //                          \Z
 93	ntEnd                  = 21 //                          \Z
 94
 95	// Interior nodes do not correspond to primitive operations, but
 96	// control structures compositing other operations
 97
 98	// Concat and alternate take n children, and can run forward or backwards
 99
100	ntNothing     = 22 //          []
101	ntEmpty       = 23 //          ()
102	ntAlternate   = 24 //          a|b
103	ntConcatenate = 25 //          ab
104	ntLoop        = 26 // m,x      * + ? {,}
105	ntLazyloop    = 27 // m,x      *? +? ?? {,}?
106	ntCapture     = 28 // n        ()
107	ntGroup       = 29 //          (?:)
108	ntRequire     = 30 //          (?=) (?<=)
109	ntPrevent     = 31 //          (?!) (?<!)
110	ntGreedy      = 32 //          (?>) (?<)
111	ntTestref     = 33 //          (?(n) | )
112	ntTestgroup   = 34 //          (?(...) | )
113
114	ntECMABoundary    = 41 //                          \b
115	ntNonECMABoundary = 42 //                          \B
116)
117
118func newRegexNode(t nodeType, opt RegexOptions) *regexNode {
119	return &regexNode{
120		t:       t,
121		options: opt,
122	}
123}
124
125func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode {
126	return &regexNode{
127		t:       t,
128		options: opt,
129		ch:      ch,
130	}
131}
132
133func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode {
134	return &regexNode{
135		t:       t,
136		options: opt,
137		str:     str,
138	}
139}
140
141func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode {
142	return &regexNode{
143		t:       t,
144		options: opt,
145		set:     set,
146	}
147}
148
149func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode {
150	return &regexNode{
151		t:       t,
152		options: opt,
153		m:       m,
154	}
155}
156func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode {
157	return &regexNode{
158		t:       t,
159		options: opt,
160		m:       m,
161		n:       n,
162	}
163}
164
165func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) {
166	for i := 0; i < len(n.str); i++ {
167		buf.WriteRune(n.str[i])
168	}
169}
170
171func (n *regexNode) addChild(child *regexNode) {
172	reduced := child.reduce()
173	n.children = append(n.children, reduced)
174	reduced.next = n
175}
176
177func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) {
178	newChildren := make([]*regexNode, 0, len(n.children)+len(nodes))
179	n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...)
180}
181
182// removes children including the start but not the end index
183func (n *regexNode) removeChildren(startIndex, endIndex int) {
184	n.children = append(n.children[:startIndex], n.children[endIndex:]...)
185}
186
187// Pass type as OneLazy or OneLoop
188func (n *regexNode) makeRep(t nodeType, min, max int) {
189	n.t += (t - ntOne)
190	n.m = min
191	n.n = max
192}
193
194func (n *regexNode) reduce() *regexNode {
195	switch n.t {
196	case ntAlternate:
197		return n.reduceAlternation()
198
199	case ntConcatenate:
200		return n.reduceConcatenation()
201
202	case ntLoop, ntLazyloop:
203		return n.reduceRep()
204
205	case ntGroup:
206		return n.reduceGroup()
207
208	case ntSet, ntSetloop:
209		return n.reduceSet()
210
211	default:
212		return n
213	}
214}
215
216// Basic optimization. Single-letter alternations can be replaced
217// by faster set specifications, and nested alternations with no
218// intervening operators can be flattened:
219//
220// a|b|c|def|g|h -> [a-c]|def|[gh]
221// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
222func (n *regexNode) reduceAlternation() *regexNode {
223	if len(n.children) == 0 {
224		return newRegexNode(ntNothing, n.options)
225	}
226
227	wasLastSet := false
228	lastNodeCannotMerge := false
229	var optionsLast RegexOptions
230	var i, j int
231
232	for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
233		at := n.children[i]
234
235		if j < i {
236			n.children[j] = at
237		}
238
239		for {
240			if at.t == ntAlternate {
241				for k := 0; k < len(at.children); k++ {
242					at.children[k].next = n
243				}
244				n.insertChildren(i+1, at.children)
245
246				j--
247			} else if at.t == ntSet || at.t == ntOne {
248				// Cannot merge sets if L or I options differ, or if either are negated.
249				optionsAt := at.options & (RightToLeft | IgnoreCase)
250
251				if at.t == ntSet {
252					if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() {
253						wasLastSet = true
254						lastNodeCannotMerge = !at.set.IsMergeable()
255						optionsLast = optionsAt
256						break
257					}
258				} else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge {
259					wasLastSet = true
260					lastNodeCannotMerge = false
261					optionsLast = optionsAt
262					break
263				}
264
265				// The last node was a Set or a One, we're a Set or One and our options are the same.
266				// Merge the two nodes.
267				j--
268				prev := n.children[j]
269
270				var prevCharClass *CharSet
271				if prev.t == ntOne {
272					prevCharClass = &CharSet{}
273					prevCharClass.addChar(prev.ch)
274				} else {
275					prevCharClass = prev.set
276				}
277
278				if at.t == ntOne {
279					prevCharClass.addChar(at.ch)
280				} else {
281					prevCharClass.addSet(*at.set)
282				}
283
284				prev.t = ntSet
285				prev.set = prevCharClass
286			} else if at.t == ntNothing {
287				j--
288			} else {
289				wasLastSet = false
290				lastNodeCannotMerge = false
291			}
292			break
293		}
294	}
295
296	if j < i {
297		n.removeChildren(j, i)
298	}
299
300	return n.stripEnation(ntNothing)
301}
302
303// Basic optimization. Adjacent strings can be concatenated.
304//
305// (?:abc)(?:def) -> abcdef
306func (n *regexNode) reduceConcatenation() *regexNode {
307	// Eliminate empties and concat adjacent strings/chars
308
309	var optionsLast RegexOptions
310	var optionsAt RegexOptions
311	var i, j int
312
313	if len(n.children) == 0 {
314		return newRegexNode(ntEmpty, n.options)
315	}
316
317	wasLastString := false
318
319	for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
320		var at, prev *regexNode
321
322		at = n.children[i]
323
324		if j < i {
325			n.children[j] = at
326		}
327
328		if at.t == ntConcatenate &&
329			((at.options & RightToLeft) == (n.options & RightToLeft)) {
330			for k := 0; k < len(at.children); k++ {
331				at.children[k].next = n
332			}
333
334			//insert at.children at i+1 index in n.children
335			n.insertChildren(i+1, at.children)
336
337			j--
338		} else if at.t == ntMulti || at.t == ntOne {
339			// Cannot merge strings if L or I options differ
340			optionsAt = at.options & (RightToLeft | IgnoreCase)
341
342			if !wasLastString || optionsLast != optionsAt {
343				wasLastString = true
344				optionsLast = optionsAt
345				continue
346			}
347
348			j--
349			prev = n.children[j]
350
351			if prev.t == ntOne {
352				prev.t = ntMulti
353				prev.str = []rune{prev.ch}
354			}
355
356			if (optionsAt & RightToLeft) == 0 {
357				if at.t == ntOne {
358					prev.str = append(prev.str, at.ch)
359				} else {
360					prev.str = append(prev.str, at.str...)
361				}
362			} else {
363				if at.t == ntOne {
364					// insert at the front by expanding our slice, copying the data over, and then setting the value
365					prev.str = append(prev.str, 0)
366					copy(prev.str[1:], prev.str)
367					prev.str[0] = at.ch
368				} else {
369					//insert at the front...this one we'll make a new slice and copy both into it
370					merge := make([]rune, len(prev.str)+len(at.str))
371					copy(merge, at.str)
372					copy(merge[len(at.str):], prev.str)
373					prev.str = merge
374				}
375			}
376		} else if at.t == ntEmpty {
377			j--
378		} else {
379			wasLastString = false
380		}
381	}
382
383	if j < i {
384		// remove indices j through i from the children
385		n.removeChildren(j, i)
386	}
387
388	return n.stripEnation(ntEmpty)
389}
390
391// Nested repeaters just get multiplied with each other if they're not
392// too lumpy
393func (n *regexNode) reduceRep() *regexNode {
394
395	u := n
396	t := n.t
397	min := n.m
398	max := n.n
399
400	for {
401		if len(u.children) == 0 {
402			break
403		}
404
405		child := u.children[0]
406
407		// multiply reps of the same type only
408		if child.t != t {
409			childType := child.t
410
411			if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop ||
412				childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) {
413				break
414			}
415		}
416
417		// child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
418		// [but things like (a {2,})+ are not too lumpy...]
419		if u.m == 0 && child.m > 1 || child.n < child.m*2 {
420			break
421		}
422
423		u = child
424		if u.m > 0 {
425			if (math.MaxInt32-1)/u.m < min {
426				u.m = math.MaxInt32
427			} else {
428				u.m = u.m * min
429			}
430		}
431		if u.n > 0 {
432			if (math.MaxInt32-1)/u.n < max {
433				u.n = math.MaxInt32
434			} else {
435				u.n = u.n * max
436			}
437		}
438	}
439
440	if math.MaxInt32 == min {
441		return newRegexNode(ntNothing, n.options)
442	}
443	return u
444
445}
446
447// Simple optimization. If a concatenation or alternation has only
448// one child strip out the intermediate node. If it has zero children,
449// turn it into an empty.
450func (n *regexNode) stripEnation(emptyType nodeType) *regexNode {
451	switch len(n.children) {
452	case 0:
453		return newRegexNode(emptyType, n.options)
454	case 1:
455		return n.children[0]
456	default:
457		return n
458	}
459}
460
461func (n *regexNode) reduceGroup() *regexNode {
462	u := n
463
464	for u.t == ntGroup {
465		u = u.children[0]
466	}
467
468	return u
469}
470
471// Simple optimization. If a set is a singleton, an inverse singleton,
472// or empty, it's transformed accordingly.
473func (n *regexNode) reduceSet() *regexNode {
474	// Extract empty-set, one and not-one case as special
475
476	if n.set == nil {
477		n.t = ntNothing
478	} else if n.set.IsSingleton() {
479		n.ch = n.set.SingletonChar()
480		n.set = nil
481		n.t += (ntOne - ntSet)
482	} else if n.set.IsSingletonInverse() {
483		n.ch = n.set.SingletonChar()
484		n.set = nil
485		n.t += (ntNotone - ntSet)
486	}
487
488	return n
489}
490
491func (n *regexNode) reverseLeft() *regexNode {
492	if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 {
493		//reverse children order
494		for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 {
495			n.children[left], n.children[right] = n.children[right], n.children[left]
496		}
497	}
498
499	return n
500}
501
502func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode {
503	if min == 0 && max == 0 {
504		return newRegexNode(ntEmpty, n.options)
505	}
506
507	if min == 1 && max == 1 {
508		return n
509	}
510
511	switch n.t {
512	case ntOne, ntNotone, ntSet:
513		if lazy {
514			n.makeRep(Onelazy, min, max)
515		} else {
516			n.makeRep(Oneloop, min, max)
517		}
518		return n
519
520	default:
521		var t nodeType
522		if lazy {
523			t = ntLazyloop
524		} else {
525			t = ntLoop
526		}
527		result := newRegexNodeMN(t, n.options, min, max)
528		result.addChild(n)
529		return result
530	}
531}
532
533// debug functions
534
535var typeStr = []string{
536	"Onerep", "Notonerep", "Setrep",
537	"Oneloop", "Notoneloop", "Setloop",
538	"Onelazy", "Notonelazy", "Setlazy",
539	"One", "Notone", "Set",
540	"Multi", "Ref",
541	"Bol", "Eol", "Boundary", "Nonboundary",
542	"Beginning", "Start", "EndZ", "End",
543	"Nothing", "Empty",
544	"Alternate", "Concatenate",
545	"Loop", "Lazyloop",
546	"Capture", "Group", "Require", "Prevent", "Greedy",
547	"Testref", "Testgroup",
548	"Unknown", "Unknown", "Unknown",
549	"Unknown", "Unknown", "Unknown",
550	"ECMABoundary", "NonECMABoundary",
551}
552
553func (n *regexNode) description() string {
554	buf := &bytes.Buffer{}
555
556	buf.WriteString(typeStr[n.t])
557
558	if (n.options & ExplicitCapture) != 0 {
559		buf.WriteString("-C")
560	}
561	if (n.options & IgnoreCase) != 0 {
562		buf.WriteString("-I")
563	}
564	if (n.options & RightToLeft) != 0 {
565		buf.WriteString("-L")
566	}
567	if (n.options & Multiline) != 0 {
568		buf.WriteString("-M")
569	}
570	if (n.options & Singleline) != 0 {
571		buf.WriteString("-S")
572	}
573	if (n.options & IgnorePatternWhitespace) != 0 {
574		buf.WriteString("-X")
575	}
576	if (n.options & ECMAScript) != 0 {
577		buf.WriteString("-E")
578	}
579
580	switch n.t {
581	case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone:
582		buf.WriteString("(Ch = " + CharDescription(n.ch) + ")")
583		break
584	case ntCapture:
585		buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")")
586		break
587	case ntRef, ntTestref:
588		buf.WriteString("(index = " + strconv.Itoa(n.m) + ")")
589		break
590	case ntMulti:
591		fmt.Fprintf(buf, "(String = %s)", string(n.str))
592		break
593	case ntSet, ntSetloop, ntSetlazy:
594		buf.WriteString("(Set = " + n.set.String() + ")")
595		break
596	}
597
598	switch n.t {
599	case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop:
600		buf.WriteString("(Min = ")
601		buf.WriteString(strconv.Itoa(n.m))
602		buf.WriteString(", Max = ")
603		if n.n == math.MaxInt32 {
604			buf.WriteString("inf")
605		} else {
606			buf.WriteString(strconv.Itoa(n.n))
607		}
608		buf.WriteString(")")
609
610		break
611	}
612
613	return buf.String()
614}
615
616var padSpace = []byte("                                ")
617
618func (t *RegexTree) Dump() string {
619	return t.root.dump()
620}
621
622func (n *regexNode) dump() string {
623	var stack []int
624	CurNode := n
625	CurChild := 0
626
627	buf := bytes.NewBufferString(CurNode.description())
628	buf.WriteRune('\n')
629
630	for {
631		if CurNode.children != nil && CurChild < len(CurNode.children) {
632			stack = append(stack, CurChild+1)
633			CurNode = CurNode.children[CurChild]
634			CurChild = 0
635
636			Depth := len(stack)
637			if Depth > 32 {
638				Depth = 32
639			}
640			buf.Write(padSpace[:Depth])
641			buf.WriteString(CurNode.description())
642			buf.WriteRune('\n')
643		} else {
644			if len(stack) == 0 {
645				break
646			}
647
648			CurChild = stack[len(stack)-1]
649			stack = stack[:len(stack)-1]
650			CurNode = CurNode.next
651		}
652	}
653	return buf.String()
654}