writer.go

  1package syntax
  2
  3import (
  4	"bytes"
  5	"fmt"
  6	"math"
  7	"os"
  8)
  9
 10func Write(tree *RegexTree) (*Code, error) {
 11	w := writer{
 12		intStack:   make([]int, 0, 32),
 13		emitted:    make([]int, 2),
 14		stringhash: make(map[string]int),
 15		sethash:    make(map[string]int),
 16	}
 17
 18	code, err := w.codeFromTree(tree)
 19
 20	if tree.options&Debug > 0 && code != nil {
 21		os.Stdout.WriteString(code.Dump())
 22		os.Stdout.WriteString("\n")
 23	}
 24
 25	return code, err
 26}
 27
 28type writer struct {
 29	emitted []int
 30
 31	intStack    []int
 32	curpos      int
 33	stringhash  map[string]int
 34	stringtable [][]rune
 35	sethash     map[string]int
 36	settable    []*CharSet
 37	counting    bool
 38	count       int
 39	trackcount  int
 40	caps        map[int]int
 41}
 42
 43const (
 44	beforeChild nodeType = 64
 45	afterChild           = 128
 46	//MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix
 47	MaxPrefixSize = 50
 48)
 49
 50// The top level RegexCode generator. It does a depth-first walk
 51// through the tree and calls EmitFragment to emits code before
 52// and after each child of an interior node, and at each leaf.
 53//
 54// It runs two passes, first to count the size of the generated
 55// code, and second to generate the code.
 56//
 57// We should time it against the alternative, which is
 58// to just generate the code and grow the array as we go.
 59func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
 60	var (
 61		curNode  *regexNode
 62		curChild int
 63		capsize  int
 64	)
 65	// construct sparse capnum mapping if some numbers are unused
 66
 67	if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
 68		capsize = tree.captop
 69		w.caps = nil
 70	} else {
 71		capsize = len(tree.capnumlist)
 72		w.caps = tree.caps
 73		for i := 0; i < len(tree.capnumlist); i++ {
 74			w.caps[tree.capnumlist[i]] = i
 75		}
 76	}
 77
 78	w.counting = true
 79
 80	for {
 81		if !w.counting {
 82			w.emitted = make([]int, w.count)
 83		}
 84
 85		curNode = tree.root
 86		curChild = 0
 87
 88		w.emit1(Lazybranch, 0)
 89
 90		for {
 91			if len(curNode.children) == 0 {
 92				w.emitFragment(curNode.t, curNode, 0)
 93			} else if curChild < len(curNode.children) {
 94				w.emitFragment(curNode.t|beforeChild, curNode, curChild)
 95
 96				curNode = curNode.children[curChild]
 97
 98				w.pushInt(curChild)
 99				curChild = 0
100				continue
101			}
102
103			if w.emptyStack() {
104				break
105			}
106
107			curChild = w.popInt()
108			curNode = curNode.next
109
110			w.emitFragment(curNode.t|afterChild, curNode, curChild)
111			curChild++
112		}
113
114		w.patchJump(0, w.curPos())
115		w.emit(Stop)
116
117		if !w.counting {
118			break
119		}
120
121		w.counting = false
122	}
123
124	fcPrefix := getFirstCharsPrefix(tree)
125	prefix := getPrefix(tree)
126	rtl := (tree.options & RightToLeft) != 0
127
128	var bmPrefix *BmPrefix
129	//TODO: benchmark string prefixes
130	if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
131		if len(prefix.PrefixStr) > MaxPrefixSize {
132			// limit prefix changes to 10k
133			prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
134		}
135		bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
136	} else {
137		bmPrefix = nil
138	}
139
140	return &Code{
141		Codes:       w.emitted,
142		Strings:     w.stringtable,
143		Sets:        w.settable,
144		TrackCount:  w.trackcount,
145		Caps:        w.caps,
146		Capsize:     capsize,
147		FcPrefix:    fcPrefix,
148		BmPrefix:    bmPrefix,
149		Anchors:     getAnchors(tree),
150		RightToLeft: rtl,
151	}, nil
152}
153
154// The main RegexCode generator. It does a depth-first walk
155// through the tree and calls EmitFragment to emits code before
156// and after each child of an interior node, and at each leaf.
157func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
158	bits := InstOp(0)
159
160	if nodetype <= ntRef {
161		if (node.options & RightToLeft) != 0 {
162			bits |= Rtl
163		}
164		if (node.options & IgnoreCase) != 0 {
165			bits |= Ci
166		}
167	}
168	ntBits := nodeType(bits)
169
170	switch nodetype {
171	case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
172		break
173
174	case ntAlternate | beforeChild:
175		if curIndex < len(node.children)-1 {
176			w.pushInt(w.curPos())
177			w.emit1(Lazybranch, 0)
178		}
179
180	case ntAlternate | afterChild:
181		if curIndex < len(node.children)-1 {
182			lbPos := w.popInt()
183			w.pushInt(w.curPos())
184			w.emit1(Goto, 0)
185			w.patchJump(lbPos, w.curPos())
186		} else {
187			for i := 0; i < curIndex; i++ {
188				w.patchJump(w.popInt(), w.curPos())
189			}
190		}
191		break
192
193	case ntTestref | beforeChild:
194		if curIndex == 0 {
195			w.emit(Setjump)
196			w.pushInt(w.curPos())
197			w.emit1(Lazybranch, 0)
198			w.emit1(Testref, w.mapCapnum(node.m))
199			w.emit(Forejump)
200		}
201
202	case ntTestref | afterChild:
203		if curIndex == 0 {
204			branchpos := w.popInt()
205			w.pushInt(w.curPos())
206			w.emit1(Goto, 0)
207			w.patchJump(branchpos, w.curPos())
208			w.emit(Forejump)
209			if len(node.children) <= 1 {
210				w.patchJump(w.popInt(), w.curPos())
211			}
212		} else if curIndex == 1 {
213			w.patchJump(w.popInt(), w.curPos())
214		}
215
216	case ntTestgroup | beforeChild:
217		if curIndex == 0 {
218			w.emit(Setjump)
219			w.emit(Setmark)
220			w.pushInt(w.curPos())
221			w.emit1(Lazybranch, 0)
222		}
223
224	case ntTestgroup | afterChild:
225		if curIndex == 0 {
226			w.emit(Getmark)
227			w.emit(Forejump)
228		} else if curIndex == 1 {
229			Branchpos := w.popInt()
230			w.pushInt(w.curPos())
231			w.emit1(Goto, 0)
232			w.patchJump(Branchpos, w.curPos())
233			w.emit(Getmark)
234			w.emit(Forejump)
235			if len(node.children) <= 2 {
236				w.patchJump(w.popInt(), w.curPos())
237			}
238		} else if curIndex == 2 {
239			w.patchJump(w.popInt(), w.curPos())
240		}
241
242	case ntLoop | beforeChild, ntLazyloop | beforeChild:
243
244		if node.n < math.MaxInt32 || node.m > 1 {
245			if node.m == 0 {
246				w.emit1(Nullcount, 0)
247			} else {
248				w.emit1(Setcount, 1-node.m)
249			}
250		} else if node.m == 0 {
251			w.emit(Nullmark)
252		} else {
253			w.emit(Setmark)
254		}
255
256		if node.m == 0 {
257			w.pushInt(w.curPos())
258			w.emit1(Goto, 0)
259		}
260		w.pushInt(w.curPos())
261
262	case ntLoop | afterChild, ntLazyloop | afterChild:
263
264		startJumpPos := w.curPos()
265		lazy := (nodetype - (ntLoop | afterChild))
266
267		if node.n < math.MaxInt32 || node.m > 1 {
268			if node.n == math.MaxInt32 {
269				w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
270			} else {
271				w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
272			}
273		} else {
274			w.emit1(InstOp(Branchmark+lazy), w.popInt())
275		}
276
277		if node.m == 0 {
278			w.patchJump(w.popInt(), startJumpPos)
279		}
280
281	case ntGroup | beforeChild, ntGroup | afterChild:
282
283	case ntCapture | beforeChild:
284		w.emit(Setmark)
285
286	case ntCapture | afterChild:
287		w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
288
289	case ntRequire | beforeChild:
290		// NOTE: the following line causes lookahead/lookbehind to be
291		// NON-BACKTRACKING. It can be commented out with (*)
292		w.emit(Setjump)
293
294		w.emit(Setmark)
295
296	case ntRequire | afterChild:
297		w.emit(Getmark)
298
299		// NOTE: the following line causes lookahead/lookbehind to be
300		// NON-BACKTRACKING. It can be commented out with (*)
301		w.emit(Forejump)
302
303	case ntPrevent | beforeChild:
304		w.emit(Setjump)
305		w.pushInt(w.curPos())
306		w.emit1(Lazybranch, 0)
307
308	case ntPrevent | afterChild:
309		w.emit(Backjump)
310		w.patchJump(w.popInt(), w.curPos())
311		w.emit(Forejump)
312
313	case ntGreedy | beforeChild:
314		w.emit(Setjump)
315
316	case ntGreedy | afterChild:
317		w.emit(Forejump)
318
319	case ntOne, ntNotone:
320		w.emit1(InstOp(node.t|ntBits), int(node.ch))
321
322	case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
323		if node.m > 0 {
324			if node.t == ntOneloop || node.t == ntOnelazy {
325				w.emit2(Onerep|bits, int(node.ch), node.m)
326			} else {
327				w.emit2(Notonerep|bits, int(node.ch), node.m)
328			}
329		}
330		if node.n > node.m {
331			if node.n == math.MaxInt32 {
332				w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
333			} else {
334				w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
335			}
336		}
337
338	case ntSetloop, ntSetlazy:
339		if node.m > 0 {
340			w.emit2(Setrep|bits, w.setCode(node.set), node.m)
341		}
342		if node.n > node.m {
343			if node.n == math.MaxInt32 {
344				w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
345			} else {
346				w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
347			}
348		}
349
350	case ntMulti:
351		w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
352
353	case ntSet:
354		w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
355
356	case ntRef:
357		w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
358
359	case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
360		w.emit(InstOp(node.t))
361
362	default:
363		return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
364	}
365
366	return nil
367}
368
369// To avoid recursion, we use a simple integer stack.
370// This is the push.
371func (w *writer) pushInt(i int) {
372	w.intStack = append(w.intStack, i)
373}
374
375// Returns true if the stack is empty.
376func (w *writer) emptyStack() bool {
377	return len(w.intStack) == 0
378}
379
380// This is the pop.
381func (w *writer) popInt() int {
382	//get our item
383	idx := len(w.intStack) - 1
384	i := w.intStack[idx]
385	//trim our slice
386	w.intStack = w.intStack[:idx]
387	return i
388}
389
390// Returns the current position in the emitted code.
391func (w *writer) curPos() int {
392	return w.curpos
393}
394
395// Fixes up a jump instruction at the specified offset
396// so that it jumps to the specified jumpDest.
397func (w *writer) patchJump(offset, jumpDest int) {
398	w.emitted[offset+1] = jumpDest
399}
400
401// Returns an index in the set table for a charset
402// uses a map to eliminate duplicates.
403func (w *writer) setCode(set *CharSet) int {
404	if w.counting {
405		return 0
406	}
407
408	buf := &bytes.Buffer{}
409
410	set.mapHashFill(buf)
411	hash := buf.String()
412	i, ok := w.sethash[hash]
413	if !ok {
414		i = len(w.sethash)
415		w.sethash[hash] = i
416		w.settable = append(w.settable, set)
417	}
418	return i
419}
420
421// Returns an index in the string table for a string.
422// uses a map to eliminate duplicates.
423func (w *writer) stringCode(str []rune) int {
424	if w.counting {
425		return 0
426	}
427
428	hash := string(str)
429	i, ok := w.stringhash[hash]
430	if !ok {
431		i = len(w.stringhash)
432		w.stringhash[hash] = i
433		w.stringtable = append(w.stringtable, str)
434	}
435
436	return i
437}
438
439// When generating code on a regex that uses a sparse set
440// of capture slots, we hash them to a dense set of indices
441// for an array of capture slots. Instead of doing the hash
442// at match time, it's done at compile time, here.
443func (w *writer) mapCapnum(capnum int) int {
444	if capnum == -1 {
445		return -1
446	}
447
448	if w.caps != nil {
449		return w.caps[capnum]
450	}
451
452	return capnum
453}
454
455// Emits a zero-argument operation. Note that the emit
456// functions all run in two modes: they can emit code, or
457// they can just count the size of the code.
458func (w *writer) emit(op InstOp) {
459	if w.counting {
460		w.count++
461		if opcodeBacktracks(op) {
462			w.trackcount++
463		}
464		return
465	}
466	w.emitted[w.curpos] = int(op)
467	w.curpos++
468}
469
470// Emits a one-argument operation.
471func (w *writer) emit1(op InstOp, opd1 int) {
472	if w.counting {
473		w.count += 2
474		if opcodeBacktracks(op) {
475			w.trackcount++
476		}
477		return
478	}
479	w.emitted[w.curpos] = int(op)
480	w.curpos++
481	w.emitted[w.curpos] = opd1
482	w.curpos++
483}
484
485// Emits a two-argument operation.
486func (w *writer) emit2(op InstOp, opd1, opd2 int) {
487	if w.counting {
488		w.count += 3
489		if opcodeBacktracks(op) {
490			w.trackcount++
491		}
492		return
493	}
494	w.emitted[w.curpos] = int(op)
495	w.curpos++
496	w.emitted[w.curpos] = opd1
497	w.curpos++
498	w.emitted[w.curpos] = opd2
499	w.curpos++
500}