package chroma

import (
	"fmt"
	"os"
	"regexp"
	"strings"
	"sync"
	"unicode/utf8"

	"github.com/dlclark/regexp2"
)

// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}

// An Emitter takes group matches and returns tokens.
type Emitter interface {
	// Emit tokens for the given regex groups.
	Emit(groups []string, lexer Lexer) Iterator
}

// EmitterFunc is a function that is an Emitter.
type EmitterFunc func(groups []string, lexer Lexer) Iterator

// Emit tokens for groups.
func (e EmitterFunc) Emit(groups []string, lexer Lexer) Iterator { return e(groups, lexer) }
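
// Illustrative sketch (not part of this file): a custom EmitterFunc that
// emits the whole match as a single token, assuming the Literator helper and
// Keyword token type defined elsewhere in this package:
//
//	// Sketch only: emit groups[0] (the full match) as one Keyword token.
//	var keywordEmitter = EmitterFunc(func(groups []string, _ Lexer) Iterator {
//		return Literator(Token{Type: Keyword, Value: groups[0]})
//	})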

// ByGroups emits a token for each matching group in the rule's regex.
func ByGroups(emitters ...Emitter) Emitter {
	return EmitterFunc(func(groups []string, lexer Lexer) Iterator {
		iterators := make([]Iterator, 0, len(groups)-1)
		if len(emitters) != len(groups)-1 {
			iterators = append(iterators, Error.Emit(groups, lexer))
			// panic(errors.Errorf("number of groups %q does not match number of emitters %v", groups, emitters))
		} else {
			for i, group := range groups[1:] {
				iterators = append(iterators, emitters[i].Emit([]string{group}, lexer))
			}
		}
		return Concaterator(iterators...)
	})
}
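
// Illustrative sketch (an assumed rule, not from this file): a regex with two
// capture groups and one emitter per group, using the Keyword and Whitespace
// token types defined elsewhere in this package:
//
//	// Matches a keyword followed by trailing whitespace.
//	{`(if|else|for)(\s+)`, ByGroups(Keyword, Whitespace), nil}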

// UsingByGroup emits tokens for the matched groups in the regex using a
// "sublexer". Used when lexing code blocks where the name of a sublexer is
// contained within the block, for example in a Markdown text block or SQL
// language block.
//
// The sublexer will be retrieved using sublexerGetFunc (typically
// internal.Get), using the captured value from the matched sublexerNameGroup.
//
// If sublexerGetFunc returns a non-nil lexer for the captured sublexerNameGroup,
// then tokens for the matched codeGroup will be emitted using the retrieved
// lexer. Otherwise, if the sublexer is nil, then tokens will be emitted from
// the passed emitter.
//
// Example:
//
//	var Markdown = internal.Register(MustNewLexer(
//		&Config{
//			Name:      "markdown",
//			Aliases:   []string{"md", "mkd"},
//			Filenames: []string{"*.md", "*.mkd", "*.markdown"},
//			MimeTypes: []string{"text/x-markdown"},
//		},
//		Rules{
//			"root": {
//				{"^(```)(\\w+)(\\n)([\\w\\W]*?)(^```$)",
//					UsingByGroup(
//						internal.Get,
//						2, 4,
//						String, String, String, Text, String,
//					),
//					nil,
//				},
//			},
//		},
//	))
//
// See lexers/m/markdown.go for the complete example.
//
// Note: panics if the number of emitters does not equal the number of matched
// groups in the regex.
func UsingByGroup(sublexerGetFunc func(string) Lexer, sublexerNameGroup, codeGroup int, emitters ...Emitter) Emitter {
	return EmitterFunc(func(groups []string, lexer Lexer) Iterator {
		// bounds check
		if len(emitters) != len(groups)-1 {
			panic("UsingByGroup expects number of emitters to be the same as len(groups)-1")
		}

		// grab sublexer
		sublexer := sublexerGetFunc(groups[sublexerNameGroup])

		// build iterators
		iterators := make([]Iterator, len(groups)-1)
		for i, group := range groups[1:] {
			if i == codeGroup-1 && sublexer != nil {
				var err error
				iterators[i], err = sublexer.Tokenise(nil, groups[codeGroup])
				if err != nil {
					panic(err)
				}
			} else {
				iterators[i] = emitters[i].Emit([]string{group}, lexer)
			}
		}

		return Concaterator(iterators...)
	})
}

// Using returns an Emitter that uses a given Lexer for parsing and emitting.
func Using(lexer Lexer) Emitter {
	return EmitterFunc(func(groups []string, _ Lexer) Iterator {
		it, err := lexer.Tokenise(&TokeniseOptions{State: "root", Nested: true}, groups[0])
		if err != nil {
			panic(err)
		}
		return it
	})
}
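
// Illustrative sketch (an assumption, not from this file): delegating an
// embedded region of input to another lexer; jsLexer here is a hypothetical
// Lexer variable:
//
//	// Sketch only: hand the whole matched region to jsLexer.
//	{`<script>[\w\W]*?</script>`, Using(jsLexer), nil}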

// UsingSelf is like Using, but uses the current Lexer.
func UsingSelf(state string) Emitter {
	return EmitterFunc(func(groups []string, lexer Lexer) Iterator {
		it, err := lexer.Tokenise(&TokeniseOptions{State: state, Nested: true}, groups[0])
		if err != nil {
			panic(err)
		}
		return it
	})
}
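
// Illustrative sketch (a deliberately simplified assumption): re-lexing an
// interpolated expression with this lexer's own "root" state; note the
// delimiters are passed through to the sublex as well:
//
//	{`\$\{[^}]*\}`, UsingSelf("root"), nil}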

// Words creates a regex that matches any of the given literal words.
func Words(prefix, suffix string, words ...string) string {
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}
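
// For example, Words(`\b`, `\b`, "if", "else", "for") returns the pattern
// `\b(if|else|for)\b`, with each word quoted via regexp.QuoteMeta.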

// Tokenise text using lexer, returning tokens as a slice.
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
	var out []Token
	it, err := lexer.Tokenise(options, text)
	if err != nil {
		return nil, err
	}
	for t := it(); t != EOF; t = it() {
		out = append(out, t)
	}
	return out, nil
}
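
// Illustrative sketch (assumed caller code): collecting every token for a
// snippet with some previously constructed lexer:
//
//	tokens, err := Tokenise(lexer, nil, "x := 1\n")
//	if err != nil {
//		// handle the error
//	}
//	for _, t := range tokens {
//		fmt.Println(t.Type, t.Value)
//	}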

// Rules maps from state to a sequence of Rules.
type Rules map[string][]Rule

// Clone returns a clone of the Rules.
func (r Rules) Clone() Rules {
	out := map[string][]Rule{}
	for key, rules := range r {
		out[key] = make([]Rule, len(rules))
		copy(out[key], rules)
	}
	return out
}

// MustNewLexer creates a new Lexer or panics.
func MustNewLexer(config *Config, rules Rules) *RegexLexer {
	lexer, err := NewLexer(config, rules)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state name. Values are
// sequences of rules that match input, optionally modify lexer state, and output tokens.
func NewLexer(config *Config, rules Rules) (*RegexLexer, error) {
	if config == nil {
		config = &Config{}
	}
	if _, ok := rules["root"]; !ok {
		return nil, fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]*CompiledRule{}
	for state, rules := range rules {
		compiledRules[state] = nil
		for _, rule := range rules {
			flags := ""
			if !config.NotMultiline {
				flags += "m"
			}
			if config.CaseInsensitive {
				flags += "i"
			}
			if config.DotAll {
				flags += "s"
			}
			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
		}
	}
	return &RegexLexer{
		config: config,
		rules:  compiledRules,
	}, nil
}
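
// Illustrative sketch (an assumption, echoing the Markdown example above): a
// minimal lexer with a single "root" state, using token types defined
// elsewhere in this package:
//
//	lexer, err := NewLexer(
//		&Config{Name: "example"},
//		Rules{
//			"root": {
//				{`\d+`, Number, nil},     // runs of digits
//				{`\s+`, Whitespace, nil}, // whitespace
//				{`\w+`, Name, nil},       // everything else word-like
//			},
//		},
//	)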

// Trace enables debug tracing.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}

// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules maps each state name to the sequence of compiled rules for that state.
type CompiledRules map[string][]*CompiledRule

// LexerState contains the state for a single lex.
type LexerState struct {
	Lexer *RegexLexer
	Text  []rune
	Pos   int
	Rules CompiledRules
	Stack []string
	State string
	Rule  int
	// Group matches.
	Groups []string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator
	options        *TokeniseOptions
}

// Set mutator context.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get mutator context.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}
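
// Illustrative sketch (an assumption): a MutatorFunc that stashes the full
// text of the current match in the mutator context so a later rule can
// inspect it; the "lastMatch" key is hypothetical:
//
//	MutatorFunc(func(state *LexerState) error {
//		state.Set("lastMatch", state.Groups[0])
//		return nil
//	})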

// Iterator returns the next Token from the lexer.
func (l *LexerState) Iterator() Token { // nolint: gocognit
	for l.Pos < len(l.Text) && len(l.Stack) > 0 {
		// Exhaust the iterator stack, if any.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}

		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups := matchRules(l.Text, l.Pos, selectedRule)
		// No match.
		if groups == nil {
			// From Pygments :\
			//
			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
			// emptied and the lexer continues scanning in the 'root' state. This can help produce
			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
			// closed.
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.Pos += utf8.RuneCountInString(groups[0])
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l.Lexer))
		}
	}
	// Exhaust the iterator stack, if any.
	// Duplicate code, but eh.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}

	// If we get to here and we still have text, return it as an error.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}

// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	config   *Config
	analyser func(text string) float32
	trace    bool

	mu       sync.Mutex
	compiled bool
	rules    map[string][]*CompiledRule
}

// SetAnalyser sets the analyser function used to perform content inspection.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) *RegexLexer {
	r.analyser = analyser
	return r
}

// AnalyseText delegates to the analyser set via SetAnalyser, returning 0.0 if none is set.
func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
	if r.analyser != nil {
		return r.analyser(text)
	}
	return 0.0
}

// Config returns the lexer's Config.
func (r *RegexLexer) Config() *Config { // nolint
	return r.config
}

// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, 0)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
			}
		}
	}
restart:
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}

// Tokenise tokenises text, compiling this lexer's regexes on first use.
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
	if err := r.maybeCompile(); err != nil {
		return nil, err
	}
	if options == nil {
		options = defaultOptions
	}
	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
		text += "\n"
	}
	state := &LexerState{
		options:        options,
		Lexer:          r,
		Text:           []rune(text),
		Stack:          []string{options.State},
		Rules:          r.rules,
		MutatorContext: map[interface{}]interface{}{},
	}
	return state.Iterator, nil
}
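
// Illustrative sketch (assumed caller code): draining the iterator returned
// by Tokenise, using the EOF sentinel defined elsewhere in this package:
//
//	it, err := lexer.Tokenise(nil, "1 + 2\n")
//	if err != nil {
//		// handle the error
//	}
//	for t := it(); t != EOF; t = it() {
//		fmt.Printf("%s %q\n", t.Type, t.Value)
//	}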

// matchRules returns the index, rule, and group matches of the first rule that
// matches text at pos; groups is nil if no rule matched.
func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string) {
	for i, rule := range rules {
		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
		if match != nil && err == nil && match.Index == pos {
			groups := []string{}
			for _, g := range match.Groups() {
				groups = append(groups, g.String())
			}
			return i, rule, groups
		}
	}
	return 0, &CompiledRule{}, nil
}