regexp.go

package chroma

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/dlclark/regexp2"
)

// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}

// Words creates a regex that matches any of the given literal words.
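//
// Longer words are sorted first so that alternation prefers the longest match,
// and each word is escaped with regexp.QuoteMeta. A small sketch of the output:
//
//	Words(`\b`, `\b`, "if", "else") // => `\b(else|if)\b`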
func Words(prefix, suffix string, words ...string) string {
	sort.Slice(words, func(i, j int) bool {
		return len(words[j]) < len(words[i])
	})
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}

// Tokenise text using lexer, returning tokens as a slice.
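//
// A minimal usage sketch, assuming "lexer" is some Lexer value:
//
//	tokens, err := Tokenise(lexer, nil, "x = 1\n")
//	if err != nil {
//		return err
//	}
//	for _, t := range tokens {
//		fmt.Printf("%s %q\n", t.Type, t.Value)
//	}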
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
	var out []Token
	it, err := lexer.Tokenise(options, text)
	if err != nil {
		return nil, err
	}
	for t := it(); t != EOF; t = it() {
		out = append(out, t)
	}
	return out, nil
}

// Rules maps from state to a sequence of Rules.
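//
// A sketch of the shape, using the Push/Pop mutators and standard token types:
//
//	Rules{
//		"root": {
//			{`\s+`, Whitespace, nil},
//			{`"`, String, Push("string")},
//			{`\w+`, Name, nil},
//		},
//		"string": {
//			{`[^"]+`, String, nil},
//			{`"`, String, Pop(1)},
//		},
//	}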
type Rules map[string][]Rule

// Rename clones the rules, then renames the state oldRule to newRule.
func (r Rules) Rename(oldRule, newRule string) Rules {
	r = r.Clone()
	r[newRule] = r[oldRule]
	delete(r, oldRule)
	return r
}

// Clone returns a clone of the Rules.
func (r Rules) Clone() Rules {
	out := map[string][]Rule{}
	for key, rules := range r {
		out[key] = make([]Rule, len(rules))
		copy(out[key], rules)
	}
	return out
}

// Merge creates a clone of "r" then merges "rules" into the clone.
//
// States in "rules" replace same-named states in the clone wholesale; rule
// slices are not appended.
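//
// A sketch, assuming "base" and "overrides" are defined elsewhere:
//
//	merged := base.Merge(Rules{"string": overrides})
//	// merged has all of base's states, except "string" is exactly overrides.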
func (r Rules) Merge(rules Rules) Rules {
	out := r.Clone()
	for k, v := range rules.Clone() {
		out[k] = v
	}
	return out
}

// MustNewLexer creates a new Lexer with deferred rules generation or panics.
func MustNewLexer(config *Config, rules func() Rules) *RegexLexer {
	lexer, err := NewLexer(config, rules)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rulesFunc" returns the state machine transition map. Each key is a state. Values are
// sequences of rules that match input, optionally modify lexer state, and output tokens.
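//
// A hypothetical sketch of a minimal lexer definition:
//
//	var ini = MustNewLexer(&Config{
//		Name:      "INI",
//		Aliases:   []string{"ini"},
//		Filenames: []string{"*.ini"},
//	}, func() Rules {
//		return Rules{
//			"root": {
//				{`\s+`, Whitespace, nil},
//				{`;.*`, Comment, nil},
//				{`\[.*?\]`, Keyword, nil},
//				{`([^=\n]+?)(\s*=\s*)([^\n]*)`, ByGroups(Name, Operator, String), nil},
//			},
//		}
//	})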
func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
	if config == nil {
		config = &Config{}
	}
	for _, glob := range append(config.Filenames, config.AliasFilenames...) {
		_, err := filepath.Match(glob, "")
		if err != nil {
			return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
		}
	}
	r := &RegexLexer{
		config:         config,
		fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
	}
	// One-off code to generate XML lexers in the Chroma source tree.
	// var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
	// name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
	// data, err := Marshal(r)
	// if err != nil {
	// 	if errors.Is(err, ErrNotSerialisable) {
	// 		fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
	// 		return r, nil
	// 	}
	// 	return nil, err
	// }
	// _, file, _, ok := runtime.Caller(2)
	// if !ok {
	// 	panic("??")
	// }
	// fmt.Println(file)
	// if strings.Contains(file, "/lexers/") {
	// 	dir := filepath.Join(filepath.Dir(file), "embedded")
	// 	err = os.MkdirAll(dir, 0700)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// 	filename := filepath.Join(dir, name) + ".xml"
	// 	fmt.Println(filename)
	// 	err = ioutil.WriteFile(filename, data, 0600)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// }
	return r, nil
}

// Trace enables debug tracing.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}

// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules is a map of state name to the sequence of compiled rules in that state.
type CompiledRules map[string][]*CompiledRule

// LexerState contains the state for a single lex.
type LexerState struct {
	Lexer    *RegexLexer
	Registry *LexerRegistry
	Text     []rune
	Pos      int
	Rules    CompiledRules
	Stack    []string
	State    string
	Rule     int
	// Group matches.
	Groups []string
	// Named Group matches.
	NamedGroups map[string]string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator
	options        *TokeniseOptions
	newlineAdded   bool
}

// Set mutator context.
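//
// A sketch of a Mutator that uses the context (the "depth" key is hypothetical):
//
//	MutatorFunc(func(l *LexerState) error {
//		depth, _ := l.Get("depth").(int)
//		l.Set("depth", depth+1)
//		return nil
//	})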
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get mutator context.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}

// Iterator returns the next Token from the lexer. It returns EOF once the input is exhausted.
func (l *LexerState) Iterator() Token { // nolint: gocognit
	end := len(l.Text)
	if l.newlineAdded {
		end--
	}
	for l.Pos < end && len(l.Stack) > 0 {
		// Exhaust the iterator stack, if any.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t.Type == Ignore {
				continue
			}
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}

		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
		// No match.
		if groups == nil {
			// From Pygments :\
			//
			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
			// emptied and the lexer continues scanning in the 'root' state. This can help produce
			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
			// closed.
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.NamedGroups = namedGroups
		l.Pos += utf8.RuneCountInString(groups[0])
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
		}
	}
	// Exhaust the iterator stack, if any.
	// Duplicate code, but eh.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t.Type == Ignore {
			continue
		}
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}

	// If we get to here and we still have text, return it as an error.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}

// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
	config   *Config
	analyser func(text string) float32
	trace    bool

	mu             sync.Mutex
	compiled       bool
	rawRules       Rules
	rules          map[string][]*CompiledRule
	fetchRulesFunc func() (Rules, error)
	compileOnce    sync.Once
}

func (r *RegexLexer) String() string {
	return r.config.Name
}

// Rules returns the rules in the Lexer.
func (r *RegexLexer) Rules() (Rules, error) {
	if err := r.needRules(); err != nil {
		return nil, err
	}
	return r.rawRules, nil
}

// SetRegistry sets the registry the lexer will use to look up other lexers if necessary.
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
	r.registry = registry
	return r
}

// SetAnalyser sets the analyser function used to perform content inspection.
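//
// A sketch of an analyser that strongly claims shell scripts:
//
//	lexer.SetAnalyser(func(text string) float32 {
//		if strings.HasPrefix(text, "#!/bin/sh") {
//			return 1.0
//		}
//		return 0.0
//	})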
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
	r.analyser = analyser
	return r
}

// AnalyseText scores how likely a fragment of text is to match this lexer, between 0.0 and 1.0.
func (r *RegexLexer) AnalyseText(text string) float32 {
	if r.analyser != nil {
		return r.analyser(text)
	}
	return 0
}

// SetConfig replaces the Config for this Lexer.
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
	r.config = config
	return r
}

// Config returns the Config for this Lexer.
func (r *RegexLexer) Config() *Config {
	return r.config
}

// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
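//
// For illustration, a rule with Pattern `foo` and flags "mi" is compiled as
// the regexp2 pattern:
//
//	\G(?mi)(?:foo)
//
// where \G anchors the match at the current lexer position.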
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, 0)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
restart:
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}

func (r *RegexLexer) fetchRules() error {
	rules, err := r.fetchRulesFunc()
	if err != nil {
		return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
	}
	if _, ok := rules["root"]; !ok {
		return fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]*CompiledRule{}
	for state, rules := range rules {
		compiledRules[state] = nil
		for _, rule := range rules {
			flags := ""
			if !r.config.NotMultiline {
				flags += "m"
			}
			if r.config.CaseInsensitive {
				flags += "i"
			}
			if r.config.DotAll {
				flags += "s"
			}
			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
		}
	}

	r.rawRules = rules
	r.rules = compiledRules
	return nil
}

func (r *RegexLexer) needRules() error {
	var err error
	if r.fetchRulesFunc != nil {
		r.compileOnce.Do(func() {
			err = r.fetchRules()
		})
	}
	if err := r.maybeCompile(); err != nil {
		return err
	}
	return err
}

// Tokenise text using lexer, returning an iterator.
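//
// A minimal consumption sketch, assuming "lexer" and "source" are defined:
//
//	it, err := lexer.Tokenise(nil, source)
//	if err != nil {
//		return err
//	}
//	for t := it(); t != EOF; t = it() {
//		fmt.Print(t.Value)
//	}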
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
	err := r.needRules()
	if err != nil {
		return nil, err
	}
	if options == nil {
		options = defaultOptions
	}
	if options.EnsureLF {
		text = ensureLF(text)
	}
	newlineAdded := false
	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
		text += "\n"
		newlineAdded = true
	}
	state := &LexerState{
		Registry:       r.registry,
		newlineAdded:   newlineAdded,
		options:        options,
		Lexer:          r,
		Text:           []rune(text),
		Stack:          []string{options.State},
		Rules:          r.rules,
		MutatorContext: map[interface{}]interface{}{},
	}
	return state.Iterator, nil
}

// MustRules is like Rules() but will panic on error.
func (r *RegexLexer) MustRules() Rules {
	rules, err := r.Rules()
	if err != nil {
		panic(err)
	}
	return rules
}

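// matchRules finds the first rule in "rules" whose regex matches text at
// exactly pos, returning the rule's index, the rule itself, its group matches
// and its named group matches. groups is nil if no rule matched.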
func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
	for i, rule := range rules {
		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
		if match != nil && err == nil && match.Index == pos {
			groups := []string{}
			namedGroups := make(map[string]string)
			for _, g := range match.Groups() {
				namedGroups[g.Name] = g.String()
				groups = append(groups, g.String())
			}
			return i, rule, groups, namedGroups
		}
	}
	return 0, &CompiledRule{}, nil, nil
}

// ensureLF replaces \r\n and lone \r with \n.
// Equivalent to chained strings.ReplaceAll calls, but in a single pass.
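// For example, "a\r\nb\rc" becomes "a\nb\nc".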
func ensureLF(text string) string {
	buf := make([]byte, len(text))
	var j int
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c == '\r' {
			if i < len(text)-1 && text[i+1] == '\n' {
				continue
			}
			c = '\n'
		}
		buf[j] = c
		j++
	}
	return string(buf[:j])
}