package chroma

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/dlclark/regexp2"
)

// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}

// Words creates a regex that matches any of the given literal words.
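//
// A usage sketch (the words are illustrative):
//
//	Words(`\b`, `\b`, "in", "inline") // yields `\b(inline|in)\b`
//
// Words sorts the words longest-first (in place), so that with leftmost-first
// alternation a word is never shadowed by a shorter word that prefixes it.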
func Words(prefix, suffix string, words ...string) string {
	sort.Slice(words, func(i, j int) bool {
		return len(words[j]) < len(words[i])
	})
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}

// Tokenise text using lexer, returning tokens as a slice.
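//
// A usage sketch ("lexer" and "source" are assumed to be in scope):
//
//	tokens, err := Tokenise(lexer, nil, source)
//	if err != nil {
//		return err
//	}
//	for _, t := range tokens {
//		fmt.Println(t.Type, t.Value)
//	}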
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
	var out []Token
	it, err := lexer.Tokenise(options, text)
	if err != nil {
		return nil, err
	}
	for t := it(); t != EOF; t = it() {
		out = append(out, t)
	}
	return out, nil
}

// Rules maps from state to a sequence of Rules.
type Rules map[string][]Rule

// Rename clones the rules, then renames the state oldRule to newRule.
func (r Rules) Rename(oldRule, newRule string) Rules {
	r = r.Clone()
	r[newRule] = r[oldRule]
	delete(r, oldRule)
	return r
}

// Clone returns a clone of the Rules.
func (r Rules) Clone() Rules {
	out := map[string][]Rule{}
	for key, rules := range r {
		out[key] = make([]Rule, len(rules))
		copy(out[key], rules)
	}
	return out
}

// Merge creates a clone of "r" then merges "rules" into the clone.
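//
// A sketch (the state name and rule are illustrative); note that a state in
// "rules" replaces a same-named state in the clone wholesale rather than
// appending to it:
//
//	merged := base.Merge(Rules{
//		"root": {{`\s+`, Whitespace, nil}},
//	})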
func (r Rules) Merge(rules Rules) Rules {
	out := r.Clone()
	for k, v := range rules.Clone() {
		out[k] = v
	}
	return out
}

// MustNewLexer creates a new Lexer with deferred rules generation or panics.
func MustNewLexer(config *Config, rules func() Rules) *RegexLexer {
	lexer, err := NewLexer(config, rules)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
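//
// A minimal sketch of a rules function (the patterns, token types and
// mutators are illustrative); a "root" state is required:
//
//	func() Rules {
//		return Rules{
//			"root": {
//				{`"`, String, Push("string")},
//				{`\s+`, Whitespace, nil},
//				{`[^"\s]+`, Text, nil},
//			},
//			"string": {
//				{`"`, String, Pop(1)},
//				{`[^"]+`, String, nil},
//			},
//		}
//	}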
func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
	if config == nil {
		config = &Config{}
	}
	for _, glob := range append(config.Filenames, config.AliasFilenames...) {
		_, err := filepath.Match(glob, "")
		if err != nil {
			return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
		}
	}
	r := &RegexLexer{
		config:         config,
		fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
	}
	// One-off code to generate XML lexers in the Chroma source tree.
	// var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
	// name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
	// data, err := Marshal(r)
	// if err != nil {
	// 	if errors.Is(err, ErrNotSerialisable) {
	// 		fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
	// 		return r, nil
	// 	}
	// 	return nil, err
	// }
	// _, file, _, ok := runtime.Caller(2)
	// if !ok {
	// 	panic("??")
	// }
	// fmt.Println(file)
	// if strings.Contains(file, "/lexers/") {
	// 	dir := filepath.Join(filepath.Dir(file), "embedded")
	// 	err = os.MkdirAll(dir, 0700)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// 	filename := filepath.Join(dir, name) + ".xml"
	// 	fmt.Println(filename)
	// 	err = ioutil.WriteFile(filename, data, 0600)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// }
	return r, nil
}

// Trace enables debug tracing.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}

// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules is a map of state name to the sequence of compiled rules in that state.
type CompiledRules map[string][]*CompiledRule

// LexerState contains the state for a single lex.
type LexerState struct {
	Lexer    *RegexLexer
	Registry *LexerRegistry
	Text     []rune
	Pos      int
	Rules    CompiledRules
	Stack    []string
	State    string
	Rule     int
	// Group matches.
	Groups []string
	// Named Group matches.
	NamedGroups map[string]string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator
	options        *TokeniseOptions
	newlineAdded   bool
}

// Set mutator context.
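//
// A hedged sketch of a Mutator tracking nesting depth via the context
// (assumes a MutatorFunc-style adapter; the "depth" key is illustrative):
//
//	MutatorFunc(func(s *LexerState) error {
//		depth, _ := s.Get("depth").(int)
//		s.Set("depth", depth+1)
//		return nil
//	})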
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get mutator context.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}

// Iterator returns the next Token from the lexer.
func (l *LexerState) Iterator() Token { // nolint: gocognit
	end := len(l.Text)
	if l.newlineAdded {
		end--
	}
	for l.Pos < end && len(l.Stack) > 0 {
		// Exhaust the iterator stack, if any.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t.Type == Ignore {
				continue
			}
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}

		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
		// No match.
		if groups == nil {
			// From Pygments :\
			//
			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
			// emptied and the lexer continues scanning in the 'root' state. This can help producing
			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
			// closed.
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.NamedGroups = namedGroups
		l.Pos += utf8.RuneCountInString(groups[0])
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
		}
	}
	// Exhaust the iterator stack, if any.
	// Duplicate code, but eh.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t.Type == Ignore {
			continue
		}
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}

	// If we get to here and we still have text, return it as an error.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}

// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
	config   *Config
	analyser func(text string) float32
	trace    bool

	mu             sync.Mutex
	compiled       bool
	rawRules       Rules
	rules          map[string][]*CompiledRule
	fetchRulesFunc func() (Rules, error)
	compileOnce    sync.Once
}

func (r *RegexLexer) String() string {
	return r.config.Name
}

// Rules in the Lexer.
func (r *RegexLexer) Rules() (Rules, error) {
	if err := r.needRules(); err != nil {
		return nil, err
	}
	return r.rawRules, nil
}

// SetRegistry sets the registry the lexer will use to look up other lexers if necessary.
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
	r.registry = registry
	return r
}

// SetAnalyser sets the analyser function used to perform content inspection.
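//
// A sketch (the shebang heuristic is illustrative):
//
//	lexer.SetAnalyser(func(text string) float32 {
//		if strings.HasPrefix(text, "#!") {
//			return 0.8
//		}
//		return 0
//	})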
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
	r.analyser = analyser
	return r
}

// AnalyseText scores how likely a fragment of text is to match this lexer, between 0.0 and 1.0.
func (r *RegexLexer) AnalyseText(text string) float32 {
	if r.analyser != nil {
		return r.analyser(text)
	}
	return 0
}

// SetConfig replaces the Config for this Lexer.
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
	r.config = config
	return r
}

// Config returns the Config for this Lexer.
func (r *RegexLexer) Config() *Config {
	return r.config
}

// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
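				// `\G` anchors the match at the current search position, so
				// FindRunesMatchStartingAt can only match exactly at the given
				// offset (a .NET-style anchor supported by regexp2 but not by
				// the standard library's regexp).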
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, 0)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
restart:
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}

func (r *RegexLexer) fetchRules() error {
	rules, err := r.fetchRulesFunc()
	if err != nil {
		return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
	}
	if _, ok := rules["root"]; !ok {
		return fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]*CompiledRule{}
	for state, rules := range rules {
		compiledRules[state] = nil
		for _, rule := range rules {
			flags := ""
			if !r.config.NotMultiline {
				flags += "m"
			}
			if r.config.CaseInsensitive {
				flags += "i"
			}
			if r.config.DotAll {
				flags += "s"
			}
			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
		}
	}

	r.rawRules = rules
	r.rules = compiledRules
	return nil
}

func (r *RegexLexer) needRules() error {
	var err error
	if r.fetchRulesFunc != nil {
		r.compileOnce.Do(func() {
			err = r.fetchRules()
		})
	}
	if err := r.maybeCompile(); err != nil {
		return err
	}
	return err
}

// Tokenise text using lexer, returning an iterator.
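//
// A usage sketch (the options shown are the fields this method consults;
// passing nil uses the defaults):
//
//	it, err := lexer.Tokenise(&TokeniseOptions{State: "root", EnsureLF: true}, source)
//	if err != nil {
//		return err
//	}
//	for t := it(); t != EOF; t = it() {
//		fmt.Println(t.Type, t.Value)
//	}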
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
	err := r.needRules()
	if err != nil {
		return nil, err
	}
	if options == nil {
		options = defaultOptions
	}
	if options.EnsureLF {
		text = ensureLF(text)
	}
	newlineAdded := false
	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
		text += "\n"
		newlineAdded = true
	}
	state := &LexerState{
		Registry:       r.registry,
		newlineAdded:   newlineAdded,
		options:        options,
		Lexer:          r,
		Text:           []rune(text),
		Stack:          []string{options.State},
		Rules:          r.rules,
		MutatorContext: map[interface{}]interface{}{},
	}
	return state.Iterator, nil
}

// MustRules is like Rules() but will panic on error.
func (r *RegexLexer) MustRules() Rules {
	rules, err := r.Rules()
	if err != nil {
		panic(err)
	}
	return rules
}

func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
	for i, rule := range rules {
		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
		if match != nil && err == nil && match.Index == pos {
			groups := []string{}
			namedGroups := make(map[string]string)
			for _, g := range match.Groups() {
				namedGroups[g.Name] = g.String()
				groups = append(groups, g.String())
			}
			return i, rule, groups, namedGroups
		}
	}
	return 0, &CompiledRule{}, nil, nil
}

// ensureLF replaces \r and \r\n with \n.
// It is equivalent to successive strings.ReplaceAll calls, but runs in a single pass.
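//
// For example (a sketch):
//
//	ensureLF("a\r\nb\rc") // => "a\nb\nc"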
func ensureLF(text string) string {
	buf := make([]byte, len(text))
	var j int
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c == '\r' {
			if i < len(text)-1 && text[i+1] == '\n' {
				continue
			}
			c = '\n'
		}
		buf[j] = c
		j++
	}
	return string(buf[:j])
}