package chroma

import (
	"fmt"
	"os"
	"regexp"
	"strings"
	"sync"
	"unicode/utf8"

	"github.com/dlclark/regexp2"
)

// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}

// An Emitter takes group matches and returns tokens.
type Emitter interface {
	// Emit tokens for the given regex groups.
	Emit(groups []string, lexer Lexer) Iterator
}

// EmitterFunc is a function that is an Emitter.
type EmitterFunc func(groups []string, lexer Lexer) Iterator

// Emit tokens for groups.
func (e EmitterFunc) Emit(groups []string, lexer Lexer) Iterator { return e(groups, lexer) }
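
// Illustrative sketch (not part of this file): a custom EmitterFunc that
// emits the whole match as a single token, assuming the Literator helper and
// Keyword token type defined elsewhere in this package:
//
//	// Sketch only: emit groups[0] (the full match) as one Keyword token.
//	var keywordEmitter = EmitterFunc(func(groups []string, _ Lexer) Iterator {
//		return Literator(Token{Type: Keyword, Value: groups[0]})
//	})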

// ByGroups emits a token for each matching group in the rule's regex.
func ByGroups(emitters ...Emitter) Emitter {
	return EmitterFunc(func(groups []string, lexer Lexer) Iterator {
		iterators := make([]Iterator, 0, len(groups)-1)
		if len(emitters) != len(groups)-1 {
			iterators = append(iterators, Error.Emit(groups, lexer))
			// panic(errors.Errorf("number of groups %q does not match number of emitters %v", groups, emitters))
		} else {
			for i, group := range groups[1:] {
				iterators = append(iterators, emitters[i].Emit([]string{group}, lexer))
			}
		}
		return Concaterator(iterators...)
	})
}
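
// Illustrative sketch (an assumed rule, not from this file): a regex with two
// capture groups and one emitter per group, using the Keyword and Whitespace
// token types defined elsewhere in this package:
//
//	// Matches a keyword followed by trailing whitespace.
//	{`(if|else|for)(\s+)`, ByGroups(Keyword, Whitespace), nil}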

// UsingByGroup emits tokens for the matched groups in the regex using a
// "sublexer". Used when lexing code blocks where the name of a sublexer is
// contained within the block, for example in a Markdown text block or SQL
// language block.
//
// The sublexer will be retrieved using sublexerGetFunc (typically
// internal.Get), using the captured value from the matched sublexerNameGroup.
//
// If sublexerGetFunc returns a non-nil lexer for the captured sublexerNameGroup,
// then tokens for the matched codeGroup will be emitted using the retrieved
// lexer. Otherwise, if the sublexer is nil, then tokens will be emitted from
// the passed emitter.
//
// Example:
//
//	var Markdown = internal.Register(MustNewLexer(
//		&Config{
//			Name:      "markdown",
//			Aliases:   []string{"md", "mkd"},
//			Filenames: []string{"*.md", "*.mkd", "*.markdown"},
//			MimeTypes: []string{"text/x-markdown"},
//		},
//		Rules{
//			"root": {
//				{"^(```)(\\w+)(\\n)([\\w\\W]*?)(^```$)",
//					UsingByGroup(
//						internal.Get,
//						2, 4,
//						String, String, String, Text, String,
//					),
//					nil,
//				},
//			},
//		},
//	))
//
// See lexers/m/markdown.go for the complete example.
//
// Note: panics if the number of emitters does not equal the number of matched
// groups in the regex.
func UsingByGroup(sublexerGetFunc func(string) Lexer, sublexerNameGroup, codeGroup int, emitters ...Emitter) Emitter {
	return EmitterFunc(func(groups []string, lexer Lexer) Iterator {
		// bounds check
		if len(emitters) != len(groups)-1 {
			panic("UsingByGroup expects number of emitters to be the same as len(groups)-1")
		}

		// grab sublexer
		sublexer := sublexerGetFunc(groups[sublexerNameGroup])

		// build iterators
		iterators := make([]Iterator, len(groups)-1)
		for i, group := range groups[1:] {
			if i == codeGroup-1 && sublexer != nil {
				var err error
				iterators[i], err = sublexer.Tokenise(nil, groups[codeGroup])
				if err != nil {
					panic(err)
				}
			} else {
				iterators[i] = emitters[i].Emit([]string{group}, lexer)
			}
		}

		return Concaterator(iterators...)
	})
}

// Using returns an Emitter that uses a given Lexer for parsing and emitting.
func Using(lexer Lexer) Emitter {
	return EmitterFunc(func(groups []string, _ Lexer) Iterator {
		it, err := lexer.Tokenise(&TokeniseOptions{State: "root", Nested: true}, groups[0])
		if err != nil {
			panic(err)
		}
		return it
	})
}
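
// Illustrative sketch (an assumption, not from this file): delegating an
// embedded region of input to another lexer; jsLexer here is a hypothetical
// Lexer variable:
//
//	// Sketch only: hand the whole matched region to jsLexer.
//	{`<script>[\w\W]*?</script>`, Using(jsLexer), nil}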

// UsingSelf is like Using, but uses the current Lexer.
func UsingSelf(state string) Emitter {
	return EmitterFunc(func(groups []string, lexer Lexer) Iterator {
		it, err := lexer.Tokenise(&TokeniseOptions{State: state, Nested: true}, groups[0])
		if err != nil {
			panic(err)
		}
		return it
	})
}
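
// Illustrative sketch (a deliberately simplified assumption): re-lexing an
// interpolated expression with this lexer's own "root" state; note the
// delimiters are passed through to the sublex as well:
//
//	{`\$\{[^}]*\}`, UsingSelf("root"), nil}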

// Words creates a regex that matches any of the given literal words.
func Words(prefix, suffix string, words ...string) string {
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}
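
// For example, Words(`\b`, `\b`, "if", "else", "for") returns the pattern
// `\b(if|else|for)\b`, with each word quoted via regexp.QuoteMeta.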

// Tokenise text using lexer, returning tokens as a slice.
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
	var out []Token
	it, err := lexer.Tokenise(options, text)
	if err != nil {
		return nil, err
	}
	for t := it(); t != EOF; t = it() {
		out = append(out, t)
	}
	return out, nil
}
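
// Illustrative sketch (assumed caller code): collecting every token for a
// snippet with some previously constructed lexer:
//
//	tokens, err := Tokenise(lexer, nil, "x := 1\n")
//	if err != nil {
//		// handle the error
//	}
//	for _, t := range tokens {
//		fmt.Println(t.Type, t.Value)
//	}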

// Rules maps from state to a sequence of Rules.
type Rules map[string][]Rule

// Clone returns a clone of the Rules.
func (r Rules) Clone() Rules {
	out := map[string][]Rule{}
	for key, rules := range r {
		out[key] = make([]Rule, len(rules))
		copy(out[key], rules)
	}
	return out
}

// MustNewLexer creates a new Lexer or panics.
func MustNewLexer(config *Config, rules Rules) *RegexLexer {
	lexer, err := NewLexer(config, rules)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state name. Values are
// sequences of rules that match input, optionally modify lexer state, and output tokens.
func NewLexer(config *Config, rules Rules) (*RegexLexer, error) {
	if config == nil {
		config = &Config{}
	}
	if _, ok := rules["root"]; !ok {
		return nil, fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]*CompiledRule{}
	for state, rules := range rules {
		compiledRules[state] = nil
		for _, rule := range rules {
			flags := ""
			if !config.NotMultiline {
				flags += "m"
			}
			if config.CaseInsensitive {
				flags += "i"
			}
			if config.DotAll {
				flags += "s"
			}
			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
		}
	}
	return &RegexLexer{
		config: config,
		rules:  compiledRules,
	}, nil
}
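
// Illustrative sketch (an assumption, echoing the Markdown example above): a
// minimal lexer with a single "root" state, using token types defined
// elsewhere in this package:
//
//	lexer, err := NewLexer(
//		&Config{Name: "example"},
//		Rules{
//			"root": {
//				{`\d+`, Number, nil},     // runs of digits
//				{`\s+`, Whitespace, nil}, // whitespace
//				{`\w+`, Name, nil},       // everything else word-like
//			},
//		},
//	)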

// Trace enables debug tracing.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}

// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules maps each state name to the sequence of compiled rules for that state.
type CompiledRules map[string][]*CompiledRule

// LexerState contains the state for a single lex.
type LexerState struct {
	Lexer *RegexLexer
	Text  []rune
	Pos   int
	Rules CompiledRules
	Stack []string
	State string
	Rule  int
	// Group matches.
	Groups []string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator
	options        *TokeniseOptions
}

// Set mutator context.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get mutator context.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}
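
// Illustrative sketch (an assumption): a MutatorFunc that stashes the full
// text of the current match in the mutator context so a later rule can
// inspect it; the "lastMatch" key is hypothetical:
//
//	MutatorFunc(func(state *LexerState) error {
//		state.Set("lastMatch", state.Groups[0])
//		return nil
//	})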

// Iterator returns the next Token from the lexer.
func (l *LexerState) Iterator() Token { // nolint: gocognit
	for l.Pos < len(l.Text) && len(l.Stack) > 0 {
		// Exhaust the iterator stack, if any.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}

		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups := matchRules(l.Text, l.Pos, selectedRule)
		// No match.
		if groups == nil {
			// From Pygments :\
			//
			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
			// emptied and the lexer continues scanning in the 'root' state. This can help produce
			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
			// closed.
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.Pos += utf8.RuneCountInString(groups[0])
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l.Lexer))
		}
	}
	// Exhaust the iterator stack, if any.
	// Duplicate code, but eh.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}

	// If we get to here and we still have text, return it as an error.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}

// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	config   *Config
	analyser func(text string) float32
	trace    bool

	mu       sync.Mutex
	compiled bool
	rules    map[string][]*CompiledRule
}

// SetAnalyser sets the analyser function used to perform content inspection.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) *RegexLexer {
	r.analyser = analyser
	return r
}

// AnalyseText delegates to the analyser set via SetAnalyser, returning 0.0 if none is set.
func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
	if r.analyser != nil {
		return r.analyser(text)
	}
	return 0.0
}

// Config returns the lexer's Config.
func (r *RegexLexer) Config() *Config { // nolint
	return r.config
}

// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, 0)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
			}
		}
	}
restart:
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}

// Tokenise tokenises text, compiling this lexer's regexes on first use.
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
	if err := r.maybeCompile(); err != nil {
		return nil, err
	}
	if options == nil {
		options = defaultOptions
	}
	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
		text += "\n"
	}
	state := &LexerState{
		options:        options,
		Lexer:          r,
		Text:           []rune(text),
		Stack:          []string{options.State},
		Rules:          r.rules,
		MutatorContext: map[interface{}]interface{}{},
	}
	return state.Iterator, nil
}
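
// Illustrative sketch (assumed caller code): draining the iterator returned
// by Tokenise, using the EOF sentinel defined elsewhere in this package:
//
//	it, err := lexer.Tokenise(nil, "1 + 2\n")
//	if err != nil {
//		// handle the error
//	}
//	for t := it(); t != EOF; t = it() {
//		fmt.Printf("%s %q\n", t.Type, t.Value)
//	}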

// matchRules returns the index, rule, and group matches of the first rule that
// matches text at pos; groups is nil if no rule matched.
func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string) {
	for i, rule := range rules {
		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
		if match != nil && err == nil && match.Index == pos {
			groups := []string{}
			for _, g := range match.Groups() {
				groups = append(groups, g.String())
			}
			return i, rule, groups
		}
	}
	return 0, &CompiledRule{}, nil
}