parser_decode.go

  1package ansi
  2
  3import (
  4	"unicode/utf8"
  5
  6	"github.com/charmbracelet/x/ansi/parser"
  7	"github.com/mattn/go-runewidth"
  8	"github.com/rivo/uniseg"
  9)
 10
// State represents the state of the ANSI escape sequence parser used by
// [DecodeSequence]. It is an alias for byte, so a zero-valued byte is a valid
// initial state ([NormalState]).
type State = byte
 14
// ANSI escape sequence states used by [DecodeSequence].
const (
	// NormalState is the initial (zero) state: no sequence is in progress.
	NormalState State = iota
	// PrefixState follows a CSI/DCS introducer; prefix bytes ('<' to '?') are
	// collected here.
	PrefixState
	// ParamsState collects numeric parameters and their ';' / ':' separators.
	ParamsState
	// IntermedState collects intermediate bytes (' ' to '/') and expects the
	// final byte of a CSI/DCS sequence.
	IntermedState
	// EscapeState follows an ESC byte.
	EscapeState
	// StringState collects the data of a string sequence (OSC, DCS, APC, SOS,
	// PM) until it is terminated or cancelled.
	StringState
)
 24
// DecodeSequence decodes the first ANSI escape sequence or a printable
// grapheme from the given data. It returns, in this order, the sequence
// slice, the cell width of the sequence, the number of bytes read, and the
// new state.
//
// The cell width will always be 0 for control and escape sequences, 1 for
// ASCII printable characters, and the number of cells other Unicode characters
// occupy. It uses the uniseg package to calculate the width of Unicode
// graphemes and characters. This means it will always do grapheme clustering
// (mode 2027).
//
// Passing a non-nil [*Parser] as the last argument will allow the decoder to
// collect sequence parameters, data, and commands. The parser cmd will have
// the packed command value that contains intermediate and prefix characters.
// In the case of a OSC sequence, the cmd will be the OSC command number. Use
// [Cmd] and [Param] types to unpack command intermediates and prefixes as well
// as parameters.
//
// Zero [Cmd] means the CSI, DCS, or ESC sequence is invalid. Moreover, checking the
// validity of other data sequences, OSC, DCS, etc, will require checking for
// the returned sequence terminator bytes such as ST (ESC \\) and BEL.
//
// The command, its prefix, and its intermediate are packed into a single int
// to avoid using a struct. The command (final) byte is always the least
// significant byte i.e. [Cmd & 0xff]; the prefix byte is stored shifted by
// [parser.PrefixShift] and the intermediate byte shifted by
// [parser.IntermedShift]. Use the [Cmd] type to unpack the command,
// intermediate, and prefix bytes. Note that we only collect the last prefix
// character and intermediate byte.
//
// The [p.Params] slice will contain the parameters of the sequence. Any
// sub-parameter will have the [parser.HasMoreFlag] set. Use the [Param] type
// to unpack the parameters.
//
// Example:
//
//	var state byte // the initial state is always zero [NormalState]
//	p := NewParser(32, 1024) // create a new parser with a 32 params buffer and 1024 data buffer (optional)
//	input := []byte("\x1b[31mHello, World!\x1b[0m")
//	for len(input) > 0 {
//		seq, width, n, newState := DecodeSequence(input, state, p)
//		log.Printf("seq: %q, width: %d", seq, width)
//		state = newState
//		input = input[n:]
//	}
//
// This function treats the text as a sequence of grapheme clusters.
func DecodeSequence[T string | []byte](b T, state byte, p *Parser) (seq T, width int, n int, newState byte) {
	return decodeSequence(GraphemeWidth, b, state, p)
}
 74
// DecodeSequenceWc decodes the first ANSI escape sequence or a printable
// grapheme from the given data. It returns, in this order, the sequence
// slice, the cell width of the sequence, the number of bytes read, and the
// new state.
//
// The cell width will always be 0 for control and escape sequences, 1 for
// ASCII printable characters, and the number of cells other Unicode characters
// occupy. Text is still split at the first grapheme cluster, but the width of
// that cluster is calculated with the go-runewidth package
// (runewidth.StringWidth) instead of uniseg.
//
// Passing a non-nil [*Parser] as the last argument will allow the decoder to
// collect sequence parameters, data, and commands. The parser cmd will have
// the packed command value that contains intermediate and prefix characters.
// In the case of a OSC sequence, the cmd will be the OSC command number. Use
// [Cmd] and [Param] types to unpack command intermediates and prefixes as well
// as parameters.
//
// Zero [Cmd] means the CSI, DCS, or ESC sequence is invalid. Moreover, checking the
// validity of other data sequences, OSC, DCS, etc, will require checking for
// the returned sequence terminator bytes such as ST (ESC \\) and BEL.
//
// The command, its prefix, and its intermediate are packed into a single int
// to avoid using a struct. The command (final) byte is always the least
// significant byte i.e. [Cmd & 0xff]; the prefix byte is stored shifted by
// [parser.PrefixShift] and the intermediate byte shifted by
// [parser.IntermedShift]. Use the [Cmd] type to unpack the command,
// intermediate, and prefix bytes. Note that we only collect the last prefix
// character and intermediate byte.
//
// The [p.Params] slice will contain the parameters of the sequence. Any
// sub-parameter will have the [parser.HasMoreFlag] set. Use the [Param] type
// to unpack the parameters.
//
// Example:
//
//	var state byte // the initial state is always zero [NormalState]
//	p := NewParser(32, 1024) // create a new parser with a 32 params buffer and 1024 data buffer (optional)
//	input := []byte("\x1b[31mHello, World!\x1b[0m")
//	for len(input) > 0 {
//		seq, width, n, newState := DecodeSequenceWc(input, state, p)
//		log.Printf("seq: %q, width: %d", seq, width)
//		state = newState
//		input = input[n:]
//	}
//
// This function measures text using wide-character (wcwidth-style) widths.
func DecodeSequenceWc[T string | []byte](b T, state byte, p *Parser) (seq T, width int, n int, newState byte) {
	return decodeSequence(WcWidth, b, state, p)
}
124
125func decodeSequence[T string | []byte](m Method, b T, state State, p *Parser) (seq T, width int, n int, newState byte) {
126	for i := 0; i < len(b); i++ {
127		c := b[i]
128
129		switch state {
130		case NormalState:
131			switch c {
132			case ESC:
133				if p != nil {
134					if len(p.params) > 0 {
135						p.params[0] = parser.MissingParam
136					}
137					p.cmd = 0
138					p.paramsLen = 0
139					p.dataLen = 0
140				}
141				state = EscapeState
142				continue
143			case CSI, DCS:
144				if p != nil {
145					if len(p.params) > 0 {
146						p.params[0] = parser.MissingParam
147					}
148					p.cmd = 0
149					p.paramsLen = 0
150					p.dataLen = 0
151				}
152				state = PrefixState
153				continue
154			case OSC, APC, SOS, PM:
155				if p != nil {
156					p.cmd = parser.MissingCommand
157					p.dataLen = 0
158				}
159				state = StringState
160				continue
161			}
162
163			if p != nil {
164				p.dataLen = 0
165				p.paramsLen = 0
166				p.cmd = 0
167			}
168			if c > US && c < DEL {
169				// ASCII printable characters
170				return b[i : i+1], 1, 1, NormalState
171			}
172
173			if c <= US || c == DEL || c < 0xC0 {
174				// C0 & C1 control characters & DEL
175				return b[i : i+1], 0, 1, NormalState
176			}
177
178			if utf8.RuneStart(c) {
179				seq, _, width, _ = FirstGraphemeCluster(b, -1)
180				if m == WcWidth {
181					width = runewidth.StringWidth(string(seq))
182				}
183				i += len(seq)
184				return b[:i], width, i, NormalState
185			}
186
187			// Invalid UTF-8 sequence
188			return b[:i], 0, i, NormalState
189		case PrefixState:
190			if c >= '<' && c <= '?' {
191				if p != nil {
192					// We only collect the last prefix character.
193					p.cmd &^= 0xff << parser.PrefixShift
194					p.cmd |= int(c) << parser.PrefixShift
195				}
196				break
197			}
198
199			state = ParamsState
200			fallthrough
201		case ParamsState:
202			if c >= '0' && c <= '9' {
203				if p != nil {
204					if p.params[p.paramsLen] == parser.MissingParam {
205						p.params[p.paramsLen] = 0
206					}
207
208					p.params[p.paramsLen] *= 10
209					p.params[p.paramsLen] += int(c - '0')
210				}
211				break
212			}
213
214			if c == ':' {
215				if p != nil {
216					p.params[p.paramsLen] |= parser.HasMoreFlag
217				}
218			}
219
220			if c == ';' || c == ':' {
221				if p != nil {
222					p.paramsLen++
223					if p.paramsLen < len(p.params) {
224						p.params[p.paramsLen] = parser.MissingParam
225					}
226				}
227				break
228			}
229
230			state = IntermedState
231			fallthrough
232		case IntermedState:
233			if c >= ' ' && c <= '/' {
234				if p != nil {
235					p.cmd &^= 0xff << parser.IntermedShift
236					p.cmd |= int(c) << parser.IntermedShift
237				}
238				break
239			}
240
241			if p != nil {
242				// Increment the last parameter
243				if p.paramsLen > 0 && p.paramsLen < len(p.params)-1 ||
244					p.paramsLen == 0 && len(p.params) > 0 && p.params[0] != parser.MissingParam {
245					p.paramsLen++
246				}
247			}
248
249			if c >= '@' && c <= '~' {
250				if p != nil {
251					p.cmd &^= 0xff
252					p.cmd |= int(c)
253				}
254
255				if HasDcsPrefix(b) {
256					// Continue to collect DCS data
257					if p != nil {
258						p.dataLen = 0
259					}
260					state = StringState
261					continue
262				}
263
264				return b[:i+1], 0, i + 1, NormalState
265			}
266
267			// Invalid CSI/DCS sequence
268			return b[:i], 0, i, NormalState
269		case EscapeState:
270			switch c {
271			case '[', 'P':
272				if p != nil {
273					if len(p.params) > 0 {
274						p.params[0] = parser.MissingParam
275					}
276					p.paramsLen = 0
277					p.cmd = 0
278				}
279				state = PrefixState
280				continue
281			case ']', 'X', '^', '_':
282				if p != nil {
283					p.cmd = parser.MissingCommand
284					p.dataLen = 0
285				}
286				state = StringState
287				continue
288			}
289
290			if c >= ' ' && c <= '/' {
291				if p != nil {
292					p.cmd &^= 0xff << parser.IntermedShift
293					p.cmd |= int(c) << parser.IntermedShift
294				}
295				continue
296			} else if c >= '0' && c <= '~' {
297				if p != nil {
298					p.cmd &^= 0xff
299					p.cmd |= int(c)
300				}
301				return b[:i+1], 0, i + 1, NormalState
302			}
303
304			// Invalid escape sequence
305			return b[:i], 0, i, NormalState
306		case StringState:
307			switch c {
308			case BEL:
309				if HasOscPrefix(b) {
310					parseOscCmd(p)
311					return b[:i+1], 0, i + 1, NormalState
312				}
313			case CAN, SUB:
314				if HasOscPrefix(b) {
315					// Ensure we parse the OSC command number
316					parseOscCmd(p)
317				}
318
319				// Cancel the sequence
320				return b[:i], 0, i, NormalState
321			case ST:
322				if HasOscPrefix(b) {
323					// Ensure we parse the OSC command number
324					parseOscCmd(p)
325				}
326
327				return b[:i+1], 0, i + 1, NormalState
328			case ESC:
329				if HasStPrefix(b[i:]) {
330					if HasOscPrefix(b) {
331						// Ensure we parse the OSC command number
332						parseOscCmd(p)
333					}
334
335					// End of string 7-bit (ST)
336					return b[:i+2], 0, i + 2, NormalState
337				}
338
339				// Otherwise, cancel the sequence
340				return b[:i], 0, i, NormalState
341			}
342
343			if p != nil && p.dataLen < len(p.data) {
344				p.data[p.dataLen] = c
345				p.dataLen++
346
347				// Parse the OSC command number
348				if c == ';' && HasOscPrefix(b) {
349					parseOscCmd(p)
350				}
351			}
352		}
353	}
354
355	return b, 0, len(b), state
356}
357
358func parseOscCmd(p *Parser) {
359	if p == nil || p.cmd != parser.MissingCommand {
360		return
361	}
362	for j := range p.dataLen {
363		d := p.data[j]
364		if d < '0' || d > '9' {
365			break
366		}
367		if p.cmd == parser.MissingCommand {
368			p.cmd = 0
369		}
370		p.cmd *= 10
371		p.cmd += int(d - '0')
372	}
373}
374
// Equal reports whether a and b hold the same sequence of bytes. It works for
// both strings and byte slices.
func Equal[T string | []byte](a, b T) bool {
	if len(a) != len(b) {
		return false
	}
	return string(a) == string(b)
}
379
380// HasPrefix returns true if the given byte slice has prefix.
381func HasPrefix[T string | []byte](b, prefix T) bool {
382	return len(b) >= len(prefix) && Equal(b[0:len(prefix)], prefix)
383}
384
385// HasSuffix returns true if the given byte slice has suffix.
386func HasSuffix[T string | []byte](b, suffix T) bool {
387	return len(b) >= len(suffix) && Equal(b[len(b)-len(suffix):], suffix)
388}
389
390// HasCsiPrefix returns true if the given byte slice has a CSI prefix.
391func HasCsiPrefix[T string | []byte](b T) bool {
392	return (len(b) > 0 && b[0] == CSI) ||
393		(len(b) > 1 && b[0] == ESC && b[1] == '[')
394}
395
396// HasOscPrefix returns true if the given byte slice has an OSC prefix.
397func HasOscPrefix[T string | []byte](b T) bool {
398	return (len(b) > 0 && b[0] == OSC) ||
399		(len(b) > 1 && b[0] == ESC && b[1] == ']')
400}
401
402// HasApcPrefix returns true if the given byte slice has an APC prefix.
403func HasApcPrefix[T string | []byte](b T) bool {
404	return (len(b) > 0 && b[0] == APC) ||
405		(len(b) > 1 && b[0] == ESC && b[1] == '_')
406}
407
408// HasDcsPrefix returns true if the given byte slice has a DCS prefix.
409func HasDcsPrefix[T string | []byte](b T) bool {
410	return (len(b) > 0 && b[0] == DCS) ||
411		(len(b) > 1 && b[0] == ESC && b[1] == 'P')
412}
413
414// HasSosPrefix returns true if the given byte slice has a SOS prefix.
415func HasSosPrefix[T string | []byte](b T) bool {
416	return (len(b) > 0 && b[0] == SOS) ||
417		(len(b) > 1 && b[0] == ESC && b[1] == 'X')
418}
419
420// HasPmPrefix returns true if the given byte slice has a PM prefix.
421func HasPmPrefix[T string | []byte](b T) bool {
422	return (len(b) > 0 && b[0] == PM) ||
423		(len(b) > 1 && b[0] == ESC && b[1] == '^')
424}
425
426// HasStPrefix returns true if the given byte slice has a ST prefix.
427func HasStPrefix[T string | []byte](b T) bool {
428	return (len(b) > 0 && b[0] == ST) ||
429		(len(b) > 1 && b[0] == ESC && b[1] == '\\')
430}
431
432// HasEscPrefix returns true if the given byte slice has an ESC prefix.
433func HasEscPrefix[T string | []byte](b T) bool {
434	return len(b) > 0 && b[0] == ESC
435}
436
437// FirstGraphemeCluster returns the first grapheme cluster in the given string or byte slice.
438// This is a syntactic sugar function that wraps
439// uniseg.FirstGraphemeClusterInString and uniseg.FirstGraphemeCluster.
440func FirstGraphemeCluster[T string | []byte](b T, state int) (T, T, int, int) {
441	switch b := any(b).(type) {
442	case string:
443		cluster, rest, width, newState := uniseg.FirstGraphemeClusterInString(b, state)
444		return T(cluster), T(rest), width, newState
445	case []byte:
446		cluster, rest, width, newState := uniseg.FirstGraphemeCluster(b, state)
447		return T(cluster), T(rest), width, newState
448	}
449	panic("unreachable")
450}
451
// Cmd represents a sequence command. This is used to pack/unpack a sequence
// command with its intermediate and prefix characters. Those are commonly
// found in CSI and DCS sequences. Use the [Cmd.Prefix], [Cmd.Intermediate],
// and [Cmd.Final] methods to unpack the individual bytes.
type Cmd int
456
457// Prefix returns the unpacked prefix byte of the CSI sequence.
458// This is always gonna be one of the following '<' '=' '>' '?' and in the
459// range of 0x3C-0x3F.
460// Zero is returned if the sequence does not have a prefix.
461func (c Cmd) Prefix() byte {
462	return byte(parser.Prefix(int(c)))
463}
464
465// Intermediate returns the unpacked intermediate byte of the CSI sequence.
466// An intermediate byte is in the range of 0x20-0x2F. This includes these
467// characters from ' ', '!', '"', '#', '$', '%', '&', ”', '(', ')', '*', '+',
468// ',', '-', '.', '/'.
469// Zero is returned if the sequence does not have an intermediate byte.
470func (c Cmd) Intermediate() byte {
471	return byte(parser.Intermediate(int(c)))
472}
473
474// Final returns the unpacked command byte of the CSI sequence.
475func (c Cmd) Final() byte {
476	return byte(parser.Command(int(c)))
477}
478
479// Command packs a command with the given prefix, intermediate, and final. A
480// zero byte means the sequence does not have a prefix or intermediate.
481//
482// Prefixes are in the range of 0x3C-0x3F that is one of `<=>?`.
483//
484// Intermediates are in the range of 0x20-0x2F that is anything in
485// `!"#$%&'()*+,-./`.
486//
487// Final bytes are in the range of 0x40-0x7E that is anything in the range
488// `@A–Z[\]^_`a–z{|}~`.
489func Command(prefix, inter, final byte) (c int) {
490	c = int(final)
491	c |= int(prefix) << parser.PrefixShift
492	c |= int(inter) << parser.IntermedShift
493	return
494}
495
// Param represents a sequence parameter. Sequence parameters with
// sub-parameters are packed with the HasMoreFlag set. This is used to unpack
// the parameters from CSI and DCS sequences. Use [Param.Param] to read the
// value and [Param.HasMore] to test for following sub-parameters.
type Param int
500
501// Param returns the unpacked parameter at the given index.
502// It returns the default value if the parameter is missing.
503func (s Param) Param(def int) int {
504	p := int(s) & parser.ParamMask
505	if p == parser.MissingParam {
506		return def
507	}
508	return p
509}
510
511// HasMore unpacks the HasMoreFlag from the parameter.
512func (s Param) HasMore() bool {
513	return s&parser.HasMoreFlag != 0
514}
515
516// Parameter packs an escape code parameter with the given parameter and
517// whether this parameter has following sub-parameters.
518func Parameter(p int, hasMore bool) (s int) {
519	s = p & parser.ParamMask
520	if hasMore {
521		s |= parser.HasMoreFlag
522	}
523	return
524}