1package ansi
2
3import (
4 "unicode/utf8"
5
6 "github.com/charmbracelet/x/ansi/parser"
7 "github.com/mattn/go-runewidth"
8 "github.com/rivo/uniseg"
9)
10
// State represents the state of the ANSI escape sequence parser used by
// [DecodeSequence].
//
// It is an alias of byte, so a plain byte value can be passed wherever a
// State is expected and vice versa.
type State = byte
14
// ANSI escape sequence states used by [DecodeSequence].
const (
	// NormalState is the initial (zero) state: no sequence is in progress.
	NormalState State = iota
	// PrefixState follows a CSI or DCS introducer; prefix bytes '<' through
	// '?' are collected in this state.
	PrefixState
	// ParamsState collects decimal parameters separated by ';' or ':'.
	ParamsState
	// IntermedState collects intermediate bytes ' ' through '/' and then
	// expects the final byte of the sequence.
	IntermedState
	// EscapeState follows an ESC byte.
	EscapeState
	// StringState collects the data of OSC, APC, SOS, PM, and DCS strings
	// until a terminator is seen.
	StringState
)
24
25// DecodeSequence decodes the first ANSI escape sequence or a printable
26// grapheme from the given data. It returns the sequence slice, the number of
27// bytes read, the cell width for each sequence, and the new state.
28//
29// The cell width will always be 0 for control and escape sequences, 1 for
30// ASCII printable characters, and the number of cells other Unicode characters
31// occupy. It uses the uniseg package to calculate the width of Unicode
32// graphemes and characters. This means it will always do grapheme clustering
33// (mode 2027).
34//
35// Passing a non-nil [*Parser] as the last argument will allow the decoder to
36// collect sequence parameters, data, and commands. The parser cmd will have
37// the packed command value that contains intermediate and prefix characters.
38// In the case of a OSC sequence, the cmd will be the OSC command number. Use
39// [Cmd] and [Param] types to unpack command intermediates and prefixes as well
40// as parameters.
41//
42// Zero [Cmd] means the CSI, DCS, or ESC sequence is invalid. Moreover, checking the
43// validity of other data sequences, OSC, DCS, etc, will require checking for
44// the returned sequence terminator bytes such as ST (ESC \\) and BEL).
45//
46// We store the command byte in [Cmd] in the most significant byte, the
47// prefix byte in the next byte, and the intermediate byte in the least
48// significant byte. This is done to avoid using a struct to store the command
49// and its intermediates and prefixes. The command byte is always the least
50// significant byte i.e. [Cmd & 0xff]. Use the [Cmd] type to unpack the
51// command, intermediate, and prefix bytes. Note that we only collect the last
52// prefix character and intermediate byte.
53//
54// The [p.Params] slice will contain the parameters of the sequence. Any
55// sub-parameter will have the [parser.HasMoreFlag] set. Use the [Param] type
56// to unpack the parameters.
57//
58// Example:
59//
60// var state byte // the initial state is always zero [NormalState]
61// p := NewParser(32, 1024) // create a new parser with a 32 params buffer and 1024 data buffer (optional)
62// input := []byte("\x1b[31mHello, World!\x1b[0m")
63// for len(input) > 0 {
64// seq, width, n, newState := DecodeSequence(input, state, p)
65// log.Printf("seq: %q, width: %d", seq, width)
66// state = newState
67// input = input[n:]
68// }
69//
70// This function treats the text as a sequence of grapheme clusters.
71func DecodeSequence[T string | []byte](b T, state byte, p *Parser) (seq T, width int, n int, newState byte) {
72 return decodeSequence(GraphemeWidth, b, state, p)
73}
74
75// DecodeSequenceWc decodes the first ANSI escape sequence or a printable
76// grapheme from the given data. It returns the sequence slice, the number of
77// bytes read, the cell width for each sequence, and the new state.
78//
79// The cell width will always be 0 for control and escape sequences, 1 for
80// ASCII printable characters, and the number of cells other Unicode characters
81// occupy. It uses the uniseg package to calculate the width of Unicode
82// graphemes and characters. This means it will always do grapheme clustering
83// (mode 2027).
84//
85// Passing a non-nil [*Parser] as the last argument will allow the decoder to
86// collect sequence parameters, data, and commands. The parser cmd will have
87// the packed command value that contains intermediate and prefix characters.
88// In the case of a OSC sequence, the cmd will be the OSC command number. Use
89// [Cmd] and [Param] types to unpack command intermediates and prefixes as well
90// as parameters.
91//
92// Zero [Cmd] means the CSI, DCS, or ESC sequence is invalid. Moreover, checking the
93// validity of other data sequences, OSC, DCS, etc, will require checking for
94// the returned sequence terminator bytes such as ST (ESC \\) and BEL).
95//
96// We store the command byte in [Cmd] in the most significant byte, the
97// prefix byte in the next byte, and the intermediate byte in the least
98// significant byte. This is done to avoid using a struct to store the command
99// and its intermediates and prefixes. The command byte is always the least
100// significant byte i.e. [Cmd & 0xff]. Use the [Cmd] type to unpack the
101// command, intermediate, and prefix bytes. Note that we only collect the last
102// prefix character and intermediate byte.
103//
104// The [p.Params] slice will contain the parameters of the sequence. Any
105// sub-parameter will have the [parser.HasMoreFlag] set. Use the [Param] type
106// to unpack the parameters.
107//
108// Example:
109//
110// var state byte // the initial state is always zero [NormalState]
111// p := NewParser(32, 1024) // create a new parser with a 32 params buffer and 1024 data buffer (optional)
112// input := []byte("\x1b[31mHello, World!\x1b[0m")
113// for len(input) > 0 {
114// seq, width, n, newState := DecodeSequenceWc(input, state, p)
115// log.Printf("seq: %q, width: %d", seq, width)
116// state = newState
117// input = input[n:]
118// }
119//
120// This function treats the text as a sequence of wide characters and runes.
121func DecodeSequenceWc[T string | []byte](b T, state byte, p *Parser) (seq T, width int, n int, newState byte) {
122 return decodeSequence(WcWidth, b, state, p)
123}
124
125func decodeSequence[T string | []byte](m Method, b T, state State, p *Parser) (seq T, width int, n int, newState byte) {
126 for i := 0; i < len(b); i++ {
127 c := b[i]
128
129 switch state {
130 case NormalState:
131 switch c {
132 case ESC:
133 if p != nil {
134 if len(p.params) > 0 {
135 p.params[0] = parser.MissingParam
136 }
137 p.cmd = 0
138 p.paramsLen = 0
139 p.dataLen = 0
140 }
141 state = EscapeState
142 continue
143 case CSI, DCS:
144 if p != nil {
145 if len(p.params) > 0 {
146 p.params[0] = parser.MissingParam
147 }
148 p.cmd = 0
149 p.paramsLen = 0
150 p.dataLen = 0
151 }
152 state = PrefixState
153 continue
154 case OSC, APC, SOS, PM:
155 if p != nil {
156 p.cmd = parser.MissingCommand
157 p.dataLen = 0
158 }
159 state = StringState
160 continue
161 }
162
163 if p != nil {
164 p.dataLen = 0
165 p.paramsLen = 0
166 p.cmd = 0
167 }
168 if c > US && c < DEL {
169 // ASCII printable characters
170 return b[i : i+1], 1, 1, NormalState
171 }
172
173 if c <= US || c == DEL || c < 0xC0 {
174 // C0 & C1 control characters & DEL
175 return b[i : i+1], 0, 1, NormalState
176 }
177
178 if utf8.RuneStart(c) {
179 seq, _, width, _ = FirstGraphemeCluster(b, -1)
180 if m == WcWidth {
181 width = runewidth.StringWidth(string(seq))
182 }
183 i += len(seq)
184 return b[:i], width, i, NormalState
185 }
186
187 // Invalid UTF-8 sequence
188 return b[:i], 0, i, NormalState
189 case PrefixState:
190 if c >= '<' && c <= '?' {
191 if p != nil {
192 // We only collect the last prefix character.
193 p.cmd &^= 0xff << parser.PrefixShift
194 p.cmd |= int(c) << parser.PrefixShift
195 }
196 break
197 }
198
199 state = ParamsState
200 fallthrough
201 case ParamsState:
202 if c >= '0' && c <= '9' {
203 if p != nil {
204 if p.params[p.paramsLen] == parser.MissingParam {
205 p.params[p.paramsLen] = 0
206 }
207
208 p.params[p.paramsLen] *= 10
209 p.params[p.paramsLen] += int(c - '0')
210 }
211 break
212 }
213
214 if c == ':' {
215 if p != nil {
216 p.params[p.paramsLen] |= parser.HasMoreFlag
217 }
218 }
219
220 if c == ';' || c == ':' {
221 if p != nil {
222 p.paramsLen++
223 if p.paramsLen < len(p.params) {
224 p.params[p.paramsLen] = parser.MissingParam
225 }
226 }
227 break
228 }
229
230 state = IntermedState
231 fallthrough
232 case IntermedState:
233 if c >= ' ' && c <= '/' {
234 if p != nil {
235 p.cmd &^= 0xff << parser.IntermedShift
236 p.cmd |= int(c) << parser.IntermedShift
237 }
238 break
239 }
240
241 if p != nil {
242 // Increment the last parameter
243 if p.paramsLen > 0 && p.paramsLen < len(p.params)-1 ||
244 p.paramsLen == 0 && len(p.params) > 0 && p.params[0] != parser.MissingParam {
245 p.paramsLen++
246 }
247 }
248
249 if c >= '@' && c <= '~' {
250 if p != nil {
251 p.cmd &^= 0xff
252 p.cmd |= int(c)
253 }
254
255 if HasDcsPrefix(b) {
256 // Continue to collect DCS data
257 if p != nil {
258 p.dataLen = 0
259 }
260 state = StringState
261 continue
262 }
263
264 return b[:i+1], 0, i + 1, NormalState
265 }
266
267 // Invalid CSI/DCS sequence
268 return b[:i], 0, i, NormalState
269 case EscapeState:
270 switch c {
271 case '[', 'P':
272 if p != nil {
273 if len(p.params) > 0 {
274 p.params[0] = parser.MissingParam
275 }
276 p.paramsLen = 0
277 p.cmd = 0
278 }
279 state = PrefixState
280 continue
281 case ']', 'X', '^', '_':
282 if p != nil {
283 p.cmd = parser.MissingCommand
284 p.dataLen = 0
285 }
286 state = StringState
287 continue
288 }
289
290 if c >= ' ' && c <= '/' {
291 if p != nil {
292 p.cmd &^= 0xff << parser.IntermedShift
293 p.cmd |= int(c) << parser.IntermedShift
294 }
295 continue
296 } else if c >= '0' && c <= '~' {
297 if p != nil {
298 p.cmd &^= 0xff
299 p.cmd |= int(c)
300 }
301 return b[:i+1], 0, i + 1, NormalState
302 }
303
304 // Invalid escape sequence
305 return b[:i], 0, i, NormalState
306 case StringState:
307 switch c {
308 case BEL:
309 if HasOscPrefix(b) {
310 parseOscCmd(p)
311 return b[:i+1], 0, i + 1, NormalState
312 }
313 case CAN, SUB:
314 if HasOscPrefix(b) {
315 // Ensure we parse the OSC command number
316 parseOscCmd(p)
317 }
318
319 // Cancel the sequence
320 return b[:i], 0, i, NormalState
321 case ST:
322 if HasOscPrefix(b) {
323 // Ensure we parse the OSC command number
324 parseOscCmd(p)
325 }
326
327 return b[:i+1], 0, i + 1, NormalState
328 case ESC:
329 if HasStPrefix(b[i:]) {
330 if HasOscPrefix(b) {
331 // Ensure we parse the OSC command number
332 parseOscCmd(p)
333 }
334
335 // End of string 7-bit (ST)
336 return b[:i+2], 0, i + 2, NormalState
337 }
338
339 // Otherwise, cancel the sequence
340 return b[:i], 0, i, NormalState
341 }
342
343 if p != nil && p.dataLen < len(p.data) {
344 p.data[p.dataLen] = c
345 p.dataLen++
346
347 // Parse the OSC command number
348 if c == ';' && HasOscPrefix(b) {
349 parseOscCmd(p)
350 }
351 }
352 }
353 }
354
355 return b, 0, len(b), state
356}
357
358func parseOscCmd(p *Parser) {
359 if p == nil || p.cmd != parser.MissingCommand {
360 return
361 }
362 for j := range p.dataLen {
363 d := p.data[j]
364 if d < '0' || d > '9' {
365 break
366 }
367 if p.cmd == parser.MissingCommand {
368 p.cmd = 0
369 }
370 p.cmd *= 10
371 p.cmd += int(d - '0')
372 }
373}
374
// Equal reports whether a and b, both strings or both byte slices, hold the
// same bytes. A nil byte slice compares equal to an empty one.
func Equal[T string | []byte](a, b T) bool {
	if len(a) != len(b) {
		return false
	}
	return string(a) == string(b)
}
379
380// HasPrefix returns true if the given byte slice has prefix.
381func HasPrefix[T string | []byte](b, prefix T) bool {
382 return len(b) >= len(prefix) && Equal(b[0:len(prefix)], prefix)
383}
384
385// HasSuffix returns true if the given byte slice has suffix.
386func HasSuffix[T string | []byte](b, suffix T) bool {
387 return len(b) >= len(suffix) && Equal(b[len(b)-len(suffix):], suffix)
388}
389
390// HasCsiPrefix returns true if the given byte slice has a CSI prefix.
391func HasCsiPrefix[T string | []byte](b T) bool {
392 return (len(b) > 0 && b[0] == CSI) ||
393 (len(b) > 1 && b[0] == ESC && b[1] == '[')
394}
395
396// HasOscPrefix returns true if the given byte slice has an OSC prefix.
397func HasOscPrefix[T string | []byte](b T) bool {
398 return (len(b) > 0 && b[0] == OSC) ||
399 (len(b) > 1 && b[0] == ESC && b[1] == ']')
400}
401
402// HasApcPrefix returns true if the given byte slice has an APC prefix.
403func HasApcPrefix[T string | []byte](b T) bool {
404 return (len(b) > 0 && b[0] == APC) ||
405 (len(b) > 1 && b[0] == ESC && b[1] == '_')
406}
407
408// HasDcsPrefix returns true if the given byte slice has a DCS prefix.
409func HasDcsPrefix[T string | []byte](b T) bool {
410 return (len(b) > 0 && b[0] == DCS) ||
411 (len(b) > 1 && b[0] == ESC && b[1] == 'P')
412}
413
414// HasSosPrefix returns true if the given byte slice has a SOS prefix.
415func HasSosPrefix[T string | []byte](b T) bool {
416 return (len(b) > 0 && b[0] == SOS) ||
417 (len(b) > 1 && b[0] == ESC && b[1] == 'X')
418}
419
420// HasPmPrefix returns true if the given byte slice has a PM prefix.
421func HasPmPrefix[T string | []byte](b T) bool {
422 return (len(b) > 0 && b[0] == PM) ||
423 (len(b) > 1 && b[0] == ESC && b[1] == '^')
424}
425
426// HasStPrefix returns true if the given byte slice has a ST prefix.
427func HasStPrefix[T string | []byte](b T) bool {
428 return (len(b) > 0 && b[0] == ST) ||
429 (len(b) > 1 && b[0] == ESC && b[1] == '\\')
430}
431
432// HasEscPrefix returns true if the given byte slice has an ESC prefix.
433func HasEscPrefix[T string | []byte](b T) bool {
434 return len(b) > 0 && b[0] == ESC
435}
436
437// FirstGraphemeCluster returns the first grapheme cluster in the given string or byte slice.
438// This is a syntactic sugar function that wraps
439// uniseg.FirstGraphemeClusterInString and uniseg.FirstGraphemeCluster.
440func FirstGraphemeCluster[T string | []byte](b T, state int) (T, T, int, int) {
441 switch b := any(b).(type) {
442 case string:
443 cluster, rest, width, newState := uniseg.FirstGraphemeClusterInString(b, state)
444 return T(cluster), T(rest), width, newState
445 case []byte:
446 cluster, rest, width, newState := uniseg.FirstGraphemeCluster(b, state)
447 return T(cluster), T(rest), width, newState
448 }
449 panic("unreachable")
450}
451
452// Cmd represents a sequence command. This is used to pack/unpack a sequence
453// command with its intermediate and prefix characters. Those are commonly
454// found in CSI and DCS sequences.
455type Cmd int
456
457// Prefix returns the unpacked prefix byte of the CSI sequence.
458// This is always gonna be one of the following '<' '=' '>' '?' and in the
459// range of 0x3C-0x3F.
460// Zero is returned if the sequence does not have a prefix.
461func (c Cmd) Prefix() byte {
462 return byte(parser.Prefix(int(c)))
463}
464
465// Intermediate returns the unpacked intermediate byte of the CSI sequence.
466// An intermediate byte is in the range of 0x20-0x2F. This includes these
467// characters from ' ', '!', '"', '#', '$', '%', '&', ”', '(', ')', '*', '+',
468// ',', '-', '.', '/'.
469// Zero is returned if the sequence does not have an intermediate byte.
470func (c Cmd) Intermediate() byte {
471 return byte(parser.Intermediate(int(c)))
472}
473
474// Final returns the unpacked command byte of the CSI sequence.
475func (c Cmd) Final() byte {
476 return byte(parser.Command(int(c)))
477}
478
479// Command packs a command with the given prefix, intermediate, and final. A
480// zero byte means the sequence does not have a prefix or intermediate.
481//
482// Prefixes are in the range of 0x3C-0x3F that is one of `<=>?`.
483//
484// Intermediates are in the range of 0x20-0x2F that is anything in
485// `!"#$%&'()*+,-./`.
486//
487// Final bytes are in the range of 0x40-0x7E that is anything in the range
488// `@A–Z[\]^_`a–z{|}~`.
489func Command(prefix, inter, final byte) (c int) {
490 c = int(final)
491 c |= int(prefix) << parser.PrefixShift
492 c |= int(inter) << parser.IntermedShift
493 return
494}
495
496// Param represents a sequence parameter. Sequence parameters with
497// sub-parameters are packed with the HasMoreFlag set. This is used to unpack
498// the parameters from a CSI and DCS sequences.
499type Param int
500
501// Param returns the unpacked parameter at the given index.
502// It returns the default value if the parameter is missing.
503func (s Param) Param(def int) int {
504 p := int(s) & parser.ParamMask
505 if p == parser.MissingParam {
506 return def
507 }
508 return p
509}
510
511// HasMore unpacks the HasMoreFlag from the parameter.
512func (s Param) HasMore() bool {
513 return s&parser.HasMoreFlag != 0
514}
515
516// Parameter packs an escape code parameter with the given parameter and
517// whether this parameter has following sub-parameters.
518func Parameter(p int, hasMore bool) (s int) {
519 s = p & parser.ParamMask
520 if hasMore {
521 s |= parser.HasMoreFlag
522 }
523 return
524}