parser.go

  1package ansi
  2
  3import (
  4	"unicode/utf8"
  5	"unsafe"
  6
  7	"github.com/charmbracelet/x/ansi/parser"
  8)
  9
// Parser represents a DEC ANSI compatible sequence parser.
//
// It uses a state machine to parse ANSI escape sequences and control
// characters. The parser is designed to be used with a terminal emulator or
// similar application that needs to parse ANSI escape sequences and control
// characters.
// See package [parser] for more information.
//
// Input is fed one byte at a time through [Parser.Advance]. Completed runes,
// control codes, and sequences are reported through the callbacks of the
// handler; callbacks that are nil are skipped.
//
//go:generate go run ./gen.go
type Parser struct {
	// handler holds the callbacks invoked when something is dispatched.
	handler Handler

	// params contains the raw parameters of the sequence.
	// These parameters are used when constructing CSI and DCS sequences.
	// The buffer is allocated by SetParamsSize; its length is the maximum
	// number of parameters that can be collected.
	params []int

	// data contains the raw data of the sequence.
	// This data is used when constructing OSC, DCS, SOS, PM, and APC
	// sequences. The buffer is allocated by SetDataSize.
	data []byte

	// dataLen keeps track of the length of the data buffer.
	// If dataLen is -1, the data buffer is unlimited and will grow as needed
	// (bytes are appended to data). Otherwise, dataLen is the number of
	// bytes stored so far and is limited by the size of the data buffer;
	// bytes that do not fit are dropped.
	dataLen int

	// paramsLen keeps track of the number of parameters.
	// This is limited by the size of the params buffer.
	//
	// This is also used when collecting UTF-8 runes to keep track of the
	// number of rune bytes collected.
	paramsLen int

	// cmd contains the raw command along with the private prefix and
	// intermediate bytes of the sequence.
	// The first lower byte contains the command byte, the next byte contains
	// the private prefix, and the next byte contains the intermediate byte.
	//
	// For OSC sequences it holds the numeric command parsed from the leading
	// decimal digits of the data, or parser.MissingCommand when there is
	// none.
	//
	// This is also used when collecting UTF-8 runes treating it as a slice of
	// 4 bytes: rune byte i is stored in bits i*8 through i*8+7.
	cmd int

	// state is the current state of the parser.
	state byte
}
 54
 55// NewParser returns a new parser with the default settings.
 56// The [Parser] uses a default size of 32 for the parameters and 64KB for the
 57// data buffer. Use [Parser.SetParamsSize] and [Parser.SetDataSize] to set the
 58// size of the parameters and data buffer respectively.
 59func NewParser() *Parser {
 60	p := new(Parser)
 61	p.SetParamsSize(parser.MaxParamsSize)
 62	p.SetDataSize(1024 * 64) // 64KB data buffer
 63	return p
 64}
 65
 66// SetParamsSize sets the size of the parameters buffer.
 67// This is used when constructing CSI and DCS sequences.
 68func (p *Parser) SetParamsSize(size int) {
 69	p.params = make([]int, size)
 70}
 71
 72// SetDataSize sets the size of the data buffer.
 73// This is used when constructing OSC, DCS, SOS, PM, and APC sequences.
 74// If size is less than or equal to 0, the data buffer is unlimited and will
 75// grow as needed.
 76func (p *Parser) SetDataSize(size int) {
 77	if size <= 0 {
 78		size = 0
 79		p.dataLen = -1
 80	}
 81	p.data = make([]byte, size)
 82}
 83
 84// Params returns the list of parsed packed parameters.
 85func (p *Parser) Params() Params {
 86	return unsafe.Slice((*Param)(unsafe.Pointer(&p.params[0])), p.paramsLen)
 87}
 88
 89// Param returns the parameter at the given index and falls back to the default
 90// value if the parameter is missing. If the index is out of bounds, it returns
 91// the default value and false.
 92func (p *Parser) Param(i, def int) (int, bool) {
 93	if i < 0 || i >= p.paramsLen {
 94		return def, false
 95	}
 96	return Param(p.params[i]).Param(def), true
 97}
 98
 99// Command returns the packed command of the last dispatched sequence. Use
100// [Cmd] to unpack the command.
101func (p *Parser) Command() int {
102	return p.cmd
103}
104
105// Rune returns the last dispatched sequence as a rune.
106func (p *Parser) Rune() rune {
107	rw := utf8ByteLen(byte(p.cmd & 0xff))
108	if rw == -1 {
109		return utf8.RuneError
110	}
111	r, _ := utf8.DecodeRune((*[utf8.UTFMax]byte)(unsafe.Pointer(&p.cmd))[:rw])
112	return r
113}
114
115// Control returns the last dispatched sequence as a control code.
116func (p *Parser) Control() byte {
117	return byte(p.cmd & 0xff)
118}
119
120// Data returns the raw data of the last dispatched sequence.
121func (p *Parser) Data() []byte {
122	return p.data[:p.dataLen]
123}
124
125// Reset resets the parser to its initial state.
126func (p *Parser) Reset() {
127	p.clear()
128	p.state = parser.GroundState
129}
130
131// clear clears the parser parameters and command.
132func (p *Parser) clear() {
133	if len(p.params) > 0 {
134		p.params[0] = parser.MissingParam
135	}
136	p.paramsLen = 0
137	p.cmd = 0
138}
139
140// State returns the current state of the parser.
141func (p *Parser) State() parser.State {
142	return p.state
143}
144
145// StateName returns the name of the current state.
146func (p *Parser) StateName() string {
147	return parser.StateNames[p.state]
148}
149
150// Parse parses the given dispatcher and byte buffer.
151// Deprecated: Loop over the buffer and call [Parser.Advance] instead.
152func (p *Parser) Parse(b []byte) {
153	for i := range b {
154		p.Advance(b[i])
155	}
156}
157
158// Advance advances the parser using the given byte. It	returns the action
159// performed by the parser.
160func (p *Parser) Advance(b byte) parser.Action {
161	switch p.state {
162	case parser.Utf8State:
163		// We handle UTF-8 here.
164		return p.advanceUtf8(b)
165	default:
166		return p.advance(b)
167	}
168}
169
170func (p *Parser) collectRune(b byte) {
171	if p.paramsLen >= utf8.UTFMax {
172		return
173	}
174
175	shift := p.paramsLen * 8
176	p.cmd &^= 0xff << shift
177	p.cmd |= int(b) << shift
178	p.paramsLen++
179}
180
181func (p *Parser) advanceUtf8(b byte) parser.Action {
182	// Collect UTF-8 rune bytes.
183	p.collectRune(b)
184	rw := utf8ByteLen(byte(p.cmd & 0xff))
185	if rw == -1 {
186		// We panic here because the first byte comes from the state machine,
187		// if this panics, it means there is a bug in the state machine!
188		panic("invalid rune") // unreachable
189	}
190
191	if p.paramsLen < rw {
192		return parser.CollectAction
193	}
194
195	// We have enough bytes to decode the rune using unsafe
196	if p.handler.Print != nil {
197		p.handler.Print(p.Rune())
198	}
199
200	p.state = parser.GroundState
201	p.paramsLen = 0
202
203	return parser.PrintAction
204}
205
206func (p *Parser) advance(b byte) parser.Action {
207	state, action := parser.Table.Transition(p.state, b)
208
209	// We need to clear the parser state if the state changes from EscapeState.
210	// This is because when we enter the EscapeState, we don't get a chance to
211	// clear the parser state. For example, when a sequence terminates with a
212	// ST (\x1b\\ or \x9c), we dispatch the current sequence and transition to
213	// EscapeState. However, the parser state is not cleared in this case and
214	// we need to clear it here before dispatching the esc sequence.
215	if p.state != state {
216		if p.state == parser.EscapeState {
217			p.performAction(parser.ClearAction, state, b)
218		}
219		if action == parser.PutAction &&
220			p.state == parser.DcsEntryState && state == parser.DcsStringState {
221			// XXX: This is a special case where we need to start collecting
222			// non-string parameterized data i.e. doesn't follow the ECMA-48 §
223			// 5.4.1 string parameters format.
224			p.performAction(parser.StartAction, state, 0)
225		}
226	}
227
228	// Handle special cases
229	switch {
230	case b == ESC && p.state == parser.EscapeState:
231		// Two ESCs in a row
232		p.performAction(parser.ExecuteAction, state, b)
233	default:
234		p.performAction(action, state, b)
235	}
236
237	p.state = state
238
239	return action
240}
241
242func (p *Parser) parseStringCmd() {
243	// Try to parse the command
244	datalen := len(p.data)
245	if p.dataLen >= 0 {
246		datalen = p.dataLen
247	}
248	for i := range datalen {
249		d := p.data[i]
250		if d < '0' || d > '9' {
251			break
252		}
253		if p.cmd == parser.MissingCommand {
254			p.cmd = 0
255		}
256		p.cmd *= 10
257		p.cmd += int(d - '0')
258	}
259}
260
// performAction applies a single parser action for the input byte b.
//
// state is the state the parser is transitioning to. p.state still holds the
// state the parser is coming from, because the caller assigns the new state
// only after this method returns; several cases below rely on that
// distinction.
func (p *Parser) performAction(action parser.Action, state parser.State, b byte) {
	switch action {
	case parser.IgnoreAction:
		// Nothing to do.
		break

	case parser.ClearAction:
		p.clear()

	case parser.PrintAction:
		// Record the byte and report it as a single-byte rune.
		p.cmd = int(b)
		if p.handler.Print != nil {
			p.handler.Print(rune(b))
		}

	case parser.ExecuteAction:
		// Record the byte and report it as a control code.
		p.cmd = int(b)
		if p.handler.Execute != nil {
			p.handler.Execute(b)
		}

	case parser.PrefixAction:
		// Collect private prefix
		// we only store the last prefix
		p.cmd &^= 0xff << parser.PrefixShift
		p.cmd |= int(b) << parser.PrefixShift

	case parser.CollectAction:
		if state == parser.Utf8State {
			// Entering UTF-8 collection: b is the first byte of the rune.
			// Reset the UTF-8 counter
			p.paramsLen = 0
			p.collectRune(b)
		} else {
			// Collect intermediate bytes
			// we only store the last intermediate byte
			p.cmd &^= 0xff << parser.IntermedShift
			p.cmd |= int(b) << parser.IntermedShift
		}

	case parser.ParamAction:
		// Collect parameters
		// Once the buffer is full, further parameter bytes are dropped.
		if p.paramsLen >= len(p.params) {
			break
		}

		if b >= '0' && b <= '9' {
			// The first digit replaces the "missing" marker with 0 before
			// the decimal value is accumulated.
			if p.params[p.paramsLen] == parser.MissingParam {
				p.params[p.paramsLen] = 0
			}

			p.params[p.paramsLen] *= 10
			p.params[p.paramsLen] += int(b - '0')
		}

		if b == ':' {
			// A colon flags the current parameter as having more
			// (sub-)parameters following it.
			p.params[p.paramsLen] |= parser.HasMoreFlag
		}

		if b == ';' || b == ':' {
			// Either separator finishes the current parameter; the next
			// slot, if there is one, starts out as missing.
			p.paramsLen++
			if p.paramsLen < len(p.params) {
				p.params[p.paramsLen] = parser.MissingParam
			}
		}

	case parser.StartAction:
		// Begin collecting string data: empty the data buffer. In unlimited
		// mode (negative dataLen) the slice is truncated and later grown by
		// append; otherwise the stored length is reset.
		if p.dataLen < 0 && p.data != nil {
			p.data = p.data[:0]
		} else {
			p.dataLen = 0
		}
		if p.state >= parser.DcsEntryState && p.state <= parser.DcsStringState {
			// Collect the command byte for DCS
			p.cmd |= int(b)
		} else {
			p.cmd = parser.MissingCommand
		}

	case parser.PutAction:
		switch p.state {
		case parser.OscStringState:
			// The first ';' of an OSC string ends its numeric command, which
			// is parsed from the data collected so far.
			if b == ';' && p.cmd == parser.MissingCommand {
				p.parseStringCmd()
			}
		}

		// Store the byte: append in unlimited mode, otherwise write it only
		// if the fixed-size buffer still has room (excess bytes are
		// dropped).
		if p.dataLen < 0 {
			p.data = append(p.data, b)
		} else {
			if p.dataLen < len(p.data) {
				p.data[p.dataLen] = b
				p.dataLen++
			}
		}

	case parser.DispatchAction:
		// Increment the last parameter
		// The parameter being collected when the sequence ends has not been
		// counted yet, because the count only advances on a separator.
		// NOTE(review): the len(p.params)-1 bound appears to leave the
		// parameter stored in the last buffer slot uncounted (e.g. the
		// second parameter of a 2-slot buffer) — confirm whether this is
		// intended.
		if p.paramsLen > 0 && p.paramsLen < len(p.params)-1 ||
			p.paramsLen == 0 && len(p.params) > 0 && p.params[0] != parser.MissingParam {
			p.paramsLen++
		}

		if p.state == parser.OscStringState && p.cmd == parser.MissingCommand {
			// Ensure we have a command for OSC
			p.parseStringCmd()
		}

		// In unlimited mode the whole buffer is the data; otherwise only the
		// first dataLen bytes are.
		data := p.data
		if p.dataLen >= 0 {
			data = data[:p.dataLen]
		}
		// Which handler runs depends on the state the parser is leaving.
		// Handlers that are nil are skipped.
		switch p.state {
		case parser.CsiEntryState, parser.CsiParamState, parser.CsiIntermediateState:
			// b is the final byte of the CSI sequence.
			p.cmd |= int(b)
			if p.handler.HandleCsi != nil {
				p.handler.HandleCsi(Cmd(p.cmd), p.Params())
			}
		case parser.EscapeState, parser.EscapeIntermediateState:
			// b is the final byte of the escape sequence.
			p.cmd |= int(b)
			if p.handler.HandleEsc != nil {
				p.handler.HandleEsc(Cmd(p.cmd))
			}
		case parser.DcsEntryState, parser.DcsParamState, parser.DcsIntermediateState, parser.DcsStringState:
			// The DCS command byte was already collected by StartAction.
			if p.handler.HandleDcs != nil {
				p.handler.HandleDcs(Cmd(p.cmd), p.Params(), data)
			}
		case parser.OscStringState:
			// For OSC, cmd is the numeric command parsed from the data.
			if p.handler.HandleOsc != nil {
				p.handler.HandleOsc(p.cmd, data)
			}
		case parser.SosStringState:
			if p.handler.HandleSos != nil {
				p.handler.HandleSos(data)
			}
		case parser.PmStringState:
			if p.handler.HandlePm != nil {
				p.handler.HandlePm(data)
			}
		case parser.ApcStringState:
			if p.handler.HandleApc != nil {
				p.handler.HandleApc(data)
			}
		}
	}
}
405
// utf8ByteLen reports how many bytes a UTF-8 sequence starting with lead
// byte b occupies, judged by the byte's high bits alone: 1 for 0x00-0x7F,
// 2 for 0xC0-0xDF, 3 for 0xE0-0xEF, and 4 for 0xF0-0xF7. It returns -1 for
// every other byte (continuation bytes 0x80-0xBF and 0xF8-0xFF).
func utf8ByteLen(b byte) int {
	switch {
	case b&0x80 == 0x00: // 0xxxxxxx
		return 1
	case b&0xE0 == 0xC0: // 110xxxxx
		return 2
	case b&0xF0 == 0xE0: // 1110xxxx
		return 3
	case b&0xF8 == 0xF0: // 11110xxx
		return 4
	default:
		return -1
	}
}