quote.go

  1// Copyright (c) 2021, Daniel Martí <mvdan@mvdan.cc>
  2// See LICENSE for licensing information
  3
  4package syntax
  5
  6import (
  7	"fmt"
  8	"strings"
  9	"unicode"
 10	"unicode/utf8"
 11)
 12
 // QuoteError is the error reported by Quote when the input string
// cannot be represented in the requested language variant.
type QuoteError struct {
	// ByteOffset is the offset, in bytes from the start of the input,
	// of the character which could not be quoted.
	ByteOffset int

	// Message explains why that character could not be quoted.
	Message string
}

// Error implements the error interface,
// reporting both the byte offset and the reason for the failure.
func (e QuoteError) Error() string {
	const format = "cannot quote character at byte %d: %s"
	return fmt.Sprintf(format, e.ByteOffset, e.Message)
}
 21
 // Messages used by Quote for the QuoteError values it returns.
const (
	// Applies to every language variant:
	// a null byte was found in the input.
	quoteErrNull = "shell strings cannot contain null bytes"
	// The decoded rune is above utf8.MaxRune.
	quoteErrRange = "rune out of range"

	// Variant-specific limitations.

	// The POSIX variant was requested,
	// but a character would need an escape sequence.
	quoteErrPOSIX = "POSIX shell lacks escape sequences"
	// The mksh variant was requested,
	// but a character needing an escape is above 0xFFFD.
	quoteErrMksh = "mksh cannot escape codepoints above 16 bits"
)
 28
 29// Quote returns a quoted version of the input string,
 30// so that the quoted version is expanded or interpreted
 31// as the original string in the given language variant.
 32//
 33// Quoting is necessary when using arbitrary literal strings
 34// as words in a shell script or command.
 35// Without quoting, one can run into syntax errors,
 36// as well as the possibility of running unintended code.
 37//
 38// An error is returned when a string cannot be quoted for a variant.
 39// For instance, POSIX lacks escape sequences for non-printable characters,
 40// and no language variant can represent a string containing null bytes.
 41// In such cases, the returned error type will be *QuoteError.
 42//
 43// The quoting strategy is chosen on a best-effort basis,
 44// to minimize the amount of extra bytes necessary.
 45//
 46// Some strings do not require any quoting and are returned unchanged.
 47// Those strings can be directly surrounded in single quotes as well.
 48func Quote(s string, lang LangVariant) (string, error) {
 49	if s == "" {
 50		// Special case; an empty string must always be quoted,
 51		// as otherwise it expands to zero fields.
 52		return "''", nil
 53	}
 54	shellChars := false
 55	nonPrintable := false
 56	offs := 0
 57	for rem := s; len(rem) > 0; {
 58		r, size := utf8.DecodeRuneInString(rem)
 59		switch r {
 60		// Like regOps; token characters.
 61		case ';', '"', '\'', '(', ')', '$', '|', '&', '>', '<', '`',
 62			// Whitespace; might result in multiple fields.
 63			' ', '\t', '\r', '\n',
 64			// Escape sequences would be expanded.
 65			'\\',
 66			// Would start a comment unless quoted.
 67			'#',
 68			// Might result in brace expansion.
 69			'{',
 70			// Might result in tilde expansion.
 71			'~',
 72			// Might result in globbing.
 73			'*', '?', '[',
 74			// Might result in an assignment.
 75			'=':
 76			shellChars = true
 77		case '\x00':
 78			return "", &QuoteError{ByteOffset: offs, Message: quoteErrNull}
 79		}
 80		if r == utf8.RuneError || !unicode.IsPrint(r) {
 81			if lang == LangPOSIX {
 82				return "", &QuoteError{ByteOffset: offs, Message: quoteErrPOSIX}
 83			}
 84			nonPrintable = true
 85		}
 86		rem = rem[size:]
 87		offs += size
 88	}
 89	if !shellChars && !nonPrintable && !IsKeyword(s) {
 90		// Nothing to quote; avoid allocating.
 91		return s, nil
 92	}
 93
 94	// Single quotes are usually best,
 95	// as they don't require any escaping of characters.
 96	// If we have any invalid utf8 or non-printable runes,
 97	// use $'' so that we can escape them.
 98	// Note that we can't use double quotes for those.
 99	var b strings.Builder
100	if nonPrintable {
101		b.WriteString("$'")
102		lastRequoteIfHex := false
103		offs := 0
104		for rem := s; len(rem) > 0; {
105			nextRequoteIfHex := false
106			r, size := utf8.DecodeRuneInString(rem)
107			switch {
108			case r == '\'', r == '\\':
109				b.WriteByte('\\')
110				b.WriteRune(r)
111			case unicode.IsPrint(r) && r != utf8.RuneError:
112				if lastRequoteIfHex && isHex(r) {
113					b.WriteString("'$'")
114				}
115				b.WriteRune(r)
116			case r == '\a':
117				b.WriteString(`\a`)
118			case r == '\b':
119				b.WriteString(`\b`)
120			case r == '\f':
121				b.WriteString(`\f`)
122			case r == '\n':
123				b.WriteString(`\n`)
124			case r == '\r':
125				b.WriteString(`\r`)
126			case r == '\t':
127				b.WriteString(`\t`)
128			case r == '\v':
129				b.WriteString(`\v`)
130			case r < utf8.RuneSelf, r == utf8.RuneError && size == 1:
131				// \xXX, fixed at two hexadecimal characters.
132				fmt.Fprintf(&b, "\\x%02x", rem[0])
133				// Unfortunately, mksh allows \x to consume more hex characters.
134				// Ensure that we don't allow it to read more than two.
135				if lang == LangMirBSDKorn {
136					nextRequoteIfHex = true
137				}
138			case r > utf8.MaxRune:
139				// Not a valid Unicode code point?
140				return "", &QuoteError{ByteOffset: offs, Message: quoteErrRange}
141			case lang == LangMirBSDKorn && r > 0xFFFD:
142				// From the CAVEATS section in R59's man page:
143				//
144				// mksh currently uses OPTU-16 internally, which is the same as
145				// UTF-8 and CESU-8 with 0000..FFFD being valid codepoints.
146				return "", &QuoteError{ByteOffset: offs, Message: quoteErrMksh}
147			case r < 0x10000:
148				// \uXXXX, fixed at four hexadecimal characters.
149				fmt.Fprintf(&b, "\\u%04x", r)
150			default:
151				// \UXXXXXXXX, fixed at eight hexadecimal characters.
152				fmt.Fprintf(&b, "\\U%08x", r)
153			}
154			rem = rem[size:]
155			lastRequoteIfHex = nextRequoteIfHex
156			offs += size
157		}
158		b.WriteString("'")
159		return b.String(), nil
160	}
161
162	// Single quotes without any need for escaping.
163	if !strings.Contains(s, "'") {
164		return "'" + s + "'", nil
165	}
166
167	// The string contains single quotes,
168	// so fall back to double quotes.
169	b.WriteByte('"')
170	for _, r := range s {
171		switch r {
172		case '"', '\\', '`', '$':
173			b.WriteByte('\\')
174		}
175		b.WriteRune(r)
176	}
177	b.WriteByte('"')
178	return b.String(), nil
179}
180
// isHex reports whether r is an ASCII hexadecimal digit,
// in either lower or upper case.
func isHex(r rune) bool {
	switch {
	case '0' <= r && r <= '9':
		return true
	case 'a' <= r && r <= 'f':
		return true
	case 'A' <= r && r <= 'F':
		return true
	}
	return false
}