// Copyright (c) 2021, Daniel Martí <mvdan@mvdan.cc>
2// See LICENSE for licensing information
3
4package syntax
5
6import (
7 "fmt"
8 "strings"
9 "unicode"
10 "unicode/utf8"
11)
12
// QuoteError is the error type returned by Quote
// when a character in the input string cannot be quoted.
type QuoteError struct {
	// ByteOffset is the byte index, within the input string,
	// of the character that could not be quoted.
	ByteOffset int
	// Message explains why the character could not be quoted.
	Message string
}

// Error implements the error interface,
// combining the byte offset and the message into a single line.
func (qe QuoteError) Error() string {
	const format = "cannot quote character at byte %d: %s"
	return fmt.Sprintf(format, qe.ByteOffset, qe.Message)
}
21
// Messages stored in QuoteError.Message by Quote.
const (
	// quoteErrNull is used when the input contains a null byte,
	// regardless of the language variant.
	quoteErrNull = "shell strings cannot contain null bytes"
	// quoteErrPOSIX is used when the variant is LangPOSIX
	// and a character would need an escape sequence to be represented.
	quoteErrPOSIX = "POSIX shell lacks escape sequences"
	// quoteErrRange is used for a rune above utf8.MaxRune.
	quoteErrRange = "rune out of range"
	// quoteErrMksh is used when the variant is LangMirBSDKorn
	// and a rune above 0xFFFD would need an escape sequence.
	quoteErrMksh = "mksh cannot escape codepoints above 16 bits"
)
28
29// Quote returns a quoted version of the input string,
30// so that the quoted version is expanded or interpreted
31// as the original string in the given language variant.
32//
33// Quoting is necessary when using arbitrary literal strings
34// as words in a shell script or command.
35// Without quoting, one can run into syntax errors,
36// as well as the possibility of running unintended code.
37//
38// An error is returned when a string cannot be quoted for a variant.
39// For instance, POSIX lacks escape sequences for non-printable characters,
40// and no language variant can represent a string containing null bytes.
41// In such cases, the returned error type will be *QuoteError.
42//
43// The quoting strategy is chosen on a best-effort basis,
44// to minimize the amount of extra bytes necessary.
45//
46// Some strings do not require any quoting and are returned unchanged.
47// Those strings can be directly surrounded in single quotes as well.
48func Quote(s string, lang LangVariant) (string, error) {
49 if s == "" {
50 // Special case; an empty string must always be quoted,
51 // as otherwise it expands to zero fields.
52 return "''", nil
53 }
54 shellChars := false
55 nonPrintable := false
56 offs := 0
57 for rem := s; len(rem) > 0; {
58 r, size := utf8.DecodeRuneInString(rem)
59 switch r {
60 // Like regOps; token characters.
61 case ';', '"', '\'', '(', ')', '$', '|', '&', '>', '<', '`',
62 // Whitespace; might result in multiple fields.
63 ' ', '\t', '\r', '\n',
64 // Escape sequences would be expanded.
65 '\\',
66 // Would start a comment unless quoted.
67 '#',
68 // Might result in brace expansion.
69 '{',
70 // Might result in tilde expansion.
71 '~',
72 // Might result in globbing.
73 '*', '?', '[',
74 // Might result in an assignment.
75 '=':
76 shellChars = true
77 case '\x00':
78 return "", &QuoteError{ByteOffset: offs, Message: quoteErrNull}
79 }
80 if r == utf8.RuneError || !unicode.IsPrint(r) {
81 if lang == LangPOSIX {
82 return "", &QuoteError{ByteOffset: offs, Message: quoteErrPOSIX}
83 }
84 nonPrintable = true
85 }
86 rem = rem[size:]
87 offs += size
88 }
89 if !shellChars && !nonPrintable && !IsKeyword(s) {
90 // Nothing to quote; avoid allocating.
91 return s, nil
92 }
93
94 // Single quotes are usually best,
95 // as they don't require any escaping of characters.
96 // If we have any invalid utf8 or non-printable runes,
97 // use $'' so that we can escape them.
98 // Note that we can't use double quotes for those.
99 var b strings.Builder
100 if nonPrintable {
101 b.WriteString("$'")
102 lastRequoteIfHex := false
103 offs := 0
104 for rem := s; len(rem) > 0; {
105 nextRequoteIfHex := false
106 r, size := utf8.DecodeRuneInString(rem)
107 switch {
108 case r == '\'', r == '\\':
109 b.WriteByte('\\')
110 b.WriteRune(r)
111 case unicode.IsPrint(r) && r != utf8.RuneError:
112 if lastRequoteIfHex && isHex(r) {
113 b.WriteString("'$'")
114 }
115 b.WriteRune(r)
116 case r == '\a':
117 b.WriteString(`\a`)
118 case r == '\b':
119 b.WriteString(`\b`)
120 case r == '\f':
121 b.WriteString(`\f`)
122 case r == '\n':
123 b.WriteString(`\n`)
124 case r == '\r':
125 b.WriteString(`\r`)
126 case r == '\t':
127 b.WriteString(`\t`)
128 case r == '\v':
129 b.WriteString(`\v`)
130 case r < utf8.RuneSelf, r == utf8.RuneError && size == 1:
131 // \xXX, fixed at two hexadecimal characters.
132 fmt.Fprintf(&b, "\\x%02x", rem[0])
133 // Unfortunately, mksh allows \x to consume more hex characters.
134 // Ensure that we don't allow it to read more than two.
135 if lang == LangMirBSDKorn {
136 nextRequoteIfHex = true
137 }
138 case r > utf8.MaxRune:
139 // Not a valid Unicode code point?
140 return "", &QuoteError{ByteOffset: offs, Message: quoteErrRange}
141 case lang == LangMirBSDKorn && r > 0xFFFD:
142 // From the CAVEATS section in R59's man page:
143 //
144 // mksh currently uses OPTU-16 internally, which is the same as
145 // UTF-8 and CESU-8 with 0000..FFFD being valid codepoints.
146 return "", &QuoteError{ByteOffset: offs, Message: quoteErrMksh}
147 case r < 0x10000:
148 // \uXXXX, fixed at four hexadecimal characters.
149 fmt.Fprintf(&b, "\\u%04x", r)
150 default:
151 // \UXXXXXXXX, fixed at eight hexadecimal characters.
152 fmt.Fprintf(&b, "\\U%08x", r)
153 }
154 rem = rem[size:]
155 lastRequoteIfHex = nextRequoteIfHex
156 offs += size
157 }
158 b.WriteString("'")
159 return b.String(), nil
160 }
161
162 // Single quotes without any need for escaping.
163 if !strings.Contains(s, "'") {
164 return "'" + s + "'", nil
165 }
166
167 // The string contains single quotes,
168 // so fall back to double quotes.
169 b.WriteByte('"')
170 for _, r := range s {
171 switch r {
172 case '"', '\\', '`', '$':
173 b.WriteByte('\\')
174 }
175 b.WriteRune(r)
176 }
177 b.WriteByte('"')
178 return b.String(), nil
179}
180
// isHex reports whether r is an ASCII hexadecimal digit:
// 0-9, a-f, or A-F.
func isHex(r rune) bool {
	switch {
	case '0' <= r && r <= '9':
		return true
	case 'a' <= r && r <= 'f':
		return true
	case 'A' <= r && r <= 'F':
		return true
	}
	return false
}