pattern.go

  1// Copyright (c) 2017, Daniel MartΓ­ <mvdan@mvdan.cc>
  2// See LICENSE for licensing information
  3
  4// Package pattern allows working with shell pattern matching notation, also
  5// known as wildcards or globbing.
  6//
  7// For reference, see
  8// https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_13.
  9package pattern
 10
 11import (
 12	"bytes"
 13	"fmt"
 14	"regexp"
 15	"strconv"
 16	"strings"
 17)
 18
 19// Mode can be used to supply a number of options to the package's functions.
 20// Not all functions change their behavior with all of the options below.
 21type Mode uint
 22
 23type SyntaxError struct {
 24	msg string
 25	err error
 26}
 27
 28func (e SyntaxError) Error() string { return e.msg }
 29
 30func (e SyntaxError) Unwrap() error { return e.err }
 31
 32const (
 33	Shortest     Mode = 1 << iota // prefer the shortest match.
 34	Filenames                     // "*" and "?" don't match slashes; only "**" does
 35	Braces                        // support "{a,b}" and "{1..4}"
 36	EntireString                  // match the entire string using ^$ delimiters
 37	NoGlobCase                    // Do case-insensitive match (that is, use (?i) in the regexp)
 38)
 39
 40var numRange = regexp.MustCompile(`^([+-]?\d+)\.\.([+-]?\d+)}`)
 41
 42// Regexp turns a shell pattern into a regular expression that can be used with
 43// [regexp.Compile]. It will return an error if the input pattern was incorrect.
 44// Otherwise, the returned expression can be passed to [regexp.MustCompile].
 45//
 46// For example, Regexp(`foo*bar?`, true) returns `foo.*bar.`.
 47//
 48// Note that this function (and [QuoteMeta]) should not be directly used with file
 49// paths if Windows is supported, as the path separator on that platform is the
 50// same character as the escaping character for shell patterns.
 51func Regexp(pat string, mode Mode) (string, error) {
 52	needsEscaping := false
 53noopLoop:
 54	for _, r := range pat {
 55		switch r {
 56		// including those that need escaping since they are
 57		// regular expression metacharacters
 58		case '*', '?', '[', '\\', '.', '+', '(', ')', '|',
 59			']', '{', '}', '^', '$':
 60			needsEscaping = true
 61			break noopLoop
 62		}
 63	}
 64	if !needsEscaping && mode&EntireString == 0 { // short-cut without a string copy
 65		return pat, nil
 66	}
 67	closingBraces := []int{}
 68	var buf bytes.Buffer
 69	// Enable matching `\n` with the `.` metacharacter as globs match `\n`
 70	buf.WriteString("(?s)")
 71	dotMeta := false
 72	if mode&NoGlobCase != 0 {
 73		buf.WriteString("(?i)")
 74	}
 75	if mode&EntireString != 0 {
 76		buf.WriteString("^")
 77	}
 78writeLoop:
 79	for i := 0; i < len(pat); i++ {
 80		switch c := pat[i]; c {
 81		case '*':
 82			if mode&Filenames != 0 {
 83				if i++; i < len(pat) && pat[i] == '*' {
 84					if i++; i < len(pat) && pat[i] == '/' {
 85						buf.WriteString("(.*/|)")
 86						dotMeta = true
 87					} else {
 88						buf.WriteString(".*")
 89						dotMeta = true
 90						i--
 91					}
 92				} else {
 93					buf.WriteString("[^/]*")
 94					i--
 95				}
 96			} else {
 97				buf.WriteString(".*")
 98				dotMeta = true
 99			}
100			if mode&Shortest != 0 {
101				buf.WriteByte('?')
102			}
103		case '?':
104			if mode&Filenames != 0 {
105				buf.WriteString("[^/]")
106			} else {
107				buf.WriteByte('.')
108				dotMeta = true
109			}
110		case '\\':
111			if i++; i >= len(pat) {
112				return "", &SyntaxError{msg: `\ at end of pattern`}
113			}
114			buf.WriteString(regexp.QuoteMeta(string(pat[i])))
115		case '[':
116			name, err := charClass(pat[i:])
117			if err != nil {
118				return "", &SyntaxError{msg: "charClass invalid", err: err}
119			}
120			if name != "" {
121				buf.WriteString(name)
122				i += len(name) - 1
123				break
124			}
125			if mode&Filenames != 0 {
126				for _, c := range pat[i:] {
127					if c == ']' {
128						break
129					} else if c == '/' {
130						buf.WriteString("\\[")
131						continue writeLoop
132					}
133				}
134			}
135			buf.WriteByte(c)
136			if i++; i >= len(pat) {
137				return "", &SyntaxError{msg: "[ was not matched with a closing ]"}
138			}
139			switch c = pat[i]; c {
140			case '!', '^':
141				buf.WriteByte('^')
142				if i++; i >= len(pat) {
143					return "", &SyntaxError{msg: "[ was not matched with a closing ]"}
144				}
145			}
146			if c = pat[i]; c == ']' {
147				buf.WriteByte(']')
148				if i++; i >= len(pat) {
149					return "", &SyntaxError{msg: "[ was not matched with a closing ]"}
150				}
151			}
152			rangeStart := byte(0)
153		loopBracket:
154			for ; i < len(pat); i++ {
155				c = pat[i]
156				buf.WriteByte(c)
157				switch c {
158				case '\\':
159					if i++; i < len(pat) {
160						buf.WriteByte(pat[i])
161					}
162					continue
163				case ']':
164					break loopBracket
165				}
166				if rangeStart != 0 && rangeStart > c {
167					return "", &SyntaxError{msg: fmt.Sprintf("invalid range: %c-%c", rangeStart, c)}
168				}
169				if c == '-' {
170					rangeStart = pat[i-1]
171				} else {
172					rangeStart = 0
173				}
174			}
175			if i >= len(pat) {
176				return "", &SyntaxError{msg: "[ was not matched with a closing ]"}
177			}
178		case '{':
179			if mode&Braces == 0 {
180				buf.WriteString(regexp.QuoteMeta(string(c)))
181				break
182			}
183			innerLevel := 1
184			commas := false
185		peekBrace:
186			for j := i + 1; j < len(pat); j++ {
187				switch c := pat[j]; c {
188				case '{':
189					innerLevel++
190				case ',':
191					commas = true
192				case '\\':
193					j++
194				case '}':
195					if innerLevel--; innerLevel > 0 {
196						continue
197					}
198					if !commas {
199						break peekBrace
200					}
201					closingBraces = append(closingBraces, j)
202					buf.WriteString("(?:")
203					continue writeLoop
204				}
205			}
206			if match := numRange.FindStringSubmatch(pat[i+1:]); len(match) == 3 {
207				start, err1 := strconv.Atoi(match[1])
208				end, err2 := strconv.Atoi(match[2])
209				if err1 != nil || err2 != nil || start > end {
210					return "", &SyntaxError{msg: fmt.Sprintf("invalid range: %q", match[0])}
211				}
212				// TODO: can we do better here?
213				buf.WriteString("(?:")
214				for n := start; n <= end; n++ {
215					if n > start {
216						buf.WriteByte('|')
217					}
218					fmt.Fprintf(&buf, "%d", n)
219				}
220				buf.WriteByte(')')
221				i += len(match[0])
222				break
223			}
224			buf.WriteString(regexp.QuoteMeta(string(c)))
225		case ',':
226			if len(closingBraces) == 0 {
227				buf.WriteString(regexp.QuoteMeta(string(c)))
228			} else {
229				buf.WriteByte('|')
230			}
231		case '}':
232			if len(closingBraces) > 0 && closingBraces[len(closingBraces)-1] == i {
233				buf.WriteByte(')')
234				closingBraces = closingBraces[:len(closingBraces)-1]
235			} else {
236				buf.WriteString(regexp.QuoteMeta(string(c)))
237			}
238		default:
239			if c > 128 {
240				buf.WriteByte(c)
241			} else {
242				buf.WriteString(regexp.QuoteMeta(string(c)))
243			}
244		}
245	}
246	if mode&EntireString != 0 {
247		buf.WriteString("$")
248	}
249	// No `.` metacharacters were used, so don't return the (?s) flag.
250	if !dotMeta {
251		return string(buf.Bytes()[4:]), nil
252	}
253	return buf.String(), nil
254}
255
256func charClass(s string) (string, error) {
257	if strings.HasPrefix(s, "[[.") || strings.HasPrefix(s, "[[=") {
258		return "", fmt.Errorf("collating features not available")
259	}
260	if !strings.HasPrefix(s, "[[:") {
261		return "", nil
262	}
263	name := s[3:]
264	end := strings.Index(name, ":]]")
265	if end < 0 {
266		return "", fmt.Errorf("[[: was not matched with a closing :]]")
267	}
268	name = name[:end]
269	switch name {
270	case "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
271		"lower", "print", "punct", "space", "upper", "word", "xdigit":
272	default:
273		return "", fmt.Errorf("invalid character class: %q", name)
274	}
275	return s[:len(name)+6], nil
276}
277
278// HasMeta returns whether a string contains any unescaped pattern
279// metacharacters: '*', '?', or '['. When the function returns false, the given
280// pattern can only match at most one string.
281//
282// For example, HasMeta(`foo\*bar`) returns false, but HasMeta(`foo*bar`)
283// returns true.
284//
285// This can be useful to avoid extra work, like [Regexp]. Note that this
286// function cannot be used to avoid [QuoteMeta], as backslashes are quoted by
287// that function but ignored here.
288func HasMeta(pat string, mode Mode) bool {
289	for i := 0; i < len(pat); i++ {
290		switch pat[i] {
291		case '\\':
292			i++
293		case '*', '?', '[':
294			return true
295		case '{':
296			if mode&Braces != 0 {
297				return true
298			}
299		}
300	}
301	return false
302}
303
304// QuoteMeta returns a string that quotes all pattern metacharacters in the
305// given text. The returned string is a pattern that matches the literal text.
306//
307// For example, QuoteMeta(`foo*bar?`) returns `foo\*bar\?`.
308func QuoteMeta(pat string, mode Mode) string {
309	needsEscaping := false
310loop:
311	for _, r := range pat {
312		switch r {
313		case '{':
314			if mode&Braces == 0 {
315				continue
316			}
317			fallthrough
318		case '*', '?', '[', '\\':
319			needsEscaping = true
320			break loop
321		}
322	}
323	if !needsEscaping { // short-cut without a string copy
324		return pat
325	}
326	var sb strings.Builder
327	for _, r := range pat {
328		switch r {
329		case '*', '?', '[', '\\':
330			sb.WriteByte('\\')
331		case '{':
332			if mode&Braces != 0 {
333				sb.WriteByte('\\')
334			}
335		}
336		sb.WriteRune(r)
337	}
338	return sb.String()
339}