regexp.go

/*
Package regexp2 is a regexp package that has an interface similar to Go's standard library regexp package but uses a
more full-featured regex engine behind the scenes.

It doesn't have the linear-time guarantees of RE2, but it allows backtracking and is compatible with Perl5 and .NET.
You'll likely be better off with the RE2 engine from the regexp package and should only use this if you
need to write very complex patterns or require compatibility with .NET.
*/
  9package regexp2
 10
 11import (
 12	"errors"
 13	"math"
 14	"strconv"
 15	"sync"
 16	"time"
 17
 18	"github.com/dlclark/regexp2/syntax"
 19)
 20
var (
	// DefaultMatchTimeout is the MatchTimeout that Compile assigns to every
	// new Regexp. It is the largest representable time.Duration, i.e.
	// effectively "forever".
	DefaultMatchTimeout = time.Duration(math.MaxInt64)
	// DefaultUnmarshalOptions are the options UnmarshalText passes to Compile
	// when rebuilding a Regexp from its text form.
	DefaultUnmarshalOptions = None
)
 27
// Regexp is the representation of a compiled regular expression.
// A Regexp is safe for concurrent use by multiple goroutines.
type Regexp struct {
	// A match will time out if it takes (approximately) more than
	// MatchTimeout. This is a safety check in case the match
	// encounters catastrophic backtracking.  The default value
	// (DefaultMatchTimeout) causes all time out checking to be
	// suppressed.
	MatchTimeout time.Duration

	// The fields below are populated by Compile and are read-only afterwards.
	pattern string       // source text exactly as passed to Compile
	options RegexOptions // options passed to Compile

	caps     map[int]int    // capture number -> index; nil is treated as the identity mapping
	capnames map[string]int // capture group name -> group number; nil when names are just decimal numbers
	capslist []string       // sorted list of capture group names; nil when groups are only numbered
	capsize  int            // size of the capture array

	code *syntax.Code // compiled program produced by syntax.Write

	// cache of machines for running regexp
	// NOTE(review): muRun presumably guards runner; the code that uses them
	// is not in this part of the file -- confirm.
	muRun  *sync.Mutex
	runner []*runner
}
 53
 54// Compile parses a regular expression and returns, if successful,
 55// a Regexp object that can be used to match against text.
 56func Compile(expr string, opt RegexOptions) (*Regexp, error) {
 57	// parse it
 58	tree, err := syntax.Parse(expr, syntax.RegexOptions(opt))
 59	if err != nil {
 60		return nil, err
 61	}
 62
 63	// translate it to code
 64	code, err := syntax.Write(tree)
 65	if err != nil {
 66		return nil, err
 67	}
 68
 69	// return it
 70	return &Regexp{
 71		pattern:      expr,
 72		options:      opt,
 73		caps:         code.Caps,
 74		capnames:     tree.Capnames,
 75		capslist:     tree.Caplist,
 76		capsize:      code.Capsize,
 77		code:         code,
 78		MatchTimeout: DefaultMatchTimeout,
 79		muRun:        &sync.Mutex{},
 80	}, nil
 81}
 82
 83// MustCompile is like Compile but panics if the expression cannot be parsed.
 84// It simplifies safe initialization of global variables holding compiled regular
 85// expressions.
 86func MustCompile(str string, opt RegexOptions) *Regexp {
 87	regexp, error := Compile(str, opt)
 88	if error != nil {
 89		panic(`regexp2: Compile(` + quote(str) + `): ` + error.Error())
 90	}
 91	return regexp
 92}
 93
// Escape adds backslashes to any special characters in the input string.
// It is a thin wrapper that returns the result of syntax.Escape.
func Escape(input string) string {
	return syntax.Escape(input)
}
 98
// Unescape removes any backslashes from previously-escaped special characters in the input string.
// It is a thin wrapper: the string and error come straight from syntax.Unescape.
func Unescape(input string) (string, error) {
	return syntax.Unescape(input)
}
103
// SetTimeoutCheckPeriod is a debug function that sets the frequency of the timeout goroutine's sleep cycle.
// Defaults to 100ms. The only benefit of setting this lower is that the 1 background goroutine that manages
// timeouts may exit slightly sooner after all the timeouts have expired. See Github issue #63
//
// NOTE(review): this is a plain assignment to the package-level clockPeriod
// with no synchronization visible here; presumably it should be called before
// any match with a timeout is started -- confirm against the clock code.
func SetTimeoutCheckPeriod(d time.Duration) {
	clockPeriod = d
}
110
// StopTimeoutClock should only be used in unit tests to prevent the timeout clock goroutine
// from appearing like a leaking goroutine. It simply calls the package's stopClock helper.
func StopTimeoutClock() {
	stopClock()
}
116
// String returns the source text used to compile the regular expression,
// exactly as it was passed to Compile.
func (re *Regexp) String() string {
	return re.pattern
}
121
// quote renders s as a Go string literal for use in messages. A raw
// (backquoted) literal is used whenever s can be represented that way;
// otherwise s is rendered as an escaped, double-quoted literal.
func quote(s string) string {
	if !strconv.CanBackquote(s) {
		return strconv.Quote(s)
	}
	return "`" + s + "`"
}
128
// RegexOptions impact the runtime and parsing behavior
// for each specific regex.  They are settable in code as well
// as in the regex pattern itself.
//
// The values are bit flags and can be combined with the | operator.
type RegexOptions int32

// Option flags accepted by Compile and MustCompile. The quoted letter in each
// trailing comment is presumably the flag's inline-pattern spelling -- confirm
// against the syntax package.
//
// Only None is declared with the RegexOptions type. Every other flag has its
// own initializer without a type, which makes it an untyped integer constant;
// it converts implicitly wherever a RegexOptions is expected.
const (
	None                    RegexOptions = 0x0
	IgnoreCase                           = 0x0001 // "i"
	Multiline                            = 0x0002 // "m"
	ExplicitCapture                      = 0x0004 // "n"
	Compiled                             = 0x0008 // "c"
	Singleline                           = 0x0010 // "s"
	IgnorePatternWhitespace              = 0x0020 // "x"
	RightToLeft                          = 0x0040 // "r"
	Debug                                = 0x0080 // "d"
	ECMAScript                           = 0x0100 // "e"
	RE2                                  = 0x0200 // RE2 (regexp package) compatibility mode
	Unicode                              = 0x0400 // "u"
)
148
// RightToLeft reports whether the RightToLeft bit is set in the options
// this Regexp was compiled with.
func (re *Regexp) RightToLeft() bool {
	return re.options&RightToLeft != 0
}
152
// Debug reports whether the Debug bit is set in the options this Regexp
// was compiled with.
func (re *Regexp) Debug() bool {
	return re.options&Debug != 0
}
156
// Replace searches the input string and replaces each match found with the replacement text.
// Count will limit the number of matches attempted and startAt will allow
// us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
// Set startAt and count to -1 to go through the whole string
//
// The replacement text is parsed into replacer data on every call, using this
// expression's capture tables and options. If that parse fails, the error is
// returned together with an empty string and no replacement is attempted.
func (re *Regexp) Replace(input, replacement string, startAt, count int) (string, error) {
	data, err := syntax.NewReplacerData(replacement, re.caps, re.capsize, re.capnames, syntax.RegexOptions(re.options))
	if err != nil {
		return "", err
	}
	//TODO: cache ReplacerData (it is currently rebuilt on each call)

	// A nil evaluator tells replace to use the parsed replacement data.
	return replace(re, data, nil, input, startAt, count)
}
170
// ReplaceFunc searches the input string and replaces each match found using the string from the evaluator
// Count will limit the number of matches attempted and startAt will allow
// us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
// Set startAt and count to -1 to go through the whole string.
//
// It calls the same replace helper as Replace, passing nil replacement data
// and the evaluator instead.
func (re *Regexp) ReplaceFunc(input string, evaluator MatchEvaluator, startAt, count int) (string, error) {
	return replace(re, nil, evaluator, input, startAt, count)
}
178
// FindStringMatch searches the input string for a Regexp match.
// The string is converted to a rune slice first, and the search is run with
// a start position of -1 (no explicit start).
func (re *Regexp) FindStringMatch(s string) (*Match, error) {
	// convert string to runes
	return re.run(false, -1, getRunes(s))
}
184
// FindRunesMatch searches the input rune slice for a Regexp match.
// The slice is handed to the matcher as-is (no copy is made here) with a
// start position of -1 (no explicit start).
func (re *Regexp) FindRunesMatch(r []rune) (*Match, error) {
	return re.run(false, -1, r)
}
189
190// FindStringMatchStartingAt searches the input string for a Regexp match starting at the startAt index
191func (re *Regexp) FindStringMatchStartingAt(s string, startAt int) (*Match, error) {
192	if startAt > len(s) {
193		return nil, errors.New("startAt must be less than the length of the input string")
194	}
195	r, startAt := re.getRunesAndStart(s, startAt)
196	if startAt == -1 {
197		// we didn't find our start index in the string -- that's a problem
198		return nil, errors.New("startAt must align to the start of a valid rune in the input string")
199	}
200
201	return re.run(false, startAt, r)
202}
203
// FindRunesMatchStartingAt searches the input rune slice for a Regexp match starting at the startAt index.
// startAt is an index into r (runes, not bytes). It is passed to the matcher
// without validation in this function.
func (re *Regexp) FindRunesMatchStartingAt(r []rune, startAt int) (*Match, error) {
	return re.run(false, startAt, r)
}
208
209// FindNextMatch returns the next match in the same input string as the match parameter.
210// Will return nil if there is no next match or if given a nil match.
211func (re *Regexp) FindNextMatch(m *Match) (*Match, error) {
212	if m == nil {
213		return nil, nil
214	}
215
216	// If previous match was empty, advance by one before matching to prevent
217	// infinite loop
218	startAt := m.textpos
219	if m.Length == 0 {
220		if m.textpos == len(m.text) {
221			return nil, nil
222		}
223
224		if re.RightToLeft() {
225			startAt--
226		} else {
227			startAt++
228		}
229	}
230	return re.run(false, startAt, m.text)
231}
232
233// MatchString return true if the string matches the regex
234// error will be set if a timeout occurs
235func (re *Regexp) MatchString(s string) (bool, error) {
236	m, err := re.run(true, -1, getRunes(s))
237	if err != nil {
238		return false, err
239	}
240	return m != nil, nil
241}
242
243func (re *Regexp) getRunesAndStart(s string, startAt int) ([]rune, int) {
244	if startAt < 0 {
245		if re.RightToLeft() {
246			r := getRunes(s)
247			return r, len(r)
248		}
249		return getRunes(s), 0
250	}
251	ret := make([]rune, len(s))
252	i := 0
253	runeIdx := -1
254	for strIdx, r := range s {
255		if strIdx == startAt {
256			runeIdx = i
257		}
258		ret[i] = r
259		i++
260	}
261	if startAt == len(s) {
262		runeIdx = i
263	}
264	return ret[:i], runeIdx
265}
266
// getRunes converts s into a newly allocated slice of its runes
// using Go's built-in string-to-[]rune conversion.
func getRunes(s string) []rune {
	return []rune(s)
}
270
271// MatchRunes return true if the runes matches the regex
272// error will be set if a timeout occurs
273func (re *Regexp) MatchRunes(r []rune) (bool, error) {
274	m, err := re.run(true, -1, r)
275	if err != nil {
276		return false, err
277	}
278	return m != nil, nil
279}
280
281// GetGroupNames Returns the set of strings used to name capturing groups in the expression.
282func (re *Regexp) GetGroupNames() []string {
283	var result []string
284
285	if re.capslist == nil {
286		result = make([]string, re.capsize)
287
288		for i := 0; i < len(result); i++ {
289			result[i] = strconv.Itoa(i)
290		}
291	} else {
292		result = make([]string, len(re.capslist))
293		copy(result, re.capslist)
294	}
295
296	return result
297}
298
299// GetGroupNumbers returns the integer group numbers corresponding to a group name.
300func (re *Regexp) GetGroupNumbers() []int {
301	var result []int
302
303	if re.caps == nil {
304		result = make([]int, re.capsize)
305
306		for i := 0; i < len(result); i++ {
307			result[i] = i
308		}
309	} else {
310		result = make([]int, len(re.caps))
311
312		for k, v := range re.caps {
313			result[v] = k
314		}
315	}
316
317	return result
318}
319
320// GroupNameFromNumber retrieves a group name that corresponds to a group number.
321// It will return "" for and unknown group number.  Unnamed groups automatically
322// receive a name that is the decimal string equivalent of its number.
323func (re *Regexp) GroupNameFromNumber(i int) string {
324	if re.capslist == nil {
325		if i >= 0 && i < re.capsize {
326			return strconv.Itoa(i)
327		}
328
329		return ""
330	}
331
332	if re.caps != nil {
333		var ok bool
334		if i, ok = re.caps[i]; !ok {
335			return ""
336		}
337	}
338
339	if i >= 0 && i < len(re.capslist) {
340		return re.capslist[i]
341	}
342
343	return ""
344}
345
346// GroupNumberFromName returns a group number that corresponds to a group name.
347// Returns -1 if the name is not a recognized group name.  Numbered groups
348// automatically get a group name that is the decimal string equivalent of its number.
349func (re *Regexp) GroupNumberFromName(name string) int {
350	// look up name if we have a hashtable of names
351	if re.capnames != nil {
352		if k, ok := re.capnames[name]; ok {
353			return k
354		}
355
356		return -1
357	}
358
359	// convert to an int if it looks like a number
360	result := 0
361	for i := 0; i < len(name); i++ {
362		ch := name[i]
363
364		if ch > '9' || ch < '0' {
365			return -1
366		}
367
368		result *= 10
369		result += int(ch - '0')
370	}
371
372	// return int if it's in range
373	if result >= 0 && result < re.capsize {
374		return result
375	}
376
377	return -1
378}
379
// MarshalText implements [encoding.TextMarshaler]. The output
// matches that of calling the [Regexp.String] method, i.e. only the pattern
// text is encoded. The returned error is always nil.
func (re *Regexp) MarshalText() ([]byte, error) {
	return []byte(re.String()), nil
}
385
// UnmarshalText implements [encoding.TextUnmarshaler] by calling
// [Compile] on the encoded value.
//
// The text is compiled with DefaultUnmarshalOptions. On success the receiver
// is overwritten in full with the freshly compiled expression, so any
// MatchTimeout previously set on it is replaced by the value Compile assigns
// (DefaultMatchTimeout). On a compile error the receiver is left untouched
// and the error is returned.
func (re *Regexp) UnmarshalText(text []byte) error {
	newRE, err := Compile(string(text), DefaultUnmarshalOptions)
	if err != nil {
		return err
	}
	// Replace every field of the receiver with the new expression's state.
	*re = *newRE
	return nil
}