1/*
Package regexp2 is a regexp package that has an interface similar to Go's standard library regexp package but uses a
more full-featured regex engine behind the scenes.
4
5It doesn't have constant time guarantees, but it allows backtracking and is compatible with Perl5 and .NET.
6You'll likely be better off with the RE2 engine from the regexp package and should only use this if you
7need to write very complex patterns or require compatibility with .NET.
8*/
9package regexp2
10
11import (
12 "errors"
13 "math"
14 "strconv"
15 "sync"
16 "time"
17
18 "github.com/dlclark/regexp2/syntax"
19)
20
21var (
22 // DefaultMatchTimeout used when running regexp matches -- "forever"
23 DefaultMatchTimeout = time.Duration(math.MaxInt64)
24 // DefaultUnmarshalOptions used when unmarshaling a regex from text
25 DefaultUnmarshalOptions = None
26)
27
28// Regexp is the representation of a compiled regular expression.
29// A Regexp is safe for concurrent use by multiple goroutines.
30type Regexp struct {
31 // A match will time out if it takes (approximately) more than
32 // MatchTimeout. This is a safety check in case the match
33 // encounters catastrophic backtracking. The default value
34 // (DefaultMatchTimeout) causes all time out checking to be
35 // suppressed.
36 MatchTimeout time.Duration
37
38 // read-only after Compile
39 pattern string // as passed to Compile
40 options RegexOptions // options
41
42 caps map[int]int // capnum->index
43 capnames map[string]int //capture group name -> index
44 capslist []string //sorted list of capture group names
45 capsize int // size of the capture array
46
47 code *syntax.Code // compiled program
48
49 // cache of machines for running regexp
50 muRun *sync.Mutex
51 runner []*runner
52}
53
54// Compile parses a regular expression and returns, if successful,
55// a Regexp object that can be used to match against text.
56func Compile(expr string, opt RegexOptions) (*Regexp, error) {
57 // parse it
58 tree, err := syntax.Parse(expr, syntax.RegexOptions(opt))
59 if err != nil {
60 return nil, err
61 }
62
63 // translate it to code
64 code, err := syntax.Write(tree)
65 if err != nil {
66 return nil, err
67 }
68
69 // return it
70 return &Regexp{
71 pattern: expr,
72 options: opt,
73 caps: code.Caps,
74 capnames: tree.Capnames,
75 capslist: tree.Caplist,
76 capsize: code.Capsize,
77 code: code,
78 MatchTimeout: DefaultMatchTimeout,
79 muRun: &sync.Mutex{},
80 }, nil
81}
82
83// MustCompile is like Compile but panics if the expression cannot be parsed.
84// It simplifies safe initialization of global variables holding compiled regular
85// expressions.
86func MustCompile(str string, opt RegexOptions) *Regexp {
87 regexp, error := Compile(str, opt)
88 if error != nil {
89 panic(`regexp2: Compile(` + quote(str) + `): ` + error.Error())
90 }
91 return regexp
92}
93
94// Escape adds backslashes to any special characters in the input string
95func Escape(input string) string {
96 return syntax.Escape(input)
97}
98
99// Unescape removes any backslashes from previously-escaped special characters in the input string
100func Unescape(input string) (string, error) {
101 return syntax.Unescape(input)
102}
103
104// SetTimeoutPeriod is a debug function that sets the frequency of the timeout goroutine's sleep cycle.
105// Defaults to 100ms. The only benefit of setting this lower is that the 1 background goroutine that manages
106// timeouts may exit slightly sooner after all the timeouts have expired. See Github issue #63
107func SetTimeoutCheckPeriod(d time.Duration) {
108 clockPeriod = d
109}
110
111// StopTimeoutClock should only be used in unit tests to prevent the timeout clock goroutine
112// from appearing like a leaking goroutine
113func StopTimeoutClock() {
114 stopClock()
115}
116
117// String returns the source text used to compile the regular expression.
118func (re *Regexp) String() string {
119 return re.pattern
120}
121
// quote renders s as a Go string literal: a raw (backquoted) literal when
// strconv.CanBackquote reports that s can be written that way unchanged,
// otherwise the interpreted literal produced by strconv.Quote.
func quote(s string) string {
	if !strconv.CanBackquote(s) {
		return strconv.Quote(s)
	}
	return "`" + s + "`"
}
128
// RegexOptions is a set of bit flags that influence how a regex is parsed
// and how it runs. The flags can be set in code as well as in the regex
// pattern itself.
type RegexOptions int32

// Each option occupies its own bit so that options can be OR-ed together.
// Only None carries the RegexOptions type explicitly; the remaining names are
// untyped constants, which lets them be combined and converted freely.
const (
	None                    RegexOptions = 0x0
	IgnoreCase                           = 1 << 0  // "i"
	Multiline                            = 1 << 1  // "m"
	ExplicitCapture                      = 1 << 2  // "n"
	Compiled                             = 1 << 3  // "c"
	Singleline                           = 1 << 4  // "s"
	IgnorePatternWhitespace              = 1 << 5  // "x"
	RightToLeft                          = 1 << 6  // "r"
	Debug                                = 1 << 7  // "d"
	ECMAScript                           = 1 << 8  // "e"
	RE2                                  = 1 << 9  // RE2 (regexp package) compatibility mode
	Unicode                              = 1 << 10 // "u"
)
148
149func (re *Regexp) RightToLeft() bool {
150 return re.options&RightToLeft != 0
151}
152
153func (re *Regexp) Debug() bool {
154 return re.options&Debug != 0
155}
156
157// Replace searches the input string and replaces each match found with the replacement text.
158// Count will limit the number of matches attempted and startAt will allow
159// us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
160// Set startAt and count to -1 to go through the whole string
161func (re *Regexp) Replace(input, replacement string, startAt, count int) (string, error) {
162 data, err := syntax.NewReplacerData(replacement, re.caps, re.capsize, re.capnames, syntax.RegexOptions(re.options))
163 if err != nil {
164 return "", err
165 }
166 //TODO: cache ReplacerData
167
168 return replace(re, data, nil, input, startAt, count)
169}
170
171// ReplaceFunc searches the input string and replaces each match found using the string from the evaluator
172// Count will limit the number of matches attempted and startAt will allow
173// us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
174// Set startAt and count to -1 to go through the whole string.
175func (re *Regexp) ReplaceFunc(input string, evaluator MatchEvaluator, startAt, count int) (string, error) {
176 return replace(re, nil, evaluator, input, startAt, count)
177}
178
179// FindStringMatch searches the input string for a Regexp match
180func (re *Regexp) FindStringMatch(s string) (*Match, error) {
181 // convert string to runes
182 return re.run(false, -1, getRunes(s))
183}
184
185// FindRunesMatch searches the input rune slice for a Regexp match
186func (re *Regexp) FindRunesMatch(r []rune) (*Match, error) {
187 return re.run(false, -1, r)
188}
189
190// FindStringMatchStartingAt searches the input string for a Regexp match starting at the startAt index
191func (re *Regexp) FindStringMatchStartingAt(s string, startAt int) (*Match, error) {
192 if startAt > len(s) {
193 return nil, errors.New("startAt must be less than the length of the input string")
194 }
195 r, startAt := re.getRunesAndStart(s, startAt)
196 if startAt == -1 {
197 // we didn't find our start index in the string -- that's a problem
198 return nil, errors.New("startAt must align to the start of a valid rune in the input string")
199 }
200
201 return re.run(false, startAt, r)
202}
203
204// FindRunesMatchStartingAt searches the input rune slice for a Regexp match starting at the startAt index
205func (re *Regexp) FindRunesMatchStartingAt(r []rune, startAt int) (*Match, error) {
206 return re.run(false, startAt, r)
207}
208
209// FindNextMatch returns the next match in the same input string as the match parameter.
210// Will return nil if there is no next match or if given a nil match.
211func (re *Regexp) FindNextMatch(m *Match) (*Match, error) {
212 if m == nil {
213 return nil, nil
214 }
215
216 // If previous match was empty, advance by one before matching to prevent
217 // infinite loop
218 startAt := m.textpos
219 if m.Length == 0 {
220 if m.textpos == len(m.text) {
221 return nil, nil
222 }
223
224 if re.RightToLeft() {
225 startAt--
226 } else {
227 startAt++
228 }
229 }
230 return re.run(false, startAt, m.text)
231}
232
233// MatchString return true if the string matches the regex
234// error will be set if a timeout occurs
235func (re *Regexp) MatchString(s string) (bool, error) {
236 m, err := re.run(true, -1, getRunes(s))
237 if err != nil {
238 return false, err
239 }
240 return m != nil, nil
241}
242
243func (re *Regexp) getRunesAndStart(s string, startAt int) ([]rune, int) {
244 if startAt < 0 {
245 if re.RightToLeft() {
246 r := getRunes(s)
247 return r, len(r)
248 }
249 return getRunes(s), 0
250 }
251 ret := make([]rune, len(s))
252 i := 0
253 runeIdx := -1
254 for strIdx, r := range s {
255 if strIdx == startAt {
256 runeIdx = i
257 }
258 ret[i] = r
259 i++
260 }
261 if startAt == len(s) {
262 runeIdx = i
263 }
264 return ret[:i], runeIdx
265}
266
// getRunes converts s into a slice of its runes.
func getRunes(s string) []rune {
	runes := []rune(s)
	return runes
}
270
271// MatchRunes return true if the runes matches the regex
272// error will be set if a timeout occurs
273func (re *Regexp) MatchRunes(r []rune) (bool, error) {
274 m, err := re.run(true, -1, r)
275 if err != nil {
276 return false, err
277 }
278 return m != nil, nil
279}
280
281// GetGroupNames Returns the set of strings used to name capturing groups in the expression.
282func (re *Regexp) GetGroupNames() []string {
283 var result []string
284
285 if re.capslist == nil {
286 result = make([]string, re.capsize)
287
288 for i := 0; i < len(result); i++ {
289 result[i] = strconv.Itoa(i)
290 }
291 } else {
292 result = make([]string, len(re.capslist))
293 copy(result, re.capslist)
294 }
295
296 return result
297}
298
299// GetGroupNumbers returns the integer group numbers corresponding to a group name.
300func (re *Regexp) GetGroupNumbers() []int {
301 var result []int
302
303 if re.caps == nil {
304 result = make([]int, re.capsize)
305
306 for i := 0; i < len(result); i++ {
307 result[i] = i
308 }
309 } else {
310 result = make([]int, len(re.caps))
311
312 for k, v := range re.caps {
313 result[v] = k
314 }
315 }
316
317 return result
318}
319
320// GroupNameFromNumber retrieves a group name that corresponds to a group number.
321// It will return "" for and unknown group number. Unnamed groups automatically
322// receive a name that is the decimal string equivalent of its number.
323func (re *Regexp) GroupNameFromNumber(i int) string {
324 if re.capslist == nil {
325 if i >= 0 && i < re.capsize {
326 return strconv.Itoa(i)
327 }
328
329 return ""
330 }
331
332 if re.caps != nil {
333 var ok bool
334 if i, ok = re.caps[i]; !ok {
335 return ""
336 }
337 }
338
339 if i >= 0 && i < len(re.capslist) {
340 return re.capslist[i]
341 }
342
343 return ""
344}
345
346// GroupNumberFromName returns a group number that corresponds to a group name.
347// Returns -1 if the name is not a recognized group name. Numbered groups
348// automatically get a group name that is the decimal string equivalent of its number.
349func (re *Regexp) GroupNumberFromName(name string) int {
350 // look up name if we have a hashtable of names
351 if re.capnames != nil {
352 if k, ok := re.capnames[name]; ok {
353 return k
354 }
355
356 return -1
357 }
358
359 // convert to an int if it looks like a number
360 result := 0
361 for i := 0; i < len(name); i++ {
362 ch := name[i]
363
364 if ch > '9' || ch < '0' {
365 return -1
366 }
367
368 result *= 10
369 result += int(ch - '0')
370 }
371
372 // return int if it's in range
373 if result >= 0 && result < re.capsize {
374 return result
375 }
376
377 return -1
378}
379
380// MarshalText implements [encoding.TextMarshaler]. The output
381// matches that of calling the [Regexp.String] method.
382func (re *Regexp) MarshalText() ([]byte, error) {
383 return []byte(re.String()), nil
384}
385
386// UnmarshalText implements [encoding.TextUnmarshaler] by calling
387// [Compile] on the encoded value.
388func (re *Regexp) UnmarshalText(text []byte) error {
389 newRE, err := Compile(string(text), DefaultUnmarshalOptions)
390 if err != nil {
391 return err
392 }
393 *re = *newRE
394 return nil
395}