fuzzy.go

  1/*
  2Package fuzzy provides fuzzy string matching optimized
  3for filenames and code symbols in the style of Sublime Text,
  4VSCode, IntelliJ IDEA et al.
  5*/
  6package fuzzy
  7
  8import (
  9	"sort"
 10	"unicode"
 11	"unicode/utf8"
 12)
 13
 14// Match represents a matched string.
 15type Match struct {
 16	// The matched string.
 17	Str string
 18	// The index of the matched string in the supplied slice.
 19	Index int
 20	// The indexes of matched characters. Useful for highlighting matches.
 21	MatchedIndexes []int
 22	// Score used to rank matches
 23	Score int
 24}
 25
 26const (
 27	firstCharMatchBonus            = 10
 28	matchFollowingSeparatorBonus   = 20
 29	camelCaseMatchBonus            = 20
 30	adjacentMatchBonus             = 5
 31	unmatchedLeadingCharPenalty    = -5
 32	maxUnmatchedLeadingCharPenalty = -15
 33)
 34
 35var separators = []rune("/-_ .\\")
 36
 37// Matches is a slice of Match structs
 38type Matches []Match
 39
 40func (a Matches) Len() int           { return len(a) }
 41func (a Matches) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 42func (a Matches) Less(i, j int) bool { return a[i].Score >= a[j].Score }
 43
 44// Source represents an abstract source of a list of strings. Source must be iterable type such as a slice.
 45// The source will be iterated over till Len() with String(i) being called for each element where i is the
 46// index of the element. You can find a working example in the README.
 47type Source interface {
 48	// The string to be matched at position i.
 49	String(i int) string
 50	// The length of the source. Typically is the length of the slice of things that you want to match.
 51	Len() int
 52}
 53
 54type stringSource []string
 55
 56func (ss stringSource) String(i int) string {
 57	return ss[i]
 58}
 59
 60func (ss stringSource) Len() int { return len(ss) }
 61
 62/*
 63Find looks up pattern in data and returns matches
 64in descending order of match quality. Match quality
 65is determined by a set of bonus and penalty rules.
 66
 67The following types of matches apply a bonus:
 68
 69* The first character in the pattern matches the first character in the match string.
 70
 71* The matched character is camel cased.
 72
 73* The matched character follows a separator such as an underscore character.
 74
 75* The matched character is adjacent to a previous match.
 76
 77Penalties are applied for every character in the search string that wasn't matched and all leading
 78characters upto the first match.
 79
 80Results are sorted by best match.
 81*/
 82func Find(pattern string, data []string) Matches {
 83	return FindFrom(pattern, stringSource(data))
 84}
 85
 86/*
 87FindNoSort is an alternative Find implementation that does not sort
 88the results in the end.
 89*/
 90func FindNoSort(pattern string, data []string) Matches {
 91	return FindFromNoSort(pattern, stringSource(data))
 92}
 93
 94/*
 95FindFrom is an alternative implementation of Find using a Source
 96instead of a list of strings.
 97*/
 98func FindFrom(pattern string, data Source) Matches {
 99	matches := FindFromNoSort(pattern, data)
100	sort.Stable(matches)
101	return matches
102}
103
104/*
105FindFromNoSort is an alternative FindFrom implementation that does
106not sort results in the end.
107*/
108func FindFromNoSort(pattern string, data Source) Matches {
109	if len(pattern) == 0 {
110		return nil
111	}
112	runes := []rune(pattern)
113	var matches Matches
114	var matchedIndexes []int
115	for i := 0; i < data.Len(); i++ {
116		var match Match
117		match.Str = data.String(i)
118		match.Index = i
119		if matchedIndexes != nil {
120			match.MatchedIndexes = matchedIndexes
121		} else {
122			match.MatchedIndexes = make([]int, 0, len(runes))
123		}
124		var score int
125		patternIndex := 0
126		bestScore := -1
127		matchedIndex := -1
128		currAdjacentMatchBonus := 0
129		var last rune
130		var lastIndex int
131		nextc, nextSize := utf8.DecodeRuneInString(data.String(i))
132		var candidate rune
133		var candidateSize int
134		for j := 0; j < len(data.String(i)); j += candidateSize {
135			candidate, candidateSize = nextc, nextSize
136			if equalFold(candidate, runes[patternIndex]) {
137				score = 0
138				if j == 0 {
139					score += firstCharMatchBonus
140				}
141				if unicode.IsLower(last) && unicode.IsUpper(candidate) {
142					score += camelCaseMatchBonus
143				}
144				if j != 0 && isSeparator(last) {
145					score += matchFollowingSeparatorBonus
146				}
147				if len(match.MatchedIndexes) > 0 {
148					lastMatch := match.MatchedIndexes[len(match.MatchedIndexes)-1]
149					bonus := adjacentCharBonus(lastIndex, lastMatch, currAdjacentMatchBonus)
150					score += bonus
151					// adjacent matches are incremental and keep increasing based on previous adjacent matches
152					// thus we need to maintain the current match bonus
153					currAdjacentMatchBonus += bonus
154				}
155				if score > bestScore {
156					bestScore = score
157					matchedIndex = j
158				}
159			}
160			var nextp rune
161			if patternIndex < len(runes)-1 {
162				nextp = runes[patternIndex+1]
163			}
164			if j+candidateSize < len(data.String(i)) {
165				if data.String(i)[j+candidateSize] < utf8.RuneSelf { // Fast path for ASCII
166					nextc, nextSize = rune(data.String(i)[j+candidateSize]), 1
167				} else {
168					nextc, nextSize = utf8.DecodeRuneInString(data.String(i)[j+candidateSize:])
169				}
170			} else {
171				nextc, nextSize = 0, 0
172			}
173			// We apply the best score when we have the next match coming up or when the search string has ended.
174			// Tracking when the next match is coming up allows us to exhaustively find the best match and not necessarily
175			// the first match.
176			// For example given the pattern "tk" and search string "The Black Knight", exhaustively matching allows us
177			// to match the second k thus giving this string a higher score.
178			if equalFold(nextp, nextc) || nextc == 0 {
179				if matchedIndex > -1 {
180					if len(match.MatchedIndexes) == 0 {
181						penalty := matchedIndex * unmatchedLeadingCharPenalty
182						bestScore += max(penalty, maxUnmatchedLeadingCharPenalty)
183					}
184					match.Score += bestScore
185					match.MatchedIndexes = append(match.MatchedIndexes, matchedIndex)
186					score = 0
187					bestScore = -1
188					patternIndex++
189				}
190			}
191			lastIndex = j
192			last = candidate
193		}
194		// apply penalty for each unmatched character
195		penalty := len(match.MatchedIndexes) - len(data.String(i))
196		match.Score += penalty
197		if len(match.MatchedIndexes) == len(runes) {
198			matches = append(matches, match)
199			matchedIndexes = nil
200		} else {
201			matchedIndexes = match.MatchedIndexes[:0] // Recycle match index slice
202		}
203	}
204	return matches
205}
206
207// Taken from strings.EqualFold
208func equalFold(tr, sr rune) bool {
209	if tr == sr {
210		return true
211	}
212	if tr < sr {
213		tr, sr = sr, tr
214	}
215	// Fast check for ASCII.
216	if tr < utf8.RuneSelf {
217		// ASCII, and sr is upper case.  tr must be lower case.
218		if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
219			return true
220		}
221		return false
222	}
223
224	// General case. SimpleFold(x) returns the next equivalent rune > x
225	// or wraps around to smaller values.
226	r := unicode.SimpleFold(sr)
227	for r != sr && r < tr {
228		r = unicode.SimpleFold(r)
229	}
230	return r == tr
231}
232
233func adjacentCharBonus(i int, lastMatch int, currentBonus int) int {
234	if lastMatch == i {
235		return currentBonus*2 + adjacentMatchBonus
236	}
237	return 0
238}
239
240func isSeparator(s rune) bool {
241	for _, sep := range separators {
242		if s == sep {
243			return true
244		}
245	}
246	return false
247}
248
249func max(x int, y int) int {
250	if x > y {
251		return x
252	}
253	return y
254}