suggest.go

  1package spellcheck
  2
  3import (
  4	"sort"
  5	"strings"
  6	"unicode"
  7)
  8
  9// Suggest returns up to limit candidate corrections for word, ranked by
 10// edit distance ascending then alphabetically. Returns nil when the
 11// checker has no dictionary loaded or when word is too short.
 12func (c *Checker) Suggest(word string, limit int) []string {
 13	if c == nil {
 14		return nil
 15	}
 16	c.mu.RLock()
 17	defer c.mu.RUnlock()
 18	if !c.loaded || len(c.words) == 0 {
 19		return nil
 20	}
 21	if limit <= 0 {
 22		limit = 5
 23	}
 24
 25	lower := strings.ToLower(word)
 26	wRunes := []rune(lower)
 27	if len(wRunes) < 2 {
 28		return nil
 29	}
 30
 31	// Allow up to 2 edits for short-to-medium words, 3 for longer ones.
 32	maxDist := 2
 33	if len(wRunes) >= 8 {
 34		maxDist = 3
 35	}
 36
 37	type cand struct {
 38		word string
 39		dist int
 40	}
 41	var cands []cand
 42
 43	for w := range c.words {
 44		// Length filter: prune impossible candidates without an alloc.
 45		ld := len(w) - len(lower)
 46		if ld < 0 {
 47			ld = -ld
 48		}
 49		if ld > maxDist {
 50			continue
 51		}
 52		// First-rune similarity prunes most mismatched candidates cheaply.
 53		if !firstRuneClose(w, lower) {
 54			continue
 55		}
 56		d := levenshtein(wRunes, []rune(w), maxDist)
 57		if d > maxDist {
 58			continue
 59		}
 60		cands = append(cands, cand{w, d})
 61	}
 62
 63	sort.Slice(cands, func(i, j int) bool {
 64		if cands[i].dist != cands[j].dist {
 65			return cands[i].dist < cands[j].dist
 66		}
 67		return cands[i].word < cands[j].word
 68	})
 69
 70	if len(cands) > limit {
 71		cands = cands[:limit]
 72	}
 73	out := make([]string, len(cands))
 74	upper := unicode.IsUpper([]rune(word)[0])
 75	for i, c := range cands {
 76		out[i] = matchCase(c.word, upper)
 77	}
 78	return out
 79}
 80
 81// firstRuneClose returns true when the first runes of a and b are equal,
 82// adjacent on a QWERTY keyboard, or one of them is missing.
 83func firstRuneClose(a, b string) bool {
 84	if a == "" || b == "" {
 85		return true
 86	}
 87	var ar, br rune
 88	for _, r := range a {
 89		ar = r
 90		break
 91	}
 92	for _, r := range b {
 93		br = r
 94		break
 95	}
 96	if ar == br {
 97		return true
 98	}
 99	return keyboardAdjacent(ar, br)
100}
101
102// keyboardAdjacent returns true when a and b are neighbours on a QWERTY
103// keyboard. Used purely to widen the candidate pool around typos like
104// "guzzy"→"fuzzy" (g↔f) without exploding the cost of suggestion.
105func keyboardAdjacent(a, b rune) bool {
106	neighbours := map[rune]string{
107		'a': "qwsz", 'b': "vghn", 'c': "xdfv", 'd': "serfcx", 'e': "wsdr",
108		'f': "drtgcv", 'g': "ftyhvb", 'h': "gyujnb", 'i': "ujko", 'j': "huikmn",
109		'k': "jiolm", 'l': "kop", 'm': "njk", 'n': "bhjm", 'o': "iklp",
110		'p': "ol", 'q': "wa", 'r': "edft", 's': "awedxz", 't': "rfgy",
111		'u': "yhji", 'v': "cfgb", 'w': "qase", 'x': "zsdc", 'y': "tghu",
112		'z': "asx",
113	}
114	a = unicode.ToLower(a)
115	b = unicode.ToLower(b)
116	if ns, ok := neighbours[a]; ok {
117		return strings.ContainsRune(ns, b)
118	}
119	return false
120}
121
122// levenshtein computes the edit distance between a and b, returning early
123// once the running minimum exceeds cutoff. Two-row dynamic programming
124// keeps the allocation small.
125func levenshtein(a, b []rune, cutoff int) int {
126	la, lb := len(a), len(b)
127	if la == 0 {
128		return lb
129	}
130	if lb == 0 {
131		return la
132	}
133	prev := make([]int, lb+1)
134	curr := make([]int, lb+1)
135	for j := 0; j <= lb; j++ {
136		prev[j] = j
137	}
138	for i := 1; i <= la; i++ {
139		curr[0] = i
140		minRow := curr[0]
141		for j := 1; j <= lb; j++ {
142			cost := 1
143			if a[i-1] == b[j-1] {
144				cost = 0
145			}
146			curr[j] = min3(curr[j-1]+1, prev[j]+1, prev[j-1]+cost)
147			if curr[j] < minRow {
148				minRow = curr[j]
149			}
150		}
151		if minRow > cutoff {
152			return cutoff + 1
153		}
154		prev, curr = curr, prev
155	}
156	return prev[lb]
157}
158
159func min3(a, b, c int) int {
160	m := a
161	if b < m {
162		m = b
163	}
164	if c < m {
165		m = c
166	}
167	return m
168}
169
170// matchCase capitalises the first rune of s when the original word
171// started with an uppercase letter, so suggestions blend back into the
172// user's writing.
173func matchCase(s string, upperFirst bool) string {
174	if !upperFirst || s == "" {
175		return s
176	}
177	r := []rune(s)
178	r[0] = unicode.ToUpper(r[0])
179	return string(r)
180}