checker.go

  1package spellcheck
  2
  3import (
  4	"strings"
  5	"sync"
  6	"unicode"
  7)
  8
  9// Checker holds a loaded word set and reports whether tokens are known.
 10type Checker struct {
 11	mu       sync.RWMutex
 12	words    map[string]struct{}
 13	runes    map[rune]struct{}
 14	loaded   bool
 15	language string
 16}
 17
 18// NewChecker returns an empty checker. Load must be called before Check
 19// returns useful results.
 20func NewChecker() *Checker {
 21	return &Checker{words: make(map[string]struct{}), runes: make(map[rune]struct{})}
 22}
 23
 24// Load reads a dictionary file from disk and replaces the current word set.
 25func (c *Checker) Load(path, language string) error {
 26	w, runes, err := parseHunspellDic(path)
 27	if err != nil {
 28		return err
 29	}
 30	c.mu.Lock()
 31	c.words = w
 32	c.runes = runes
 33	c.loaded = true
 34	c.language = language
 35	c.mu.Unlock()
 36	return nil
 37}
 38
 39// LoadLang loads the dictionary for the given language code from the
 40// configured dicts directory.
 41func (c *Checker) LoadLang(lang string) error {
 42	path, err := DictPath(lang)
 43	if err != nil {
 44		return err
 45	}
 46	return c.Load(path, lang)
 47}
 48
 49// Loaded reports whether the checker has a dictionary ready.
 50func (c *Checker) Loaded() bool {
 51	c.mu.RLock()
 52	defer c.mu.RUnlock()
 53	return c.loaded
 54}
 55
 56// Language returns the language code of the loaded dictionary.
 57func (c *Checker) Language() string {
 58	c.mu.RLock()
 59	defer c.mu.RUnlock()
 60	return c.language
 61}
 62
 63// Check reports whether the word is recognised. Words shorter than 2 runes,
 64// numeric, or containing only punctuation are always treated as correct.
 65// Words that contain letter runes outside the loaded dictionary's
 66// alphabet (e.g. Cyrillic text against an English dictionary, or accented
 67// characters not present in the dictionary's base forms) are also treated
 68// as correct — we have no signal to judge them.
 69func (c *Checker) Check(word string) bool {
 70	if !IsCheckable(word) {
 71		return true
 72	}
 73	c.mu.RLock()
 74	defer c.mu.RUnlock()
 75	if !c.loaded {
 76		return true
 77	}
 78	if !c.coversWord(word) {
 79		return true
 80	}
 81	lower := strings.ToLower(word)
 82	if _, ok := c.words[lower]; ok {
 83		return true
 84	}
 85	// Strip a trailing apostrophe-suffix ('s, 'd, 'll, 're, 've, 't, 'm)
 86	// so possessives and common contractions don't get flagged when the
 87	// dictionary lists only the base form.
 88	if idx := strings.IndexByte(lower, '\''); idx > 0 {
 89		base := lower[:idx]
 90		if _, ok := c.words[base]; ok {
 91			return true
 92		}
 93	}
 94	return false
 95}
 96
 97// coversWord returns true when every letter rune in word is present in
 98// the loaded dictionary's rune set. Caller must hold c.mu.
 99func (c *Checker) coversWord(word string) bool {
100	if len(c.runes) == 0 {
101		return true
102	}
103	for _, r := range word {
104		if !unicode.IsLetter(r) {
105			continue
106		}
107		lr := unicode.ToLower(r)
108		if _, ok := c.runes[lr]; !ok {
109			return false
110		}
111	}
112	return true
113}
114
// IsCheckable reports whether word looks like natural-language text that
// is worth spell-checking. It returns false for:
//
//   - tokens shorter than two runes;
//   - tokens containing '@', '/' or '\' (URLs, paths, e-mail fragments);
//   - tokens with no letters at all, or with any digit;
//   - tokens of at most five runes whose letters are all upper-case
//     (likely acronyms).
func IsCheckable(word string) bool {
	if strings.ContainsAny(word, "@/\\") {
		return false
	}

	var (
		count       int  // number of runes in word
		sawLetter   bool // at least one letter present
		sawDigit    bool // at least one digit present
		sawNonUpper bool // at least one letter that is not upper-case
	)
	for _, r := range word {
		count++
		if unicode.IsLetter(r) {
			sawLetter = true
			if !unicode.IsUpper(r) {
				sawNonUpper = true
			}
		} else if unicode.IsDigit(r) {
			sawDigit = true
		}
	}

	if count < 2 || !sawLetter || sawDigit {
		return false
	}
	// Short tokens made only of capitals are assumed to be acronyms.
	if !sawNonUpper && count <= 5 {
		return false
	}
	return true
}
151
// Token is one word found in a piece of text, together with where it was
// found.
type Token struct {
	// Word is the text of the token.
	Word string

	// Start is the byte offset of the token's first byte in the original
	// text.
	Start int
	// End is the byte offset just past the token's last byte, so that
	// text[Start:End] == Word for tokens produced by Tokenize.
	End int
}
158
159// Tokenize splits s into word tokens. A word is a maximal run of letters
160// optionally containing internal apostrophes or hyphens. Leading and
161// trailing connector characters are stripped.
162func Tokenize(s string) []Token {
163	var tokens []Token
164	start := -1
165	lastLetter := -1
166	for i, r := range s {
167		switch {
168		case unicode.IsLetter(r):
169			if start < 0 {
170				start = i
171			}
172			lastLetter = i + utf8RuneLen(r)
173		case start >= 0 && (r == '\'' || r == '’' || r == '-'):
174			// connector — keep word open
175		default:
176			if start >= 0 && lastLetter > start {
177				tokens = append(tokens, Token{Word: s[start:lastLetter], Start: start, End: lastLetter})
178			}
179			start = -1
180			lastLetter = -1
181		}
182	}
183	if start >= 0 && lastLetter > start {
184		tokens = append(tokens, Token{Word: s[start:lastLetter], Start: start, End: lastLetter})
185	}
186	return tokens
187}
188
// utf8RuneLen returns how many bytes the UTF-8 encoding of r occupies,
// based purely on the rune's magnitude. No validity check is performed:
// anything below 0x80 (including negative values) reports 1 and anything
// at or above 0x10000 reports 4.
func utf8RuneLen(r rune) int {
	if r < 0x80 {
		return 1
	}
	if r < 0x800 {
		return 2
	}
	if r < 0x10000 {
		return 3
	}
	return 4
}