1package spellcheck
2
3import (
4 "strings"
5 "sync"
6 "unicode"
7)
8
// Checker holds a loaded word set and reports whether tokens are known.
// Load replaces the fields below under the write lock of mu; Loaded,
// Language and Check read them under the read lock.
type Checker struct {
	// mu guards every other field of the struct.
	mu sync.RWMutex
	// words is the set of dictionary entries. Check looks words up by their
	// lowercased form, so entries are presumably stored lowercased by the
	// dictionary parser (not visible here — confirm).
	words map[string]struct{}
	// runes is the set of runes treated as the dictionary's alphabet.
	// coversWord looks letters up by their lowercase form; an empty set
	// disables the alphabet check.
	runes map[rune]struct{}
	// loaded is set by a successful Load. Until then Check accepts every word.
	loaded bool
	// language is the language code passed to the last successful Load.
	language string
}
17
18// NewChecker returns an empty checker. Load must be called before Check
19// returns useful results.
20func NewChecker() *Checker {
21 return &Checker{words: make(map[string]struct{}), runes: make(map[rune]struct{})}
22}
23
24// Load reads a dictionary file from disk and replaces the current word set.
25func (c *Checker) Load(path, language string) error {
26 w, runes, err := parseHunspellDic(path)
27 if err != nil {
28 return err
29 }
30 c.mu.Lock()
31 c.words = w
32 c.runes = runes
33 c.loaded = true
34 c.language = language
35 c.mu.Unlock()
36 return nil
37}
38
39// LoadLang loads the dictionary for the given language code from the
40// configured dicts directory.
41func (c *Checker) LoadLang(lang string) error {
42 path, err := DictPath(lang)
43 if err != nil {
44 return err
45 }
46 return c.Load(path, lang)
47}
48
49// Loaded reports whether the checker has a dictionary ready.
50func (c *Checker) Loaded() bool {
51 c.mu.RLock()
52 defer c.mu.RUnlock()
53 return c.loaded
54}
55
56// Language returns the language code of the loaded dictionary.
57func (c *Checker) Language() string {
58 c.mu.RLock()
59 defer c.mu.RUnlock()
60 return c.language
61}
62
63// Check reports whether the word is recognised. Words shorter than 2 runes,
64// numeric, or containing only punctuation are always treated as correct.
65// Words that contain letter runes outside the loaded dictionary's
66// alphabet (e.g. Cyrillic text against an English dictionary, or accented
67// characters not present in the dictionary's base forms) are also treated
68// as correct — we have no signal to judge them.
69func (c *Checker) Check(word string) bool {
70 if !IsCheckable(word) {
71 return true
72 }
73 c.mu.RLock()
74 defer c.mu.RUnlock()
75 if !c.loaded {
76 return true
77 }
78 if !c.coversWord(word) {
79 return true
80 }
81 lower := strings.ToLower(word)
82 if _, ok := c.words[lower]; ok {
83 return true
84 }
85 // Strip a trailing apostrophe-suffix ('s, 'd, 'll, 're, 've, 't, 'm)
86 // so possessives and common contractions don't get flagged when the
87 // dictionary lists only the base form.
88 if idx := strings.IndexByte(lower, '\''); idx > 0 {
89 base := lower[:idx]
90 if _, ok := c.words[base]; ok {
91 return true
92 }
93 }
94 return false
95}
96
97// coversWord returns true when every letter rune in word is present in
98// the loaded dictionary's rune set. Caller must hold c.mu.
99func (c *Checker) coversWord(word string) bool {
100 if len(c.runes) == 0 {
101 return true
102 }
103 for _, r := range word {
104 if !unicode.IsLetter(r) {
105 continue
106 }
107 lr := unicode.ToLower(r)
108 if _, ok := c.runes[lr]; !ok {
109 return false
110 }
111 }
112 return true
113}
114
// IsCheckable reports whether word looks like a natural-language word worth
// spell-checking. It returns false for tokens shorter than two runes, for
// tokens containing '@', '/' or '\' (email-, URL- and path-like fragments),
// for tokens without any letter, for tokens containing a digit, and for
// tokens of at most five runes whose letters are all uppercase (likely
// acronyms). Everything else is checkable.
func IsCheckable(word string) bool {
	if strings.ContainsAny(word, "@/\\") {
		return false
	}
	var (
		total       int  // runes seen so far
		sawLetter   bool // at least one letter
		sawDigit    bool // at least one digit
		sawNonUpper bool // at least one letter that is not uppercase
	)
	for _, r := range word {
		total++
		if unicode.IsLetter(r) {
			sawLetter = true
			if !unicode.IsUpper(r) {
				sawNonUpper = true
			}
			continue
		}
		if unicode.IsDigit(r) {
			sawDigit = true
		}
	}
	if total < 2 || !sawLetter || sawDigit {
		return false
	}
	// Short tokens whose letters are all uppercase are most likely acronyms.
	return sawNonUpper || total > 5
}
151
// Token records a word and its byte offsets inside the original text.
// Tokenize fills the offsets as a half-open range, so for the string s a
// token was cut from, s[Start:End] equals Word.
type Token struct {
	// Word is the token text exactly as it appears in the source string.
	Word string
	// Start is the byte offset of the first byte of Word.
	Start int
	// End is the byte offset just past the last byte of Word.
	End int
}
158
159// Tokenize splits s into word tokens. A word is a maximal run of letters
160// optionally containing internal apostrophes or hyphens. Leading and
161// trailing connector characters are stripped.
162func Tokenize(s string) []Token {
163 var tokens []Token
164 start := -1
165 lastLetter := -1
166 for i, r := range s {
167 switch {
168 case unicode.IsLetter(r):
169 if start < 0 {
170 start = i
171 }
172 lastLetter = i + utf8RuneLen(r)
173 case start >= 0 && (r == '\'' || r == '’' || r == '-'):
174 // connector — keep word open
175 default:
176 if start >= 0 && lastLetter > start {
177 tokens = append(tokens, Token{Word: s[start:lastLetter], Start: start, End: lastLetter})
178 }
179 start = -1
180 lastLetter = -1
181 }
182 }
183 if start >= 0 && lastLetter > start {
184 tokens = append(tokens, Token{Word: s[start:lastLetter], Start: start, End: lastLetter})
185 }
186 return tokens
187}
188
// utf8RuneLen returns the number of bytes in the UTF-8 encoding of r,
// decided purely by the size of the code point: 1 below U+0080, 2 below
// U+0800, 3 below U+10000 and 4 otherwise. The rune is not validated, so
// negative values yield 1 and values past U+10FFFF yield 4.
func utf8RuneLen(r rune) int {
	if r < 0x80 {
		return 1
	}
	if r < 0x800 {
		return 2
	}
	if r < 0x10000 {
		return 3
	}
	return 4
}