1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package language
6
7import (
8 "bytes"
9 "errors"
10 "fmt"
11 "sort"
12
13 "golang.org/x/text/internal/tag"
14)
15
// isAlpha reports whether b is a letter, given that b is already known to be
// an ASCII letter or digit. It only checks that b sorts after '9', so it must
// not be used as a general-purpose letter test.
func isAlpha(b byte) bool {
	const lastDigit = '9'
	return lastDigit < b
}
21
// isAlphaNum reports whether every byte of s is an ASCII letter or an ASCII
// digit. An empty slice yields true.
func isAlphaNum(s []byte) bool {
	for i := range s {
		switch c := s[i]; {
		case 'a' <= c && c <= 'z':
		case 'A' <= c && c <= 'Z':
		case '0' <= c && c <= '9':
		default:
			return false
		}
	}
	return true
}
31
var (
	// ErrSyntax is the error the parsing functions return for input that is
	// not well-formed according to BCP 47.
	// TODO: return the position at which the syntax error occurred?
	ErrSyntax = errors.New("language: tag is not well-formed")

	// ErrDuplicateKey is the error reported when the -u section of a tag
	// repeats a key with a different value.
	ErrDuplicateKey = errors.New("language: different values for same key in -u extension")
)
40
// ValueError is the error the parsing functions return when the input is
// well-formed but one of its subtags is not recognized as a valid value.
type ValueError struct {
	// v holds the offending subtag, padded with trailing zero bytes.
	v [8]byte
}

// NewValueError creates a ValueError for the given subtag. At most the first
// eight bytes of tag are retained.
func NewValueError(tag []byte) ValueError {
	e := ValueError{}
	copy(e.v[:], tag)
	return e
}

// tag returns the stored subtag up to, but not including, the first zero byte.
func (e ValueError) tag() []byte {
	if n := bytes.IndexByte(e.v[:], 0); n >= 0 {
		return e.v[:n]
	}
	return e.v[:]
}

// Error implements the error interface.
func (e ValueError) Error() string {
	const format = "language: subtag %q is well-formed but unknown"
	return fmt.Sprintf(format, e.tag())
}

// Subtag returns the subtag for which the error occurred.
func (e ValueError) Subtag() string {
	sub := e.tag()
	return string(sub)
}
72
// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
// It keeps the input in a byte buffer that the scanning routines edit in
// place, together with the position of the current token.
type scanner struct {
	// b is the input buffer. It is modified in place by the scanner routines.
	b []byte

	// bytes is inline storage; makeScannerString points b at it when the
	// input string is short enough to fit.
	bytes [max99thPercentileSize]byte

	// token is the current token, or nil if scan did not find one.
	token []byte

	start int // start position of the current token

	end int // end position of the current token

	next int // next point for scan

	// err is the error recorded through setError.
	err error

	// done is set by scan once it has run out of input.
	done bool
}
84
85func makeScannerString(s string) scanner {
86 scan := scanner{}
87 if len(s) <= len(scan.bytes) {
88 scan.b = scan.bytes[:copy(scan.bytes[:], s)]
89 } else {
90 scan.b = []byte(s)
91 }
92 scan.init()
93 return scan
94}
95
96// makeScanner returns a scanner using b as the input buffer.
97// b is not copied and may be modified by the scanner routines.
98func makeScanner(b []byte) scanner {
99 scan := scanner{b: b}
100 scan.init()
101 return scan
102}
103
104func (s *scanner) init() {
105 for i, c := range s.b {
106 if c == '_' {
107 s.b[i] = '-'
108 }
109 }
110 s.scan()
111}
112
113// restToLower converts the string between start and end to lower case.
114func (s *scanner) toLower(start, end int) {
115 for i := start; i < end; i++ {
116 c := s.b[i]
117 if 'A' <= c && c <= 'Z' {
118 s.b[i] += 'a' - 'A'
119 }
120 }
121}
122
123func (s *scanner) setError(e error) {
124 if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) {
125 s.err = e
126 }
127}
128
129// resizeRange shrinks or grows the array at position oldStart such that
130// a new string of size newSize can fit between oldStart and oldEnd.
131// Sets the scan point to after the resized range.
132func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
133 s.start = oldStart
134 if end := oldStart + newSize; end != oldEnd {
135 diff := end - oldEnd
136 var b []byte
137 if n := len(s.b) + diff; n > cap(s.b) {
138 b = make([]byte, n)
139 copy(b, s.b[:oldStart])
140 } else {
141 b = s.b[:n]
142 }
143 copy(b[end:], s.b[oldEnd:])
144 s.b = b
145 s.next = end + (s.next - s.end)
146 s.end = end
147 }
148}
149
150// replace replaces the current token with repl.
151func (s *scanner) replace(repl string) {
152 s.resizeRange(s.start, s.end, len(repl))
153 copy(s.b[s.start:], repl)
154}
155
156// gobble removes the current token from the input.
157// Caller must call scan after calling gobble.
158func (s *scanner) gobble(e error) {
159 s.setError(e)
160 if s.start == 0 {
161 s.b = s.b[:+copy(s.b, s.b[s.next:])]
162 s.end = 0
163 } else {
164 s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
165 s.end = s.start - 1
166 }
167 s.next = s.start
168}
169
170// deleteRange removes the given range from s.b before the current token.
171func (s *scanner) deleteRange(start, end int) {
172 s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
173 diff := end - start
174 s.next -= diff
175 s.start -= diff
176 s.end -= diff
177}
178
179// scan parses the next token of a BCP 47 string. Tokens that are larger
180// than 8 characters or include non-alphanumeric characters result in an error
181// and are gobbled and removed from the output.
182// It returns the end position of the last token consumed.
183func (s *scanner) scan() (end int) {
184 end = s.end
185 s.token = nil
186 for s.start = s.next; s.next < len(s.b); {
187 i := bytes.IndexByte(s.b[s.next:], '-')
188 if i == -1 {
189 s.end = len(s.b)
190 s.next = len(s.b)
191 i = s.end - s.start
192 } else {
193 s.end = s.next + i
194 s.next = s.end + 1
195 }
196 token := s.b[s.start:s.end]
197 if i < 1 || i > 8 || !isAlphaNum(token) {
198 s.gobble(ErrSyntax)
199 continue
200 }
201 s.token = token
202 return end
203 }
204 if n := len(s.b); n > 0 && s.b[n-1] == '-' {
205 s.setError(ErrSyntax)
206 s.b = s.b[:len(s.b)-1]
207 }
208 s.done = true
209 return end
210}
211
212// acceptMinSize parses multiple tokens of the given size or greater.
213// It returns the end position of the last token consumed.
214func (s *scanner) acceptMinSize(min int) (end int) {
215 end = s.end
216 s.scan()
217 for ; len(s.token) >= min; s.scan() {
218 end = s.end
219 }
220 return end
221}
222
223// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
224// failed it returns an error and any part of the tag that could be parsed.
225// If parsing succeeded but an unknown value was found, it returns
226// ValueError. The Tag returned in this case is just stripped of the unknown
227// value. All other values are preserved. It accepts tags in the BCP 47 format
228// and extensions to this standard defined in
229// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
230func Parse(s string) (t Tag, err error) {
231 // TODO: consider supporting old-style locale key-value pairs.
232 if s == "" {
233 return Und, ErrSyntax
234 }
235 defer func() {
236 if recover() != nil {
237 t = Und
238 err = ErrSyntax
239 return
240 }
241 }()
242 if len(s) <= maxAltTaglen {
243 b := [maxAltTaglen]byte{}
244 for i, c := range s {
245 // Generating invalid UTF-8 is okay as it won't match.
246 if 'A' <= c && c <= 'Z' {
247 c += 'a' - 'A'
248 } else if c == '_' {
249 c = '-'
250 }
251 b[i] = byte(c)
252 }
253 if t, ok := grandfathered(b); ok {
254 return t, nil
255 }
256 }
257 scan := makeScannerString(s)
258 return parse(&scan, s)
259}
260
261func parse(scan *scanner, s string) (t Tag, err error) {
262 t = Und
263 var end int
264 if n := len(scan.token); n <= 1 {
265 scan.toLower(0, len(scan.b))
266 if n == 0 || scan.token[0] != 'x' {
267 return t, ErrSyntax
268 }
269 end = parseExtensions(scan)
270 } else if n >= 4 {
271 return Und, ErrSyntax
272 } else { // the usual case
273 t, end = parseTag(scan, true)
274 if n := len(scan.token); n == 1 {
275 t.pExt = uint16(end)
276 end = parseExtensions(scan)
277 } else if end < len(scan.b) {
278 scan.setError(ErrSyntax)
279 scan.b = scan.b[:end]
280 }
281 }
282 if int(t.pVariant) < len(scan.b) {
283 if end < len(s) {
284 s = s[:end]
285 }
286 if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
287 t.str = s
288 } else {
289 t.str = string(scan.b)
290 }
291 } else {
292 t.pVariant, t.pExt = 0, 0
293 }
294 return t, scan.err
295}
296
// parseTag parses language, script, region and variants.
// It returns a Tag and the end position in the input that was parsed.
// If doNorm is true, then <lang>-<extlang> will be normalized to <extlang>.
//
// The scanner must be positioned on the language subtag. The scanner's buffer
// is rewritten while parsing: recognized language and region subtags are
// replaced by the string form of their IDs, unrecognized script and region
// subtags are gobbled, and the remainder is lower-cased. Errors are recorded
// on the scanner rather than returned.
func parseTag(scan *scanner, doNorm bool) (t Tag, end int) {
	var e error
	// TODO: set an error if an unknown lang, script or region is encountered.
	t.LangID, e = getLangID(scan.token)
	scan.setError(e)
	// Overwrite the language subtag with the string form of the ID found.
	scan.replace(t.LangID.String())
	langStart := scan.start
	end = scan.scan()
	// Every following three-byte subtag that starts with a letter is handled
	// as an extlang.
	for len(scan.token) == 3 && isAlpha(scan.token[0]) {
		// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
		// to a tag of the form <extlang>.
		if doNorm {
			lang, e := getLangID(scan.token)
			if lang != 0 {
				// Write the extlang's language string over the primary
				// language, followed by a separator, and move scan.start just
				// past it so that the gobble below drops everything from
				// that separator up to the end of the extlang token.
				t.LangID = lang
				langStr := lang.String()
				copy(scan.b[langStart:], langStr)
				scan.b[langStart+len(langStr)] = '-'
				scan.start = langStart + len(langStr) + 1
			}
			scan.gobble(e)
		}
		end = scan.scan()
	}
	// A four-byte subtag starting with a letter is tried as a script.
	if len(scan.token) == 4 && isAlpha(scan.token[0]) {
		t.ScriptID, e = getScriptID(script, scan.token)
		if t.ScriptID == 0 {
			scan.gobble(e)
		}
		end = scan.scan()
	}
	// A subtag of two or three bytes is tried as a region.
	if n := len(scan.token); n >= 2 && n <= 3 {
		t.RegionID, e = getRegionID(scan.token)
		if t.RegionID == 0 {
			scan.gobble(e)
		} else {
			scan.replace(t.RegionID.String())
		}
		end = scan.scan()
	}
	// Everything from the current token onwards is lower-cased.
	scan.toLower(scan.start, len(scan.b))
	// pVariant records where the variants start, pExt where they end.
	t.pVariant = byte(end)
	end = parseVariants(scan, end, t)
	t.pExt = uint16(end)
	return t, end
}
346
// separator is the subtag delimiter passed to bytes.Join when variants,
// attributes, keys and extensions are reassembled.
var separator = []byte{'-'}
348
// parseVariants scans tokens as long as each token is a valid variant string.
// Duplicate variants are removed.
//
// Scanning starts at the scanner's current token and continues while tokens
// have at least four bytes. end is the end position to return when no variant
// is accepted; otherwise the end position of the (possibly rewritten) variant
// section is returned. Unknown variants are gobbled and recorded on the
// scanner as a ValueError. The parameter t is not used by the body.
func parseVariants(scan *scanner, end int, t Tag) int {
	start := scan.start
	// Fixed-size arrays give the two slices below their initial backing
	// storage for up to four variants.
	varIDBuf := [4]uint8{}
	variantBuf := [4][]byte{}
	varID := varIDBuf[:0]
	variant := variantBuf[:0]
	// last is the highest variant index seen while the variants are still in
	// strictly increasing index order.
	last := -1
	needSort := false
	for ; len(scan.token) >= 4; scan.scan() {
		// TODO: measure the impact of needing this conversion and redesign
		// the data structure if there is an issue.
		v, ok := variantIndex[string(scan.token)]
		if !ok {
			// unknown variant
			// TODO: allow user-defined variants?
			scan.gobble(NewValueError(scan.token))
			continue
		}
		varID = append(varID, v)
		variant = append(variant, scan.token)
		if !needSort {
			if last < int(v) {
				last = int(v)
			} else {
				// This variant's index is not greater than the previous
				// one: the list is out of order or contains a duplicate.
				needSort = true
				// There is no legal combinations of more than 7 variants
				// (and this is by no means a useful sequence).
				// NOTE(review): this length check runs only on the iteration
				// where needSort first becomes true; confirm that this is
				// the intended bound.
				const maxVariants = 8
				if len(varID) > maxVariants {
					break
				}
			}
		}
		end = scan.end
	}
	if needSort {
		sort.Sort(variantsSort{varID, variant})
		// Compact the sorted list in place, keeping one entry per index.
		k, l := 0, -1
		for i, v := range varID {
			w := int(v)
			if l == w {
				// Remove duplicates.
				continue
			}
			varID[k] = varID[i]
			variant[k] = variant[i]
			k++
			l = w
		}
		if str := bytes.Join(variant[:k], separator); len(str) == 0 {
			end = start - 1
		} else {
			// Write the sorted, deduplicated variants back over the
			// original variant section of the buffer.
			scan.resizeRange(start, end, len(str))
			copy(scan.b[scan.start:], str)
			end = scan.end
		}
	}
	return end
}
410
// variantsSort implements sort.Interface over two parallel slices. Elements
// are ordered by the values in i, and v is permuted along with it.
type variantsSort struct {
	i []uint8  // sort keys
	v [][]byte // values kept parallel to i
}

// Len returns the number of sort keys.
func (s variantsSort) Len() int { return len(s.i) }

// Swap exchanges elements i and j in both slices.
func (s variantsSort) Swap(i, j int) {
	keys, vals := s.i, s.v
	keys[i], keys[j] = keys[j], keys[i]
	vals[i], vals[j] = vals[j], vals[i]
}

// Less reports whether the key at i is smaller than the key at j.
func (s variantsSort) Less(i, j int) bool {
	a, b := s.i[i], s.i[j]
	return a < b
}
428
// bytesSort implements sort.Interface for a list of byte slices that are
// ordered by their first n bytes only. Every slice that gets compared must
// hold at least n bytes.
type bytesSort struct {
	b [][]byte
	n int // first n bytes to compare
}

// Len returns the number of slices.
func (b bytesSort) Len() int { return len(b.b) }

// Swap exchanges the slices at i and j.
func (b bytesSort) Swap(i, j int) {
	list := b.b
	list[i], list[j] = list[j], list[i]
}

// Less reports whether the first n bytes of element i sort before the first
// n bytes of element j. Elements with identical prefixes are not Less.
func (b bytesSort) Less(i, j int) bool {
	for k := 0; k < b.n; k++ {
		x, y := b.b[i][k], b.b[j][k]
		if x != y {
			return x < y
		}
	}
	return false
}
451
// parseExtensions parses and normalizes the extensions in the buffer.
// It returns the last position of scan.b that is part of any extension.
// It also trims scan.b to remove excess parts accordingly.
//
// The scanner must be positioned on the single-byte token that introduces the
// first extension. Extensions are reordered by their first byte, with a
// private-use ('x') extension always placed last.
func parseExtensions(scan *scanner) int {
	start := scan.start
	exts := [][]byte{}
	private := []byte{}
	end := scan.end
	// Each pass handles one extension, introduced by a single-byte token.
	for len(scan.token) == 1 {
		extStart := scan.start
		ext := scan.token[0]
		end = parseExtension(scan)
		extension := scan.b[extStart:end]
		if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
			// Too short to hold a body: report a syntax error and skip it.
			scan.setError(ErrSyntax)
			end = extStart
			continue
		} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
			// The first extension is private-use, or the scanner has reached
			// the end of the buffer: there is nothing to reorder.
			scan.b = scan.b[:end]
			return end
		} else if ext == 'x' {
			// A private-use extension ends the list.
			private = extension
			break
		}
		exts = append(exts, extension)
	}
	// Order the extensions by their first byte.
	sort.Sort(bytesSort{exts, 1})
	if len(private) > 0 {
		exts = append(exts, private)
	}
	// Rebuild the extension section of the buffer from the ordered list.
	scan.b = scan.b[:start]
	if len(exts) > 0 {
		scan.b = append(scan.b, bytes.Join(exts, separator)...)
	} else if start > 0 {
		// Strip trailing '-'.
		scan.b = scan.b[:start-1]
	}
	return end
}
491
// parseExtension parses a single extension and returns the position of
// the extension end.
//
// The scanner must be positioned on the single-byte token that introduces the
// extension; that byte selects how the body is parsed. For a 'u' extension
// the attributes and keys are sorted in place in the scanner's buffer.
func parseExtension(scan *scanner) int {
	start, end := scan.start, scan.end
	switch scan.token[0] {
	case 'u': // https://www.ietf.org/rfc/rfc6067.txt
		attrStart := end
		scan.scan()
		// Attributes are the leading tokens of more than two bytes.
		// NOTE(review): last starts out empty, so the comparison below
		// appears to hold already for the first attribute, which means the
		// re-scan and sort is always taken when an attribute is present.
		// Confirm whether that is intended.
		for last := []byte{}; len(scan.token) > 2; scan.scan() {
			if bytes.Compare(scan.token, last) != -1 {
				// Attributes are unsorted. Start over from scratch.
				p := attrStart + 1
				scan.next = p
				attrs := [][]byte{}
				for scan.scan(); len(scan.token) > 2; scan.scan() {
					attrs = append(attrs, scan.token)
					end = scan.end
				}
				// Sort by the first three bytes and write the attributes
				// back over their original position in the buffer.
				sort.Sort(bytesSort{attrs, 3})
				copy(scan.b[p:], bytes.Join(attrs, separator))
				break
			}
			last = scan.token
			end = scan.end
		}
		// Scan key-type sequences. A key is of length 2 and may be followed
		// by 0 or more "type" subtags from 3 to the maximum of 8 letters.
		var last, key []byte
		for attrEnd := end; len(scan.token) == 2; last = key {
			key = scan.token
			end = scan.end
			// Consume the type subtags that belong to this key. The check
			// end < scan.end stops the loop once scan no longer advances.
			for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
				end = scan.end
			}
			// TODO: check key value validity
			if bytes.Compare(key, last) != 1 || scan.err != nil {
				// We have an invalid key or the keys are not sorted.
				// Start scanning keys from scratch and reorder.
				p := attrEnd + 1
				scan.next = p
				keys := [][]byte{}
				// Collect each key together with its type subtags as one
				// slice of the buffer.
				for scan.scan(); len(scan.token) == 2; {
					keyStart := scan.start
					end = scan.end
					for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
						end = scan.end
					}
					keys = append(keys, scan.b[keyStart:end])
				}
				// Stable sort on the two-byte key, so that of several
				// entries with the same key the first one stays in front.
				sort.Stable(bytesSort{keys, 2})
				if n := len(keys); n > 0 {
					// Keep the first entry for every key. A later entry with
					// the same key but different content is reported as
					// ErrDuplicateKey.
					k := 0
					for i := 1; i < n; i++ {
						if !bytes.Equal(keys[k][:2], keys[i][:2]) {
							k++
							keys[k] = keys[i]
						} else if !bytes.Equal(keys[k], keys[i]) {
							scan.setError(ErrDuplicateKey)
						}
					}
					keys = keys[:k+1]
				}
				reordered := bytes.Join(keys, separator)
				if e := p + len(reordered); e < end {
					// Dropping duplicates made the section shorter: remove
					// the surplus bytes from the buffer.
					scan.deleteRange(e, end)
					end = e
				}
				copy(scan.b[p:], reordered)
				break
			}
		}
	case 't': // https://www.ietf.org/rfc/rfc6497.txt
		scan.scan()
		// A token of two or three bytes whose second byte is a letter starts
		// an embedded tag, which is parsed without extlang normalization.
		if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
			_, end = parseTag(scan, false)
			scan.toLower(start, end)
		}
		// Each two-byte token whose second byte is a digit is followed by
		// tokens of at least three bytes.
		for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
			end = scan.acceptMinSize(3)
		}
	case 'x':
		// Private use: accept every remaining token.
		end = scan.acceptMinSize(1)
	default:
		// Any other extension: accept tokens of two or more bytes.
		end = scan.acceptMinSize(2)
	}
	return end
}
579
580// getExtension returns the name, body and end position of the extension.
581func getExtension(s string, p int) (end int, ext string) {
582 if s[p] == '-' {
583 p++
584 }
585 if s[p] == 'x' {
586 return len(s), s[p:]
587 }
588 end = nextExtension(s, p)
589 return end, s[p:end]
590}
591
// nextExtension finds the next extension within the string, searching
// for the -<char>- pattern from position p. It returns the position of the
// leading '-' of that pattern, or len(s) if no pattern is found.
// In the vast majority of cases, language tags will have at most
// one extension and extensions tend to be small.
func nextExtension(s string, p int) int {
	limit := len(s) - 3
	for p < limit {
		switch {
		case s[p] != '-':
			p++
		case s[p+2] == '-':
			return p
		default:
			// A '-' that does not start the pattern: neither of the next two
			// bytes can start it either.
			p += 3
		}
	}
	return len(s)
}