parse.go

  1// Copyright 2013 The Go Authors. All rights reserved.
  2// Use of this source code is governed by a BSD-style
  3// license that can be found in the LICENSE file.
  4
  5package language
  6
  7import (
  8	"bytes"
  9	"errors"
 10	"fmt"
 11	"sort"
 12
 13	"golang.org/x/text/internal/tag"
 14)
 15
 16// isAlpha returns true if the byte is not a digit.
 17// b must be an ASCII letter or digit.
 18func isAlpha(b byte) bool {
 19	return b > '9'
 20}
 21
 22// isAlphaNum returns true if the string contains only ASCII letters or digits.
 23func isAlphaNum(s []byte) bool {
 24	for _, c := range s {
 25		if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
 26			return false
 27		}
 28	}
 29	return true
 30}
 31
 32// ErrSyntax is returned by any of the parsing functions when the
 33// input is not well-formed, according to BCP 47.
 34// TODO: return the position at which the syntax error occurred?
 35var ErrSyntax = errors.New("language: tag is not well-formed")
 36
 37// ErrDuplicateKey is returned when a tag contains the same key twice with
 38// different values in the -u section.
 39var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")
 40
 41// ValueError is returned by any of the parsing functions when the
 42// input is well-formed but the respective subtag is not recognized
 43// as a valid value.
 44type ValueError struct {
 45	v [8]byte
 46}
 47
 48// NewValueError creates a new ValueError.
 49func NewValueError(tag []byte) ValueError {
 50	var e ValueError
 51	copy(e.v[:], tag)
 52	return e
 53}
 54
 55func (e ValueError) tag() []byte {
 56	n := bytes.IndexByte(e.v[:], 0)
 57	if n == -1 {
 58		n = 8
 59	}
 60	return e.v[:n]
 61}
 62
 63// Error implements the error interface.
 64func (e ValueError) Error() string {
 65	return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
 66}
 67
 68// Subtag returns the subtag for which the error occurred.
 69func (e ValueError) Subtag() string {
 70	return string(e.tag())
 71}
 72
 73// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
 74type scanner struct {
 75	b     []byte
 76	bytes [max99thPercentileSize]byte
 77	token []byte
 78	start int // start position of the current token
 79	end   int // end position of the current token
 80	next  int // next point for scan
 81	err   error
 82	done  bool
 83}
 84
 85func makeScannerString(s string) scanner {
 86	scan := scanner{}
 87	if len(s) <= len(scan.bytes) {
 88		scan.b = scan.bytes[:copy(scan.bytes[:], s)]
 89	} else {
 90		scan.b = []byte(s)
 91	}
 92	scan.init()
 93	return scan
 94}
 95
 96// makeScanner returns a scanner using b as the input buffer.
 97// b is not copied and may be modified by the scanner routines.
 98func makeScanner(b []byte) scanner {
 99	scan := scanner{b: b}
100	scan.init()
101	return scan
102}
103
104func (s *scanner) init() {
105	for i, c := range s.b {
106		if c == '_' {
107			s.b[i] = '-'
108		}
109	}
110	s.scan()
111}
112
113// restToLower converts the string between start and end to lower case.
114func (s *scanner) toLower(start, end int) {
115	for i := start; i < end; i++ {
116		c := s.b[i]
117		if 'A' <= c && c <= 'Z' {
118			s.b[i] += 'a' - 'A'
119		}
120	}
121}
122
123func (s *scanner) setError(e error) {
124	if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) {
125		s.err = e
126	}
127}
128
129// resizeRange shrinks or grows the array at position oldStart such that
130// a new string of size newSize can fit between oldStart and oldEnd.
131// Sets the scan point to after the resized range.
132func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
133	s.start = oldStart
134	if end := oldStart + newSize; end != oldEnd {
135		diff := end - oldEnd
136		var b []byte
137		if n := len(s.b) + diff; n > cap(s.b) {
138			b = make([]byte, n)
139			copy(b, s.b[:oldStart])
140		} else {
141			b = s.b[:n]
142		}
143		copy(b[end:], s.b[oldEnd:])
144		s.b = b
145		s.next = end + (s.next - s.end)
146		s.end = end
147	}
148}
149
150// replace replaces the current token with repl.
151func (s *scanner) replace(repl string) {
152	s.resizeRange(s.start, s.end, len(repl))
153	copy(s.b[s.start:], repl)
154}
155
156// gobble removes the current token from the input.
157// Caller must call scan after calling gobble.
158func (s *scanner) gobble(e error) {
159	s.setError(e)
160	if s.start == 0 {
161		s.b = s.b[:+copy(s.b, s.b[s.next:])]
162		s.end = 0
163	} else {
164		s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
165		s.end = s.start - 1
166	}
167	s.next = s.start
168}
169
170// deleteRange removes the given range from s.b before the current token.
171func (s *scanner) deleteRange(start, end int) {
172	s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
173	diff := end - start
174	s.next -= diff
175	s.start -= diff
176	s.end -= diff
177}
178
179// scan parses the next token of a BCP 47 string.  Tokens that are larger
180// than 8 characters or include non-alphanumeric characters result in an error
181// and are gobbled and removed from the output.
182// It returns the end position of the last token consumed.
183func (s *scanner) scan() (end int) {
184	end = s.end
185	s.token = nil
186	for s.start = s.next; s.next < len(s.b); {
187		i := bytes.IndexByte(s.b[s.next:], '-')
188		if i == -1 {
189			s.end = len(s.b)
190			s.next = len(s.b)
191			i = s.end - s.start
192		} else {
193			s.end = s.next + i
194			s.next = s.end + 1
195		}
196		token := s.b[s.start:s.end]
197		if i < 1 || i > 8 || !isAlphaNum(token) {
198			s.gobble(ErrSyntax)
199			continue
200		}
201		s.token = token
202		return end
203	}
204	if n := len(s.b); n > 0 && s.b[n-1] == '-' {
205		s.setError(ErrSyntax)
206		s.b = s.b[:len(s.b)-1]
207	}
208	s.done = true
209	return end
210}
211
212// acceptMinSize parses multiple tokens of the given size or greater.
213// It returns the end position of the last token consumed.
214func (s *scanner) acceptMinSize(min int) (end int) {
215	end = s.end
216	s.scan()
217	for ; len(s.token) >= min; s.scan() {
218		end = s.end
219	}
220	return end
221}
222
223// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
224// failed it returns an error and any part of the tag that could be parsed.
225// If parsing succeeded but an unknown value was found, it returns
226// ValueError. The Tag returned in this case is just stripped of the unknown
227// value. All other values are preserved. It accepts tags in the BCP 47 format
228// and extensions to this standard defined in
229// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
230func Parse(s string) (t Tag, err error) {
231	// TODO: consider supporting old-style locale key-value pairs.
232	if s == "" {
233		return Und, ErrSyntax
234	}
235	defer func() {
236		if recover() != nil {
237			t = Und
238			err = ErrSyntax
239			return
240		}
241	}()
242	if len(s) <= maxAltTaglen {
243		b := [maxAltTaglen]byte{}
244		for i, c := range s {
245			// Generating invalid UTF-8 is okay as it won't match.
246			if 'A' <= c && c <= 'Z' {
247				c += 'a' - 'A'
248			} else if c == '_' {
249				c = '-'
250			}
251			b[i] = byte(c)
252		}
253		if t, ok := grandfathered(b); ok {
254			return t, nil
255		}
256	}
257	scan := makeScannerString(s)
258	return parse(&scan, s)
259}
260
261func parse(scan *scanner, s string) (t Tag, err error) {
262	t = Und
263	var end int
264	if n := len(scan.token); n <= 1 {
265		scan.toLower(0, len(scan.b))
266		if n == 0 || scan.token[0] != 'x' {
267			return t, ErrSyntax
268		}
269		end = parseExtensions(scan)
270	} else if n >= 4 {
271		return Und, ErrSyntax
272	} else { // the usual case
273		t, end = parseTag(scan, true)
274		if n := len(scan.token); n == 1 {
275			t.pExt = uint16(end)
276			end = parseExtensions(scan)
277		} else if end < len(scan.b) {
278			scan.setError(ErrSyntax)
279			scan.b = scan.b[:end]
280		}
281	}
282	if int(t.pVariant) < len(scan.b) {
283		if end < len(s) {
284			s = s[:end]
285		}
286		if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
287			t.str = s
288		} else {
289			t.str = string(scan.b)
290		}
291	} else {
292		t.pVariant, t.pExt = 0, 0
293	}
294	return t, scan.err
295}
296
297// parseTag parses language, script, region and variants.
298// It returns a Tag and the end position in the input that was parsed.
299// If doNorm is true, then <lang>-<extlang> will be normalized to <extlang>.
300func parseTag(scan *scanner, doNorm bool) (t Tag, end int) {
301	var e error
302	// TODO: set an error if an unknown lang, script or region is encountered.
303	t.LangID, e = getLangID(scan.token)
304	scan.setError(e)
305	scan.replace(t.LangID.String())
306	langStart := scan.start
307	end = scan.scan()
308	for len(scan.token) == 3 && isAlpha(scan.token[0]) {
309		// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
310		// to a tag of the form <extlang>.
311		if doNorm {
312			lang, e := getLangID(scan.token)
313			if lang != 0 {
314				t.LangID = lang
315				langStr := lang.String()
316				copy(scan.b[langStart:], langStr)
317				scan.b[langStart+len(langStr)] = '-'
318				scan.start = langStart + len(langStr) + 1
319			}
320			scan.gobble(e)
321		}
322		end = scan.scan()
323	}
324	if len(scan.token) == 4 && isAlpha(scan.token[0]) {
325		t.ScriptID, e = getScriptID(script, scan.token)
326		if t.ScriptID == 0 {
327			scan.gobble(e)
328		}
329		end = scan.scan()
330	}
331	if n := len(scan.token); n >= 2 && n <= 3 {
332		t.RegionID, e = getRegionID(scan.token)
333		if t.RegionID == 0 {
334			scan.gobble(e)
335		} else {
336			scan.replace(t.RegionID.String())
337		}
338		end = scan.scan()
339	}
340	scan.toLower(scan.start, len(scan.b))
341	t.pVariant = byte(end)
342	end = parseVariants(scan, end, t)
343	t.pExt = uint16(end)
344	return t, end
345}
346
347var separator = []byte{'-'}
348
349// parseVariants scans tokens as long as each token is a valid variant string.
350// Duplicate variants are removed.
351func parseVariants(scan *scanner, end int, t Tag) int {
352	start := scan.start
353	varIDBuf := [4]uint8{}
354	variantBuf := [4][]byte{}
355	varID := varIDBuf[:0]
356	variant := variantBuf[:0]
357	last := -1
358	needSort := false
359	for ; len(scan.token) >= 4; scan.scan() {
360		// TODO: measure the impact of needing this conversion and redesign
361		// the data structure if there is an issue.
362		v, ok := variantIndex[string(scan.token)]
363		if !ok {
364			// unknown variant
365			// TODO: allow user-defined variants?
366			scan.gobble(NewValueError(scan.token))
367			continue
368		}
369		varID = append(varID, v)
370		variant = append(variant, scan.token)
371		if !needSort {
372			if last < int(v) {
373				last = int(v)
374			} else {
375				needSort = true
376				// There is no legal combinations of more than 7 variants
377				// (and this is by no means a useful sequence).
378				const maxVariants = 8
379				if len(varID) > maxVariants {
380					break
381				}
382			}
383		}
384		end = scan.end
385	}
386	if needSort {
387		sort.Sort(variantsSort{varID, variant})
388		k, l := 0, -1
389		for i, v := range varID {
390			w := int(v)
391			if l == w {
392				// Remove duplicates.
393				continue
394			}
395			varID[k] = varID[i]
396			variant[k] = variant[i]
397			k++
398			l = w
399		}
400		if str := bytes.Join(variant[:k], separator); len(str) == 0 {
401			end = start - 1
402		} else {
403			scan.resizeRange(start, end, len(str))
404			copy(scan.b[scan.start:], str)
405			end = scan.end
406		}
407	}
408	return end
409}
410
411type variantsSort struct {
412	i []uint8
413	v [][]byte
414}
415
416func (s variantsSort) Len() int {
417	return len(s.i)
418}
419
420func (s variantsSort) Swap(i, j int) {
421	s.i[i], s.i[j] = s.i[j], s.i[i]
422	s.v[i], s.v[j] = s.v[j], s.v[i]
423}
424
425func (s variantsSort) Less(i, j int) bool {
426	return s.i[i] < s.i[j]
427}
428
429type bytesSort struct {
430	b [][]byte
431	n int // first n bytes to compare
432}
433
434func (b bytesSort) Len() int {
435	return len(b.b)
436}
437
438func (b bytesSort) Swap(i, j int) {
439	b.b[i], b.b[j] = b.b[j], b.b[i]
440}
441
442func (b bytesSort) Less(i, j int) bool {
443	for k := 0; k < b.n; k++ {
444		if b.b[i][k] == b.b[j][k] {
445			continue
446		}
447		return b.b[i][k] < b.b[j][k]
448	}
449	return false
450}
451
452// parseExtensions parses and normalizes the extensions in the buffer.
453// It returns the last position of scan.b that is part of any extension.
454// It also trims scan.b to remove excess parts accordingly.
455func parseExtensions(scan *scanner) int {
456	start := scan.start
457	exts := [][]byte{}
458	private := []byte{}
459	end := scan.end
460	for len(scan.token) == 1 {
461		extStart := scan.start
462		ext := scan.token[0]
463		end = parseExtension(scan)
464		extension := scan.b[extStart:end]
465		if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
466			scan.setError(ErrSyntax)
467			end = extStart
468			continue
469		} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
470			scan.b = scan.b[:end]
471			return end
472		} else if ext == 'x' {
473			private = extension
474			break
475		}
476		exts = append(exts, extension)
477	}
478	sort.Sort(bytesSort{exts, 1})
479	if len(private) > 0 {
480		exts = append(exts, private)
481	}
482	scan.b = scan.b[:start]
483	if len(exts) > 0 {
484		scan.b = append(scan.b, bytes.Join(exts, separator)...)
485	} else if start > 0 {
486		// Strip trailing '-'.
487		scan.b = scan.b[:start-1]
488	}
489	return end
490}
491
492// parseExtension parses a single extension and returns the position of
493// the extension end.
494func parseExtension(scan *scanner) int {
495	start, end := scan.start, scan.end
496	switch scan.token[0] {
497	case 'u': // https://www.ietf.org/rfc/rfc6067.txt
498		attrStart := end
499		scan.scan()
500		for last := []byte{}; len(scan.token) > 2; scan.scan() {
501			if bytes.Compare(scan.token, last) != -1 {
502				// Attributes are unsorted. Start over from scratch.
503				p := attrStart + 1
504				scan.next = p
505				attrs := [][]byte{}
506				for scan.scan(); len(scan.token) > 2; scan.scan() {
507					attrs = append(attrs, scan.token)
508					end = scan.end
509				}
510				sort.Sort(bytesSort{attrs, 3})
511				copy(scan.b[p:], bytes.Join(attrs, separator))
512				break
513			}
514			last = scan.token
515			end = scan.end
516		}
517		// Scan key-type sequences. A key is of length 2 and may be followed
518		// by 0 or more "type" subtags from 3 to the maximum of 8 letters.
519		var last, key []byte
520		for attrEnd := end; len(scan.token) == 2; last = key {
521			key = scan.token
522			end = scan.end
523			for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
524				end = scan.end
525			}
526			// TODO: check key value validity
527			if bytes.Compare(key, last) != 1 || scan.err != nil {
528				// We have an invalid key or the keys are not sorted.
529				// Start scanning keys from scratch and reorder.
530				p := attrEnd + 1
531				scan.next = p
532				keys := [][]byte{}
533				for scan.scan(); len(scan.token) == 2; {
534					keyStart := scan.start
535					end = scan.end
536					for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
537						end = scan.end
538					}
539					keys = append(keys, scan.b[keyStart:end])
540				}
541				sort.Stable(bytesSort{keys, 2})
542				if n := len(keys); n > 0 {
543					k := 0
544					for i := 1; i < n; i++ {
545						if !bytes.Equal(keys[k][:2], keys[i][:2]) {
546							k++
547							keys[k] = keys[i]
548						} else if !bytes.Equal(keys[k], keys[i]) {
549							scan.setError(ErrDuplicateKey)
550						}
551					}
552					keys = keys[:k+1]
553				}
554				reordered := bytes.Join(keys, separator)
555				if e := p + len(reordered); e < end {
556					scan.deleteRange(e, end)
557					end = e
558				}
559				copy(scan.b[p:], reordered)
560				break
561			}
562		}
563	case 't': // https://www.ietf.org/rfc/rfc6497.txt
564		scan.scan()
565		if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
566			_, end = parseTag(scan, false)
567			scan.toLower(start, end)
568		}
569		for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
570			end = scan.acceptMinSize(3)
571		}
572	case 'x':
573		end = scan.acceptMinSize(1)
574	default:
575		end = scan.acceptMinSize(2)
576	}
577	return end
578}
579
580// getExtension returns the name, body and end position of the extension.
581func getExtension(s string, p int) (end int, ext string) {
582	if s[p] == '-' {
583		p++
584	}
585	if s[p] == 'x' {
586		return len(s), s[p:]
587	}
588	end = nextExtension(s, p)
589	return end, s[p:end]
590}
591
592// nextExtension finds the next extension within the string, searching
593// for the -<char>- pattern from position p.
594// In the fast majority of cases, language tags will have at most
595// one extension and extensions tend to be small.
596func nextExtension(s string, p int) int {
597	for n := len(s) - 3; p < n; {
598		if s[p] == '-' {
599			if s[p+2] == '-' {
600				return p
601			}
602			p += 3
603		} else {
604			p++
605		}
606	}
607	return len(s)
608}