parse.go

  1// Copyright 2013 The Go Authors. All rights reserved.
  2// Use of this source code is governed by a BSD-style
  3// license that can be found in the LICENSE file.
  4
  5package language
  6
  7import (
  8	"errors"
  9	"sort"
 10	"strconv"
 11	"strings"
 12
 13	"golang.org/x/text/internal/language"
 14)
 15
 16// ValueError is returned by any of the parsing functions when the
 17// input is well-formed but the respective subtag is not recognized
 18// as a valid value.
 19type ValueError interface {
 20	error
 21
 22	// Subtag returns the subtag for which the error occurred.
 23	Subtag() string
 24}
 25
 26// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
 27// failed it returns an error and any part of the tag that could be parsed.
 28// If parsing succeeded but an unknown value was found, it returns
 29// ValueError. The Tag returned in this case is just stripped of the unknown
 30// value. All other values are preserved. It accepts tags in the BCP 47 format
 31// and extensions to this standard defined in
 32// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
 33// The resulting tag is canonicalized using the default canonicalization type.
 34func Parse(s string) (t Tag, err error) {
 35	return Default.Parse(s)
 36}
 37
 38// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
 39// failed it returns an error and any part of the tag that could be parsed.
 40// If parsing succeeded but an unknown value was found, it returns
 41// ValueError. The Tag returned in this case is just stripped of the unknown
 42// value. All other values are preserved. It accepts tags in the BCP 47 format
 43// and extensions to this standard defined in
 44// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
 45// The resulting tag is canonicalized using the canonicalization type c.
 46func (c CanonType) Parse(s string) (t Tag, err error) {
 47	defer func() {
 48		if recover() != nil {
 49			t = Tag{}
 50			err = language.ErrSyntax
 51		}
 52	}()
 53
 54	tt, err := language.Parse(s)
 55	if err != nil {
 56		return makeTag(tt), err
 57	}
 58	tt, changed := canonicalize(c, tt)
 59	if changed {
 60		tt.RemakeString()
 61	}
 62	return makeTag(tt), nil
 63}
 64
 65// Compose creates a Tag from individual parts, which may be of type Tag, Base,
 66// Script, Region, Variant, []Variant, Extension, []Extension or error. If a
 67// Base, Script or Region or slice of type Variant or Extension is passed more
 68// than once, the latter will overwrite the former. Variants and Extensions are
 69// accumulated, but if two extensions of the same type are passed, the latter
 70// will replace the former. For -u extensions, though, the key-type pairs are
 71// added, where later values overwrite older ones. A Tag overwrites all former
 72// values and typically only makes sense as the first argument. The resulting
 73// tag is returned after canonicalizing using the Default CanonType. If one or
 74// more errors are encountered, one of the errors is returned.
 75func Compose(part ...interface{}) (t Tag, err error) {
 76	return Default.Compose(part...)
 77}
 78
 79// Compose creates a Tag from individual parts, which may be of type Tag, Base,
 80// Script, Region, Variant, []Variant, Extension, []Extension or error. If a
 81// Base, Script or Region or slice of type Variant or Extension is passed more
 82// than once, the latter will overwrite the former. Variants and Extensions are
 83// accumulated, but if two extensions of the same type are passed, the latter
 84// will replace the former. For -u extensions, though, the key-type pairs are
 85// added, where later values overwrite older ones. A Tag overwrites all former
 86// values and typically only makes sense as the first argument. The resulting
 87// tag is returned after canonicalizing using CanonType c. If one or more errors
 88// are encountered, one of the errors is returned.
 89func (c CanonType) Compose(part ...interface{}) (t Tag, err error) {
 90	defer func() {
 91		if recover() != nil {
 92			t = Tag{}
 93			err = language.ErrSyntax
 94		}
 95	}()
 96
 97	var b language.Builder
 98	if err = update(&b, part...); err != nil {
 99		return und, err
100	}
101	b.Tag, _ = canonicalize(c, b.Tag)
102	return makeTag(b.Make()), err
103}
104
105var errInvalidArgument = errors.New("invalid Extension or Variant")
106
107func update(b *language.Builder, part ...interface{}) (err error) {
108	for _, x := range part {
109		switch v := x.(type) {
110		case Tag:
111			b.SetTag(v.tag())
112		case Base:
113			b.Tag.LangID = v.langID
114		case Script:
115			b.Tag.ScriptID = v.scriptID
116		case Region:
117			b.Tag.RegionID = v.regionID
118		case Variant:
119			if v.variant == "" {
120				err = errInvalidArgument
121				break
122			}
123			b.AddVariant(v.variant)
124		case Extension:
125			if v.s == "" {
126				err = errInvalidArgument
127				break
128			}
129			b.SetExt(v.s)
130		case []Variant:
131			b.ClearVariants()
132			for _, v := range v {
133				b.AddVariant(v.variant)
134			}
135		case []Extension:
136			b.ClearExtensions()
137			for _, e := range v {
138				b.SetExt(e.s)
139			}
140		// TODO: support parsing of raw strings based on morphology or just extensions?
141		case error:
142			if v != nil {
143				err = v
144			}
145		}
146	}
147	return
148}
149
150var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight")
151var errTagListTooLarge = errors.New("tag list exceeds max length")
152
153// ParseAcceptLanguage parses the contents of an Accept-Language header as
154// defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and
155// a list of corresponding quality weights. It is more permissive than RFC 2616
156// and may return non-nil slices even if the input is not valid.
157// The Tags will be sorted by highest weight first and then by first occurrence.
158// Tags with a weight of zero will be dropped. An error will be returned if the
159// input could not be parsed.
160func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) {
161	defer func() {
162		if recover() != nil {
163			tag = nil
164			q = nil
165			err = language.ErrSyntax
166		}
167	}()
168
169	if strings.Count(s, "-") > 1000 {
170		return nil, nil, errTagListTooLarge
171	}
172
173	var entry string
174	for s != "" {
175		if entry, s = split(s, ','); entry == "" {
176			continue
177		}
178
179		entry, weight := split(entry, ';')
180
181		// Scan the language.
182		t, err := Parse(entry)
183		if err != nil {
184			id, ok := acceptFallback[entry]
185			if !ok {
186				return nil, nil, err
187			}
188			t = makeTag(language.Tag{LangID: id})
189		}
190
191		// Scan the optional weight.
192		w := 1.0
193		if weight != "" {
194			weight = consume(weight, 'q')
195			weight = consume(weight, '=')
196			// consume returns the empty string when a token could not be
197			// consumed, resulting in an error for ParseFloat.
198			if w, err = strconv.ParseFloat(weight, 32); err != nil {
199				return nil, nil, errInvalidWeight
200			}
201			// Drop tags with a quality weight of 0.
202			if w <= 0 {
203				continue
204			}
205		}
206
207		tag = append(tag, t)
208		q = append(q, float32(w))
209	}
210	sort.Stable(&tagSort{tag, q})
211	return tag, q, nil
212}
213
// consume strips the leading token c from s and returns what follows with
// surrounding white space trimmed. If s does not start with c, including when
// s is empty, it returns the empty string.
func consume(s string, c byte) string {
	if len(s) > 0 && s[0] == c {
		rest := s[1:]
		return strings.TrimSpace(rest)
	}
	return ""
}
222
// split cuts s at the first occurrence of c and returns the text before and
// after it, each with surrounding white space trimmed. If c does not occur in
// s, head is the trimmed s and tail is empty.
func split(s string, c byte) (head, tail string) {
	idx := strings.IndexByte(s, c)
	if idx < 0 {
		return strings.TrimSpace(s), ""
	}
	head = strings.TrimSpace(s[:idx])
	tail = strings.TrimSpace(s[idx+1:])
	return head, tail
}
229
// acceptFallback is a hack mapping that deals with a small number of
// non-standard language names that occur in Accept-Language headers (with
// reasonable frequency). ParseAcceptLanguage looks an entry up here when the
// entry cannot be parsed as a tag; the lookup is an exact, case-sensitive
// match on the key.
var acceptFallback = map[string]language.Language{
	"english": _en,
	"deutsch": _de,
	"italian": _it,
	"french":  _fr,
	"*":       _mul, // defined in the spec to match all languages.
}
239
240type tagSort struct {
241	tag []Tag
242	q   []float32
243}
244
245func (s *tagSort) Len() int {
246	return len(s.q)
247}
248
249func (s *tagSort) Less(i, j int) bool {
250	return s.q[i] > s.q[j]
251}
252
253func (s *tagSort) Swap(i, j int) {
254	s.tag[i], s.tag[j] = s.tag[j], s.tag[i]
255	s.q[i], s.q[j] = s.q[j], s.q[i]
256}