bidirule.go

  1// Copyright 2016 The Go Authors. All rights reserved.
  2// Use of this source code is governed by a BSD-style
  3// license that can be found in the LICENSE file.
  4
  5// Package bidirule implements the Bidi Rule defined by RFC 5893.
  6//
  7// This package is under development. The API may change without notice and
  8// without preserving backward compatibility.
  9package bidirule
 10
 11import (
 12	"errors"
 13	"unicode/utf8"
 14
 15	"golang.org/x/text/transform"
 16	"golang.org/x/text/unicode/bidi"
 17)
 18
 19// This file contains an implementation of RFC 5893: Right-to-Left Scripts for
 20// Internationalized Domain Names for Applications (IDNA)
 21//
 22// A label is an individual component of a domain name.  Labels are usually
 23// shown separated by dots; for example, the domain name "www.example.com" is
 24// composed of three labels: "www", "example", and "com".
 25//
 26// An RTL label is a label that contains at least one character of class R, AL,
 27// or AN. An LTR label is any label that is not an RTL label.
 28//
 29// A "Bidi domain name" is a domain name that contains at least one RTL label.
 30//
 31//  The following guarantees can be made based on the above:
 32//
 33//  o  In a domain name consisting of only labels that satisfy the rule,
 34//     the requirements of Section 3 are satisfied.  Note that even LTR
 35//     labels and pure ASCII labels have to be tested.
 36//
 37//  o  In a domain name consisting of only LDH labels (as defined in the
 38//     Definitions document [RFC5890]) and labels that satisfy the rule,
 39//     the requirements of Section 3 are satisfied as long as a label
 40//     that starts with an ASCII digit does not come after a
 41//     right-to-left label.
 42//
 43//  No guarantee is given for other combinations.
 44
 45// ErrInvalid indicates a label is invalid according to the Bidi Rule.
 46var ErrInvalid = errors.New("bidirule: failed Bidi Rule")
 47
 48type ruleState uint8
 49
 50const (
 51	ruleInitial ruleState = iota
 52	ruleLTR
 53	ruleLTRFinal
 54	ruleRTL
 55	ruleRTLFinal
 56	ruleInvalid
 57)
 58
 59type ruleTransition struct {
 60	next ruleState
 61	mask uint16
 62}
 63
 64var transitions = [...][2]ruleTransition{
 65	// [2.1] The first character must be a character with Bidi property L, R, or
 66	// AL. If it has the R or AL property, it is an RTL label; if it has the L
 67	// property, it is an LTR label.
 68	ruleInitial: {
 69		{ruleLTRFinal, 1 << bidi.L},
 70		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL},
 71	},
 72	ruleRTL: {
 73		// [2.3] In an RTL label, the end of the label must be a character with
 74		// Bidi property R, AL, EN, or AN, followed by zero or more characters
 75		// with Bidi property NSM.
 76		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN},
 77
 78		// [2.2] In an RTL label, only characters with the Bidi properties R,
 79		// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
 80		// We exclude the entries from [2.3]
 81		{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
 82	},
 83	ruleRTLFinal: {
 84		// [2.3] In an RTL label, the end of the label must be a character with
 85		// Bidi property R, AL, EN, or AN, followed by zero or more characters
 86		// with Bidi property NSM.
 87		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM},
 88
 89		// [2.2] In an RTL label, only characters with the Bidi properties R,
 90		// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
 91		// We exclude the entries from [2.3] and NSM.
 92		{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
 93	},
 94	ruleLTR: {
 95		// [2.6] In an LTR label, the end of the label must be a character with
 96		// Bidi property L or EN, followed by zero or more characters with Bidi
 97		// property NSM.
 98		{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN},
 99
100		// [2.5] In an LTR label, only characters with the Bidi properties L,
101		// EN, ES, CS, ET, ON, BN, or NSM are allowed.
102		// We exclude the entries from [2.6].
103		{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
104	},
105	ruleLTRFinal: {
106		// [2.6] In an LTR label, the end of the label must be a character with
107		// Bidi property L or EN, followed by zero or more characters with Bidi
108		// property NSM.
109		{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM},
110
111		// [2.5] In an LTR label, only characters with the Bidi properties L,
112		// EN, ES, CS, ET, ON, BN, or NSM are allowed.
113		// We exclude the entries from [2.6].
114		{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
115	},
116	ruleInvalid: {
117		{ruleInvalid, 0},
118		{ruleInvalid, 0},
119	},
120}
121
122// [2.4] In an RTL label, if an EN is present, no AN may be present, and
123// vice versa.
124const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN)
125
126// From RFC 5893
127// An RTL label is a label that contains at least one character of type
128// R, AL, or AN.
129//
130// An LTR label is any label that is not an RTL label.
131
132// Direction reports the direction of the given label as defined by RFC 5893.
133// The Bidi Rule does not have to be applied to labels of the category
134// LeftToRight.
135func Direction(b []byte) bidi.Direction {
136	for i := 0; i < len(b); {
137		e, sz := bidi.Lookup(b[i:])
138		if sz == 0 {
139			i++
140		}
141		c := e.Class()
142		if c == bidi.R || c == bidi.AL || c == bidi.AN {
143			return bidi.RightToLeft
144		}
145		i += sz
146	}
147	return bidi.LeftToRight
148}
149
150// DirectionString reports the direction of the given label as defined by RFC
151// 5893. The Bidi Rule does not have to be applied to labels of the category
152// LeftToRight.
153func DirectionString(s string) bidi.Direction {
154	for i := 0; i < len(s); {
155		e, sz := bidi.LookupString(s[i:])
156		if sz == 0 {
157			i++
158			continue
159		}
160		c := e.Class()
161		if c == bidi.R || c == bidi.AL || c == bidi.AN {
162			return bidi.RightToLeft
163		}
164		i += sz
165	}
166	return bidi.LeftToRight
167}
168
169// Valid reports whether b conforms to the BiDi rule.
170func Valid(b []byte) bool {
171	var t Transformer
172	if n, ok := t.advance(b); !ok || n < len(b) {
173		return false
174	}
175	return t.isFinal()
176}
177
178// ValidString reports whether s conforms to the BiDi rule.
179func ValidString(s string) bool {
180	var t Transformer
181	if n, ok := t.advanceString(s); !ok || n < len(s) {
182		return false
183	}
184	return t.isFinal()
185}
186
187// New returns a Transformer that verifies that input adheres to the Bidi Rule.
188func New() *Transformer {
189	return &Transformer{}
190}
191
192// Transformer implements transform.Transform.
193type Transformer struct {
194	state  ruleState
195	hasRTL bool
196	seen   uint16
197}
198
199// A rule can only be violated for "Bidi Domain names", meaning if one of the
200// following categories has been observed.
201func (t *Transformer) isRTL() bool {
202	const isRTL = 1<<bidi.R | 1<<bidi.AL | 1<<bidi.AN
203	return t.seen&isRTL != 0
204}
205
206// Reset implements transform.Transformer.
207func (t *Transformer) Reset() { *t = Transformer{} }
208
209// Transform implements transform.Transformer. This Transformer has state and
210// needs to be reset between uses.
211func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
212	if len(dst) < len(src) {
213		src = src[:len(dst)]
214		atEOF = false
215		err = transform.ErrShortDst
216	}
217	n, err1 := t.Span(src, atEOF)
218	copy(dst, src[:n])
219	if err == nil || err1 != nil && err1 != transform.ErrShortSrc {
220		err = err1
221	}
222	return n, n, err
223}
224
225// Span returns the first n bytes of src that conform to the Bidi rule.
226func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) {
227	if t.state == ruleInvalid && t.isRTL() {
228		return 0, ErrInvalid
229	}
230	n, ok := t.advance(src)
231	switch {
232	case !ok:
233		err = ErrInvalid
234	case n < len(src):
235		if !atEOF {
236			err = transform.ErrShortSrc
237			break
238		}
239		err = ErrInvalid
240	case !t.isFinal():
241		err = ErrInvalid
242	}
243	return n, err
244}
245
246// Precomputing the ASCII values decreases running time for the ASCII fast path
247// by about 30%.
248var asciiTable [128]bidi.Properties
249
250func init() {
251	for i := range asciiTable {
252		p, _ := bidi.LookupRune(rune(i))
253		asciiTable[i] = p
254	}
255}
256
257func (t *Transformer) advance(s []byte) (n int, ok bool) {
258	var e bidi.Properties
259	var sz int
260	for n < len(s) {
261		if s[n] < utf8.RuneSelf {
262			e, sz = asciiTable[s[n]], 1
263		} else {
264			e, sz = bidi.Lookup(s[n:])
265			if sz <= 1 {
266				if sz == 1 {
267					// We always consider invalid UTF-8 to be invalid, even if
268					// the string has not yet been determined to be RTL.
269					// TODO: is this correct?
270					return n, false
271				}
272				return n, true // incomplete UTF-8 encoding
273			}
274		}
275		// TODO: using CompactClass would result in noticeable speedup.
276		// See unicode/bidi/prop.go:Properties.CompactClass.
277		c := uint16(1 << e.Class())
278		t.seen |= c
279		if t.seen&exclusiveRTL == exclusiveRTL {
280			t.state = ruleInvalid
281			return n, false
282		}
283		switch tr := transitions[t.state]; {
284		case tr[0].mask&c != 0:
285			t.state = tr[0].next
286		case tr[1].mask&c != 0:
287			t.state = tr[1].next
288		default:
289			t.state = ruleInvalid
290			if t.isRTL() {
291				return n, false
292			}
293		}
294		n += sz
295	}
296	return n, true
297}
298
299func (t *Transformer) advanceString(s string) (n int, ok bool) {
300	var e bidi.Properties
301	var sz int
302	for n < len(s) {
303		if s[n] < utf8.RuneSelf {
304			e, sz = asciiTable[s[n]], 1
305		} else {
306			e, sz = bidi.LookupString(s[n:])
307			if sz <= 1 {
308				if sz == 1 {
309					return n, false // invalid UTF-8
310				}
311				return n, true // incomplete UTF-8 encoding
312			}
313		}
314		// TODO: using CompactClass results in noticeable speedup.
315		// See unicode/bidi/prop.go:Properties.CompactClass.
316		c := uint16(1 << e.Class())
317		t.seen |= c
318		if t.seen&exclusiveRTL == exclusiveRTL {
319			t.state = ruleInvalid
320			return n, false
321		}
322		switch tr := transitions[t.state]; {
323		case tr[0].mask&c != 0:
324			t.state = tr[0].next
325		case tr[1].mask&c != 0:
326			t.state = tr[1].next
327		default:
328			t.state = ruleInvalid
329			if t.isRTL() {
330				return n, false
331			}
332		}
333		n += sz
334	}
335	return n, true
336}