prop.go

  1// Copyright 2016 The Go Authors. All rights reserved.
  2// Use of this source code is governed by a BSD-style
  3// license that can be found in the LICENSE file.
  4
  5package bidi
  6
  7import "unicode/utf8"
  8
  9// Properties provides access to BiDi properties of runes.
 10type Properties struct {
 11	entry uint8
 12	last  uint8
 13}
 14
// trie is the package-level lookup structure shared by Lookup and
// LookupString; they call its lookupValue method with the final byte of a
// multi-byte encoding to obtain the entry for a rune.
// NOTE(review): the meaning of the 0 argument to newBidiTrie is not visible
// from this file — confirm against the generated trie code.
var trie = newBidiTrie(0)
 16
 17// TODO: using this for bidirule reduces the running time by about 5%. Consider
 18// if this is worth exposing or if we can find a way to speed up the Class
 19// method.
 20//
 21// // CompactClass is like Class, but maps all of the BiDi control classes
 22// // (LRO, RLO, LRE, RLE, PDF, LRI, RLI, FSI, PDI) to the class Control.
 23// func (p Properties) CompactClass() Class {
 24// 	return Class(p.entry & 0x0F)
 25// }
 26
 27// Class returns the Bidi class for p.
 28func (p Properties) Class() Class {
 29	c := Class(p.entry & 0x0F)
 30	if c == Control {
 31		c = controlByteToClass[p.last&0xF]
 32	}
 33	return c
 34}
 35
 36// IsBracket reports whether the rune is a bracket.
 37func (p Properties) IsBracket() bool { return p.entry&0xF0 != 0 }
 38
 39// IsOpeningBracket reports whether the rune is an opening bracket.
 40// IsBracket must return true.
 41func (p Properties) IsOpeningBracket() bool { return p.entry&openMask != 0 }
 42
 43// TODO: find a better API and expose.
 44func (p Properties) reverseBracket(r rune) rune {
 45	return xorMasks[p.entry>>xorMaskShift] ^ r
 46}
 47
 48var controlByteToClass = [16]Class{
 49	0xD: LRO, // U+202D LeftToRightOverride,
 50	0xE: RLO, // U+202E RightToLeftOverride,
 51	0xA: LRE, // U+202A LeftToRightEmbedding,
 52	0xB: RLE, // U+202B RightToLeftEmbedding,
 53	0xC: PDF, // U+202C PopDirectionalFormat,
 54	0x6: LRI, // U+2066 LeftToRightIsolate,
 55	0x7: RLI, // U+2067 RightToLeftIsolate,
 56	0x8: FSI, // U+2068 FirstStrongIsolate,
 57	0x9: PDI, // U+2069 PopDirectionalIsolate,
 58}
 59
 60// LookupRune returns properties for r.
 61func LookupRune(r rune) (p Properties, size int) {
 62	var buf [4]byte
 63	n := utf8.EncodeRune(buf[:], r)
 64	return Lookup(buf[:n])
 65}
 66
 67// TODO: these lookup methods are based on the generated trie code. The returned
 68// sizes have slightly different semantics from the generated code, in that it
 69// always returns size==1 for an illegal UTF-8 byte (instead of the length
 70// of the maximum invalid subsequence). Most Transformers, like unicode/norm,
 71// leave invalid UTF-8 untouched, in which case it has performance benefits to
 72// do so (without changing the semantics). Bidi requires the semantics used here
 73// for the bidirule implementation to be compatible with the Go semantics.
// They should perhaps ultimately be adopted by all trie implementations, for
// convenience's sake.
 76// This unrolled code also boosts performance of the secure/bidirule package by
 77// about 30%.
 78// So, to remove this code:
 79//   - add option to trie generator to define return type.
 80//   - always return 1 byte size for ill-formed UTF-8 runes.
 81
 82// Lookup returns properties for the first rune in s and the width in bytes of
 83// its encoding. The size will be 0 if s does not hold enough bytes to complete
 84// the encoding.
 85func Lookup(s []byte) (p Properties, sz int) {
 86	c0 := s[0]
 87	switch {
 88	case c0 < 0x80: // is ASCII
 89		return Properties{entry: bidiValues[c0]}, 1
 90	case c0 < 0xC2:
 91		return Properties{}, 1
 92	case c0 < 0xE0: // 2-byte UTF-8
 93		if len(s) < 2 {
 94			return Properties{}, 0
 95		}
 96		i := bidiIndex[c0]
 97		c1 := s[1]
 98		if c1 < 0x80 || 0xC0 <= c1 {
 99			return Properties{}, 1
100		}
101		return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2
102	case c0 < 0xF0: // 3-byte UTF-8
103		if len(s) < 3 {
104			return Properties{}, 0
105		}
106		i := bidiIndex[c0]
107		c1 := s[1]
108		if c1 < 0x80 || 0xC0 <= c1 {
109			return Properties{}, 1
110		}
111		o := uint32(i)<<6 + uint32(c1)
112		i = bidiIndex[o]
113		c2 := s[2]
114		if c2 < 0x80 || 0xC0 <= c2 {
115			return Properties{}, 1
116		}
117		return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3
118	case c0 < 0xF8: // 4-byte UTF-8
119		if len(s) < 4 {
120			return Properties{}, 0
121		}
122		i := bidiIndex[c0]
123		c1 := s[1]
124		if c1 < 0x80 || 0xC0 <= c1 {
125			return Properties{}, 1
126		}
127		o := uint32(i)<<6 + uint32(c1)
128		i = bidiIndex[o]
129		c2 := s[2]
130		if c2 < 0x80 || 0xC0 <= c2 {
131			return Properties{}, 1
132		}
133		o = uint32(i)<<6 + uint32(c2)
134		i = bidiIndex[o]
135		c3 := s[3]
136		if c3 < 0x80 || 0xC0 <= c3 {
137			return Properties{}, 1
138		}
139		return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4
140	}
141	// Illegal rune
142	return Properties{}, 1
143}
144
145// LookupString returns properties for the first rune in s and the width in
146// bytes of its encoding. The size will be 0 if s does not hold enough bytes to
147// complete the encoding.
148func LookupString(s string) (p Properties, sz int) {
149	c0 := s[0]
150	switch {
151	case c0 < 0x80: // is ASCII
152		return Properties{entry: bidiValues[c0]}, 1
153	case c0 < 0xC2:
154		return Properties{}, 1
155	case c0 < 0xE0: // 2-byte UTF-8
156		if len(s) < 2 {
157			return Properties{}, 0
158		}
159		i := bidiIndex[c0]
160		c1 := s[1]
161		if c1 < 0x80 || 0xC0 <= c1 {
162			return Properties{}, 1
163		}
164		return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2
165	case c0 < 0xF0: // 3-byte UTF-8
166		if len(s) < 3 {
167			return Properties{}, 0
168		}
169		i := bidiIndex[c0]
170		c1 := s[1]
171		if c1 < 0x80 || 0xC0 <= c1 {
172			return Properties{}, 1
173		}
174		o := uint32(i)<<6 + uint32(c1)
175		i = bidiIndex[o]
176		c2 := s[2]
177		if c2 < 0x80 || 0xC0 <= c2 {
178			return Properties{}, 1
179		}
180		return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3
181	case c0 < 0xF8: // 4-byte UTF-8
182		if len(s) < 4 {
183			return Properties{}, 0
184		}
185		i := bidiIndex[c0]
186		c1 := s[1]
187		if c1 < 0x80 || 0xC0 <= c1 {
188			return Properties{}, 1
189		}
190		o := uint32(i)<<6 + uint32(c1)
191		i = bidiIndex[o]
192		c2 := s[2]
193		if c2 < 0x80 || 0xC0 <= c2 {
194			return Properties{}, 1
195		}
196		o = uint32(i)<<6 + uint32(c2)
197		i = bidiIndex[o]
198		c3 := s[3]
199		if c3 < 0x80 || 0xC0 <= c3 {
200			return Properties{}, 1
201		}
202		return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4
203	}
204	// Illegal rune
205	return Properties{}, 1
206}