runes.go

  1// Copyright 2014 The Go Authors. All rights reserved.
  2// Use of this source code is governed by a BSD-style
  3// license that can be found in the LICENSE file.
  4
// Package runes provides transforms for UTF-8 encoded text.
  6package runes // import "golang.org/x/text/runes"
  7
  8import (
  9	"unicode"
 10	"unicode/utf8"
 11
 12	"golang.org/x/text/transform"
 13)
 14
// A Set is a collection of runes.
//
// Membership testing is the only operation the interface offers. Within this
// file, Sets are produced by In, NotIn and Predicate and consumed by Remove.
type Set interface {
	// Contains returns true if r is contained in the set.
	Contains(r rune) bool
}
 20
 21type setFunc func(rune) bool
 22
 23func (s setFunc) Contains(r rune) bool {
 24	return s(r)
 25}
 26
// Note: using funcs here instead of wrapping types results in cleaner
// documentation and a smaller API.
 29
 30// In creates a Set with a Contains method that returns true for all runes in
 31// the given RangeTable.
 32func In(rt *unicode.RangeTable) Set {
 33	return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
 34}
 35
 36// In creates a Set with a Contains method that returns true for all runes not
 37// in the given RangeTable.
 38func NotIn(rt *unicode.RangeTable) Set {
 39	return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
 40}
 41
 42// Predicate creates a Set with a Contains method that returns f(r).
 43func Predicate(f func(rune) bool) Set {
 44	return setFunc(f)
 45}
 46
// Transformer implements the transform.Transformer interface.
//
// It is a thin value wrapper around a transform.SpanningTransformer: its
// Transform, Span and Reset methods forward to the wrapped value, and it adds
// the Bytes and String convenience methods.
type Transformer struct {
	// t is the wrapped implementation every method delegates to.
	t transform.SpanningTransformer
}
 51
 52func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 53	return t.t.Transform(dst, src, atEOF)
 54}
 55
 56func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
 57	return t.t.Span(b, atEOF)
 58}
 59
 60func (t Transformer) Reset() { t.t.Reset() }
 61
 62// Bytes returns a new byte slice with the result of converting b using t.  It
 63// calls Reset on t. It returns nil if any error was found. This can only happen
 64// if an error-producing Transformer is passed to If.
 65func (t Transformer) Bytes(b []byte) []byte {
 66	b, _, err := transform.Bytes(t, b)
 67	if err != nil {
 68		return nil
 69	}
 70	return b
 71}
 72
 73// String returns a string with the result of converting s using t. It calls
 74// Reset on t. It returns the empty string if any error was found. This can only
 75// happen if an error-producing Transformer is passed to If.
 76func (t Transformer) String(s string) string {
 77	s, _, err := transform.String(t, s)
 78	if err != nil {
 79		return ""
 80	}
 81	return s
 82}
 83
 84// TODO:
 85// - Copy: copying strings and bytes in whole-rune units.
 86// - Validation (maybe)
 87// - Well-formed-ness (maybe)
 88
// runeErrorString is utf8.RuneError converted to a string. The Transform
// methods in this file write its first three bytes to dst whenever an
// ill-formed input byte is replaced.
const runeErrorString = string(utf8.RuneError)
 90
 91// Remove returns a Transformer that removes runes r for which s.Contains(r).
 92// Illegal input bytes are replaced by RuneError before being passed to f.
 93func Remove(s Set) Transformer {
 94	if f, ok := s.(setFunc); ok {
 95		// This little trick cuts the running time of BenchmarkRemove for sets
 96		// created by Predicate roughly in half.
 97		// TODO: special-case RangeTables as well.
 98		return Transformer{remove(f)}
 99	}
100	return Transformer{remove(s.Contains)}
101}
102
103// TODO: remove transform.RemoveFunc.
104
// remove is the function type wrapped by Remove. Its Transform method drops
// the runes for which the function returns true, and its Span method stops at
// the first such rune.
type remove func(r rune) bool

// Reset does nothing.
func (remove) Reset() {}
108
109// Span implements transform.Spanner.
110func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
111	for r, size := rune(0), 0; n < len(src); {
112		if r = rune(src[n]); r < utf8.RuneSelf {
113			size = 1
114		} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
115			// Invalid rune.
116			if !atEOF && !utf8.FullRune(src[n:]) {
117				err = transform.ErrShortSrc
118			} else {
119				err = transform.ErrEndOfSpan
120			}
121			break
122		}
123		if t(r) {
124			err = transform.ErrEndOfSpan
125			break
126		}
127		n += size
128	}
129	return
130}
131
132// Transform implements transform.Transformer.
133func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
134	for r, size := rune(0), 0; nSrc < len(src); {
135		if r = rune(src[nSrc]); r < utf8.RuneSelf {
136			size = 1
137		} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
138			// Invalid rune.
139			if !atEOF && !utf8.FullRune(src[nSrc:]) {
140				err = transform.ErrShortSrc
141				break
142			}
143			// We replace illegal bytes with RuneError. Not doing so might
144			// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
145			// The resulting byte sequence may subsequently contain runes
146			// for which t(r) is true that were passed unnoticed.
147			if !t(utf8.RuneError) {
148				if nDst+3 > len(dst) {
149					err = transform.ErrShortDst
150					break
151				}
152				dst[nDst+0] = runeErrorString[0]
153				dst[nDst+1] = runeErrorString[1]
154				dst[nDst+2] = runeErrorString[2]
155				nDst += 3
156			}
157			nSrc++
158			continue
159		}
160		if t(r) {
161			nSrc += size
162			continue
163		}
164		if nDst+size > len(dst) {
165			err = transform.ErrShortDst
166			break
167		}
168		for i := 0; i < size; i++ {
169			dst[nDst] = src[nSrc]
170			nDst++
171			nSrc++
172		}
173	}
174	return
175}
176
177// Map returns a Transformer that maps the runes in the input using the given
178// mapping. Illegal bytes in the input are converted to utf8.RuneError before
179// being passed to the mapping func.
180func Map(mapping func(rune) rune) Transformer {
181	return Transformer{mapper(mapping)}
182}
183
// mapper is the function type wrapped by Map. Its Transform method writes the
// function's result for each input rune, and its Span method stops at the
// first rune the function would change.
type mapper func(rune) rune

// Reset does nothing.
func (mapper) Reset() {}
187
188// Span implements transform.Spanner.
189func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
190	for r, size := rune(0), 0; n < len(src); n += size {
191		if r = rune(src[n]); r < utf8.RuneSelf {
192			size = 1
193		} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
194			// Invalid rune.
195			if !atEOF && !utf8.FullRune(src[n:]) {
196				err = transform.ErrShortSrc
197			} else {
198				err = transform.ErrEndOfSpan
199			}
200			break
201		}
202		if t(r) != r {
203			err = transform.ErrEndOfSpan
204			break
205		}
206	}
207	return n, err
208}
209
210// Transform implements transform.Transformer.
211func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
212	var replacement rune
213	var b [utf8.UTFMax]byte
214
215	for r, size := rune(0), 0; nSrc < len(src); {
216		if r = rune(src[nSrc]); r < utf8.RuneSelf {
217			if replacement = t(r); replacement < utf8.RuneSelf {
218				if nDst == len(dst) {
219					err = transform.ErrShortDst
220					break
221				}
222				dst[nDst] = byte(replacement)
223				nDst++
224				nSrc++
225				continue
226			}
227			size = 1
228		} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
229			// Invalid rune.
230			if !atEOF && !utf8.FullRune(src[nSrc:]) {
231				err = transform.ErrShortSrc
232				break
233			}
234
235			if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
236				if nDst+3 > len(dst) {
237					err = transform.ErrShortDst
238					break
239				}
240				dst[nDst+0] = runeErrorString[0]
241				dst[nDst+1] = runeErrorString[1]
242				dst[nDst+2] = runeErrorString[2]
243				nDst += 3
244				nSrc++
245				continue
246			}
247		} else if replacement = t(r); replacement == r {
248			if nDst+size > len(dst) {
249				err = transform.ErrShortDst
250				break
251			}
252			for i := 0; i < size; i++ {
253				dst[nDst] = src[nSrc]
254				nDst++
255				nSrc++
256			}
257			continue
258		}
259
260		n := utf8.EncodeRune(b[:], replacement)
261
262		if nDst+n > len(dst) {
263			err = transform.ErrShortDst
264			break
265		}
266		for i := 0; i < n; i++ {
267			dst[nDst] = b[i]
268			nDst++
269		}
270		nSrc += size
271	}
272	return
273}
274
275// ReplaceIllFormed returns a transformer that replaces all input bytes that are
276// not part of a well-formed UTF-8 code sequence with utf8.RuneError.
277func ReplaceIllFormed() Transformer {
278	return Transformer{&replaceIllFormed{}}
279}
280
// replaceIllFormed is the implementation behind ReplaceIllFormed. It has no
// fields of its own; it only embeds transform.NopResetter.
type replaceIllFormed struct{ transform.NopResetter }
282
283func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
284	for n < len(src) {
285		// ASCII fast path.
286		if src[n] < utf8.RuneSelf {
287			n++
288			continue
289		}
290
291		r, size := utf8.DecodeRune(src[n:])
292
293		// Look for a valid non-ASCII rune.
294		if r != utf8.RuneError || size != 1 {
295			n += size
296			continue
297		}
298
299		// Look for short source data.
300		if !atEOF && !utf8.FullRune(src[n:]) {
301			err = transform.ErrShortSrc
302			break
303		}
304
305		// We have an invalid rune.
306		err = transform.ErrEndOfSpan
307		break
308	}
309	return n, err
310}
311
312func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
313	for nSrc < len(src) {
314		// ASCII fast path.
315		if r := src[nSrc]; r < utf8.RuneSelf {
316			if nDst == len(dst) {
317				err = transform.ErrShortDst
318				break
319			}
320			dst[nDst] = r
321			nDst++
322			nSrc++
323			continue
324		}
325
326		// Look for a valid non-ASCII rune.
327		if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
328			if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
329				err = transform.ErrShortDst
330				break
331			}
332			nDst += size
333			nSrc += size
334			continue
335		}
336
337		// Look for short source data.
338		if !atEOF && !utf8.FullRune(src[nSrc:]) {
339			err = transform.ErrShortSrc
340			break
341		}
342
343		// We have an invalid rune.
344		if nDst+3 > len(dst) {
345			err = transform.ErrShortDst
346			break
347		}
348		dst[nDst+0] = runeErrorString[0]
349		dst[nDst+1] = runeErrorString[1]
350		dst[nDst+2] = runeErrorString[2]
351		nDst += 3
352		nSrc++
353	}
354	return nDst, nSrc, err
355}