iter.go

  1// Copyright 2011 The Go Authors. All rights reserved.
  2// Use of this source code is governed by a BSD-style
  3// license that can be found in the LICENSE file.
  4
  5package norm
  6
  7import (
  8	"fmt"
  9	"unicode/utf8"
 10)
 11
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
// sequence of starter and non-starter runes for the purpose of normalization.
// It is an exported alias of maxByteBufferSize, which is also the size of the
// scratch buffer embedded in Iter.
const MaxSegmentSize = maxByteBufferSize
 15
// An Iter iterates over a string or byte slice, while normalizing it
// to a given Form.
//
// An Iter must be initialized with Init or InitString before use. Segments
// returned by Next may be slices of the internal buffer buf, which is
// overwritten by later calls.
type Iter struct {
	// rb holds the input source (rb.src, rb.nsrc), the form (rb.f) and the
	// stream-safe state (rb.ss); it is also used to reorder and compose runes
	// when a segment cannot simply be copied through.
	rb reorderBuffer
	// buf is the scratch buffer that output segments are assembled in.
	buf    [maxByteBufferSize]byte
	info   Properties // first character saved from previous iteration
	next   iterFunc   // implementation of next depends on form
	// asciiF is the fast path used while the input is plain ASCII:
	// nextASCIIBytes after Init, nextASCIIString after InitString.
	asciiF iterFunc

	p        int    // current position in input source
	multiSeg []byte // remainder of multi-segment decomposition
}
 28
// iterFunc is the signature of the step functions stored in Iter.next and
// Iter.asciiF. A step function returns the next output segment and updates
// the iterator, possibly installing a different step function for the
// following call.
type iterFunc func(*Iter) []byte
 30
 31// Init initializes i to iterate over src after normalizing it to Form f.
 32func (i *Iter) Init(f Form, src []byte) {
 33	i.p = 0
 34	if len(src) == 0 {
 35		i.setDone()
 36		i.rb.nsrc = 0
 37		return
 38	}
 39	i.multiSeg = nil
 40	i.rb.init(f, src)
 41	i.next = i.rb.f.nextMain
 42	i.asciiF = nextASCIIBytes
 43	i.info = i.rb.f.info(i.rb.src, i.p)
 44	i.rb.ss.first(i.info)
 45}
 46
 47// InitString initializes i to iterate over src after normalizing it to Form f.
 48func (i *Iter) InitString(f Form, src string) {
 49	i.p = 0
 50	if len(src) == 0 {
 51		i.setDone()
 52		i.rb.nsrc = 0
 53		return
 54	}
 55	i.multiSeg = nil
 56	i.rb.initString(f, src)
 57	i.next = i.rb.f.nextMain
 58	i.asciiF = nextASCIIString
 59	i.info = i.rb.f.info(i.rb.src, i.p)
 60	i.rb.ss.first(i.info)
 61}
 62
 63// Seek sets the segment to be returned by the next call to Next to start
 64// at position p.  It is the responsibility of the caller to set p to the
 65// start of a segment.
 66func (i *Iter) Seek(offset int64, whence int) (int64, error) {
 67	var abs int64
 68	switch whence {
 69	case 0:
 70		abs = offset
 71	case 1:
 72		abs = int64(i.p) + offset
 73	case 2:
 74		abs = int64(i.rb.nsrc) + offset
 75	default:
 76		return 0, fmt.Errorf("norm: invalid whence")
 77	}
 78	if abs < 0 {
 79		return 0, fmt.Errorf("norm: negative position")
 80	}
 81	if int(abs) >= i.rb.nsrc {
 82		i.setDone()
 83		return int64(i.p), nil
 84	}
 85	i.p = int(abs)
 86	i.multiSeg = nil
 87	i.next = i.rb.f.nextMain
 88	i.info = i.rb.f.info(i.rb.src, i.p)
 89	i.rb.ss.first(i.info)
 90	return abs, nil
 91}
 92
 93// returnSlice returns a slice of the underlying input type as a byte slice.
 94// If the underlying is of type []byte, it will simply return a slice.
 95// If the underlying is of type string, it will copy the slice to the buffer
 96// and return that.
 97func (i *Iter) returnSlice(a, b int) []byte {
 98	if i.rb.src.bytes == nil {
 99		return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
100	}
101	return i.rb.src.bytes[a:b]
102}
103
// Pos returns the byte position at which the next call to Next will commence processing.
// The position is an offset into the input passed to Init or InitString.
func (i *Iter) Pos() int {
	return i.p
}
108
// setDone marks the iterator as exhausted: further calls to Next dispatch to
// nextDone, and the position is moved to i.rb.nsrc so that Done reports true.
func (i *Iter) setDone() {
	i.next = nextDone
	i.p = i.rb.nsrc
}
113
// Done returns true if there is no more input to process, that is, if the
// current position has reached or passed the end of the input.
func (i *Iter) Done() bool {
	return i.p >= i.rb.nsrc
}
118
// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
// For any input a and b for which f(a) == f(b), subsequent calls
// to Next will return the same segments.
// Modifying runes are grouped together with the preceding starter, if such a starter exists.
// Although not guaranteed, n will typically be the smallest possible n.
//
// The returned slice may refer to the iterator's internal buffer or to the
// input itself; callers that need it after a later call to Next should copy
// it. Once the iterator has been marked done, Next returns nil.
func (i *Iter) Next() []byte {
	return i.next(i)
}
127
128func nextASCIIBytes(i *Iter) []byte {
129	p := i.p + 1
130	if p >= i.rb.nsrc {
131		p0 := i.p
132		i.setDone()
133		return i.rb.src.bytes[p0:p]
134	}
135	if i.rb.src.bytes[p] < utf8.RuneSelf {
136		p0 := i.p
137		i.p = p
138		return i.rb.src.bytes[p0:p]
139	}
140	i.info = i.rb.f.info(i.rb.src, i.p)
141	i.next = i.rb.f.nextMain
142	return i.next(i)
143}
144
145func nextASCIIString(i *Iter) []byte {
146	p := i.p + 1
147	if p >= i.rb.nsrc {
148		i.buf[0] = i.rb.src.str[i.p]
149		i.setDone()
150		return i.buf[:1]
151	}
152	if i.rb.src.str[p] < utf8.RuneSelf {
153		i.buf[0] = i.rb.src.str[i.p]
154		i.p = p
155		return i.buf[:1]
156	}
157	i.info = i.rb.f.info(i.rb.src, i.p)
158	i.next = i.rb.f.nextMain
159	return i.next(i)
160}
161
162func nextHangul(i *Iter) []byte {
163	p := i.p
164	next := p + hangulUTF8Size
165	if next >= i.rb.nsrc {
166		i.setDone()
167	} else if i.rb.src.hangul(next) == 0 {
168		i.rb.ss.next(i.info)
169		i.info = i.rb.f.info(i.rb.src, i.p)
170		i.next = i.rb.f.nextMain
171		return i.next(i)
172	}
173	i.p = next
174	return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
175}
176
// nextDone is the step function installed by setDone. It ignores the
// iterator and always returns nil.
func nextDone(i *Iter) []byte {
	return nil
}
180
181// nextMulti is used for iterating over multi-segment decompositions
182// for decomposing normal forms.
183func nextMulti(i *Iter) []byte {
184	j := 0
185	d := i.multiSeg
186	// skip first rune
187	for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
188	}
189	for j < len(d) {
190		info := i.rb.f.info(input{bytes: d}, j)
191		if info.BoundaryBefore() {
192			i.multiSeg = d[j:]
193			return d[:j]
194		}
195		j += int(info.size)
196	}
197	// treat last segment as normal decomposition
198	i.next = i.rb.f.nextMain
199	return i.next(i)
200}
201
202// nextMultiNorm is used for iterating over multi-segment decompositions
203// for composing normal forms.
204func nextMultiNorm(i *Iter) []byte {
205	j := 0
206	d := i.multiSeg
207	for j < len(d) {
208		info := i.rb.f.info(input{bytes: d}, j)
209		if info.BoundaryBefore() {
210			i.rb.compose()
211			seg := i.buf[:i.rb.flushCopy(i.buf[:])]
212			i.rb.insertUnsafe(input{bytes: d}, j, info)
213			i.multiSeg = d[j+int(info.size):]
214			return seg
215		}
216		i.rb.insertUnsafe(input{bytes: d}, j, info)
217		j += int(info.size)
218	}
219	i.multiSeg = nil
220	i.next = nextComposed
221	return doNormComposed(i)
222}
223
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
//
// It scans forward from i.p, building one output segment. Runes that need no
// change are tracked as a pending input range and copied lazily; runes with a
// decomposition are written into i.buf. If the combining classes turn out to
// be out of order, the work done so far is handed to the reorder buffer and
// finished by doNormDecomposed.
func nextDecomposed(i *Iter) (next []byte) {
	// outp is the number of output bytes accounted for so far.
	outp := 0
	// [inCopyStart, i.p) is the input range not yet copied to the output;
	// outCopyStart is the offset in i.buf where that range belongs.
	inCopyStart, outCopyStart := i.p, 0
	for {
		if sz := int(i.info.size); sz <= 1 {
			i.rb.ss = 0
			p := i.p
			i.p++ // ASCII or illegal byte.  Either way, advance by 1.
			if i.p >= i.rb.nsrc {
				i.setDone()
				return i.returnSlice(p, i.p)
			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
				// Another ASCII byte follows: switch to the ASCII fast path.
				i.next = i.asciiF
				return i.returnSlice(p, i.p)
			}
			outp++
		} else if d := i.info.Decomposition(); d != nil {
			// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
			// Case 1: there is a leftover to copy.  In this case the decomposition
			// must begin with a modifier and should always be appended.
			// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
			p := outp + len(d)
			if outp > 0 {
				// Flush the pending unchanged input range into i.buf first.
				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				// TODO: this condition should not be possible, but we leave it
				// in for defensive purposes.
				if p > len(i.buf) {
					return i.buf[:outp]
				}
			} else if i.info.multiSegment() {
				// outp must be 0 as multi-segment decompositions always
				// start a new segment.
				if i.multiSeg == nil {
					i.multiSeg = d
					i.next = nextMulti
					return nextMulti(i)
				}
				// We are in the last segment.  Treat as normal decomposition.
				d = i.multiSeg
				i.multiSeg = nil
				p = len(d)
			}
			prevCC := i.info.tccc
			if i.p += sz; i.p >= i.rb.nsrc {
				i.setDone()
				i.info = Properties{} // Force BoundaryBefore to succeed.
			} else {
				i.info = i.rb.f.info(i.rb.src, i.p)
			}
			switch i.rb.ss.next(i.info) {
			case ssOverflow:
				i.next = nextCGJDecompose
				fallthrough
			case ssStarter:
				// The segment ends here. Return d directly when nothing
				// precedes it, otherwise append it to what is in i.buf.
				if outp > 0 {
					copy(i.buf[outp:], d)
					return i.buf[:p]
				}
				return d
			}
			// The segment continues: append d and restart the pending range
			// at the new position.
			copy(i.buf[outp:], d)
			outp = p
			inCopyStart, outCopyStart = i.p, outp
			if i.info.ccc < prevCC {
				// Combining classes are out of order: reordering is needed.
				goto doNorm
			}
			continue
		} else if r := i.rb.src.hangul(i.p); r != 0 {
			// Hangul syllable: decompose it at the start of i.buf.
			outp = decomposeHangul(i.buf[:], r)
			i.p += hangulUTF8Size
			inCopyStart, outCopyStart = i.p, outp
			if i.p >= i.rb.nsrc {
				i.setDone()
				break
			} else if i.rb.src.hangul(i.p) != 0 {
				// More Hangul follows: use the dedicated step function.
				i.next = nextHangul
				return i.buf[:outp]
			}
		} else {
			// No decomposition: extend the pending range, unless the
			// segment would no longer fit in i.buf.
			p := outp + sz
			if p > len(i.buf) {
				break
			}
			outp = p
			i.p += sz
		}
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		prevCC := i.info.tccc
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJDecompose
			break
		}
		if i.info.ccc < prevCC {
			// Combining classes are out of order: reordering is needed.
			goto doNorm
		}
	}
	if outCopyStart == 0 {
		// The pending range starts at output offset 0, so the segment is
		// exactly that input range: return it without going through i.buf
		// (returnSlice still copies when the source is a string).
		return i.returnSlice(inCopyStart, i.p)
	} else if inCopyStart < i.p {
		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	}
	return i.buf[:outp]
doNorm:
	// Insert what we have decomposed so far in the reorderBuffer.
	// As we will only reorder, there will always be enough room.
	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	i.rb.insertDecomposed(i.buf[0:outp])
	return doNormDecomposed(i)
}
340
341func doNormDecomposed(i *Iter) []byte {
342	for {
343		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
344		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
345			i.setDone()
346			break
347		}
348		i.info = i.rb.f.info(i.rb.src, i.p)
349		if i.info.ccc == 0 {
350			break
351		}
352		if s := i.rb.ss.next(i.info); s == ssOverflow {
353			i.next = nextCGJDecompose
354			break
355		}
356	}
357	// new segment or too many combining characters: exit normalization
358	return i.buf[:i.rb.flushCopy(i.buf[:])]
359}
360
361func nextCGJDecompose(i *Iter) []byte {
362	i.rb.ss = 0
363	i.rb.insertCGJ()
364	i.next = nextDecomposed
365	i.rb.ss.first(i.info)
366	buf := doNormDecomposed(i)
367	return buf
368}
369
// nextComposed is the implementation of Next for forms NFC and NFKC.
//
// It first tries to pass a segment through unchanged: as long as every rune
// satisfies isYesC and the combining classes are in order, the segment is
// simply a slice of the input. Otherwise it rewinds to the start of the
// segment and redoes it through the reorder buffer.
func nextComposed(i *Iter) []byte {
	// outp tracks the segment size so it never exceeds i.buf; startp is the
	// input position where the segment begins.
	outp, startp := 0, i.p
	var prevCC uint8
	for {
		if !i.info.isYesC() {
			goto doNorm
		}
		prevCC = i.info.tccc
		sz := int(i.info.size)
		if sz == 0 {
			sz = 1 // illegal rune: copy byte-by-byte
		}
		p := outp + sz
		if p > len(i.buf) {
			break
		}
		outp = p
		i.p += sz
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
			// ASCII follows: end the segment and switch to the fast path.
			i.rb.ss = 0
			i.next = i.asciiF
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJCompose
			break
		}
		if i.info.ccc < prevCC {
			// Combining classes are out of order: reordering is needed.
			goto doNorm
		}
	}
	return i.returnSlice(startp, i.p)
doNorm:
	// reset to start position
	i.p = startp
	i.info = i.rb.f.info(i.rb.src, i.p)
	i.rb.ss.first(i.info)
	if i.info.multiSegment() {
		// Insert the first rune of the decomposition and let nextMultiNorm
		// work through the remainder.
		d := i.info.Decomposition()
		info := i.rb.f.info(input{bytes: d}, 0)
		i.rb.insertUnsafe(input{bytes: d}, 0, info)
		i.multiSeg = d[int(info.size):]
		i.next = nextMultiNorm
		return nextMultiNorm(i)
	}
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}
426
427func doNormComposed(i *Iter) []byte {
428	// First rune should already be inserted.
429	for {
430		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
431			i.setDone()
432			break
433		}
434		i.info = i.rb.f.info(i.rb.src, i.p)
435		if s := i.rb.ss.next(i.info); s == ssStarter {
436			break
437		} else if s == ssOverflow {
438			i.next = nextCGJCompose
439			break
440		}
441		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
442	}
443	i.rb.compose()
444	seg := i.buf[:i.rb.flushCopy(i.buf[:])]
445	return seg
446}
447
// nextCGJCompose is the step function installed after a stream-safe overflow
// in a composing form. It resets the stream-safe state, inserts a CGJ into
// the reorder buffer, restores nextComposed as the step function, inserts the
// rune in i.info and continues via doNormComposed.
func nextCGJCompose(i *Iter) []byte {
	i.rb.ss = 0 // instead of first
	i.rb.insertCGJ()
	i.next = nextComposed
	// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
	// even if they are not. This is particularly dubious for U+FF9E and U+FF9A.
	// If we ever change that, insert a check here.
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}
458}