hzgb2312.go

  1// Copyright 2013 The Go Authors. All rights reserved.
  2// Use of this source code is governed by a BSD-style
  3// license that can be found in the LICENSE file.
  4
  5package simplifiedchinese
  6
  7import (
  8	"unicode/utf8"
  9
 10	"golang.org/x/text/encoding"
 11	"golang.org/x/text/encoding/internal"
 12	"golang.org/x/text/encoding/internal/identifier"
 13	"golang.org/x/text/transform"
 14)
 15
// HZGB2312 is the HZ-GB2312 encoding.
var HZGB2312 encoding.Encoding = &hzGB2312

// hzGB2312 is the value that HZGB2312 points to. It bundles the decoder and
// encoder constructors with the encoding's name ("HZ-GB2312") and its
// identifier constant.
var hzGB2312 = internal.Encoding{
	internal.FuncEncoding{hzGB2312NewDecoder, hzGB2312NewEncoder},
	"HZ-GB2312",
	identifier.HZGB2312,
}
 24
 25func hzGB2312NewDecoder() transform.Transformer {
 26	return new(hzGB2312Decoder)
 27}
 28
 29func hzGB2312NewEncoder() transform.Transformer {
 30	return new(hzGB2312Encoder)
 31}
 32
// Shift states shared by the HZ-GB2312 decoder and encoder.
const (
	// asciiState is the initial state (the zero value, and what Reset
	// restores). Bytes stand for themselves, apart from '~' escapes.
	asciiState = iota
	// gbState is entered with "~{" and left with "~}". While in it,
	// characters are read or written as two-byte pairs.
	gbState
)
 37
 38type hzGB2312Decoder int
 39
 40func (d *hzGB2312Decoder) Reset() {
 41	*d = asciiState
 42}
 43
 44func (d *hzGB2312Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 45	r, size := rune(0), 0
 46loop:
 47	for ; nSrc < len(src); nSrc += size {
 48		c0 := src[nSrc]
 49		if c0 >= utf8.RuneSelf {
 50			r, size = utf8.RuneError, 1
 51			goto write
 52		}
 53
 54		if c0 == '~' {
 55			if nSrc+1 >= len(src) {
 56				if !atEOF {
 57					err = transform.ErrShortSrc
 58					break loop
 59				}
 60				r, size = utf8.RuneError, 1
 61				goto write
 62			}
 63			size = 2
 64			switch src[nSrc+1] {
 65			case '{':
 66				*d = gbState
 67				continue
 68			case '}':
 69				*d = asciiState
 70				continue
 71			case '~':
 72				if nDst >= len(dst) {
 73					err = transform.ErrShortDst
 74					break loop
 75				}
 76				dst[nDst] = '~'
 77				nDst++
 78				continue
 79			case '\n':
 80				continue
 81			default:
 82				r = utf8.RuneError
 83				goto write
 84			}
 85		}
 86
 87		if *d == asciiState {
 88			r, size = rune(c0), 1
 89		} else {
 90			if nSrc+1 >= len(src) {
 91				if !atEOF {
 92					err = transform.ErrShortSrc
 93					break loop
 94				}
 95				r, size = utf8.RuneError, 1
 96				goto write
 97			}
 98			size = 2
 99			c1 := src[nSrc+1]
100			if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
101				// error
102			} else if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) {
103				r = rune(decode[i])
104				if r != 0 {
105					goto write
106				}
107			}
108			if c1 > utf8.RuneSelf {
109				// Be consistent and always treat non-ASCII as a single error.
110				size = 1
111			}
112			r = utf8.RuneError
113		}
114
115	write:
116		if nDst+utf8.RuneLen(r) > len(dst) {
117			err = transform.ErrShortDst
118			break loop
119		}
120		nDst += utf8.EncodeRune(dst[nDst:], r)
121	}
122	return nDst, nSrc, err
123}
124
125type hzGB2312Encoder int
126
127func (d *hzGB2312Encoder) Reset() {
128	*d = asciiState
129}
130
131func (e *hzGB2312Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
132	r, size := rune(0), 0
133	for ; nSrc < len(src); nSrc += size {
134		r = rune(src[nSrc])
135
136		// Decode a 1-byte rune.
137		if r < utf8.RuneSelf {
138			size = 1
139			if r == '~' {
140				if nDst+2 > len(dst) {
141					err = transform.ErrShortDst
142					break
143				}
144				dst[nDst+0] = '~'
145				dst[nDst+1] = '~'
146				nDst += 2
147				continue
148			} else if *e != asciiState {
149				if nDst+3 > len(dst) {
150					err = transform.ErrShortDst
151					break
152				}
153				*e = asciiState
154				dst[nDst+0] = '~'
155				dst[nDst+1] = '}'
156				nDst += 2
157			} else if nDst >= len(dst) {
158				err = transform.ErrShortDst
159				break
160			}
161			dst[nDst] = uint8(r)
162			nDst += 1
163			continue
164
165		}
166
167		// Decode a multi-byte rune.
168		r, size = utf8.DecodeRune(src[nSrc:])
169		if size == 1 {
170			// All valid runes of size 1 (those below utf8.RuneSelf) were
171			// handled above. We have invalid UTF-8 or we haven't seen the
172			// full character yet.
173			if !atEOF && !utf8.FullRune(src[nSrc:]) {
174				err = transform.ErrShortSrc
175				break
176			}
177		}
178
179		// func init checks that the switch covers all tables.
180		switch {
181		case encode0Low <= r && r < encode0High:
182			if r = rune(encode0[r-encode0Low]); r != 0 {
183				goto writeGB
184			}
185		case encode1Low <= r && r < encode1High:
186			if r = rune(encode1[r-encode1Low]); r != 0 {
187				goto writeGB
188			}
189		case encode2Low <= r && r < encode2High:
190			if r = rune(encode2[r-encode2Low]); r != 0 {
191				goto writeGB
192			}
193		case encode3Low <= r && r < encode3High:
194			if r = rune(encode3[r-encode3Low]); r != 0 {
195				goto writeGB
196			}
197		case encode4Low <= r && r < encode4High:
198			if r = rune(encode4[r-encode4Low]); r != 0 {
199				goto writeGB
200			}
201		}
202
203	terminateInASCIIState:
204		// Switch back to ASCII state in case of error so that an ASCII
205		// replacement character can be written in the correct state.
206		if *e != asciiState {
207			if nDst+2 > len(dst) {
208				err = transform.ErrShortDst
209				break
210			}
211			dst[nDst+0] = '~'
212			dst[nDst+1] = '}'
213			nDst += 2
214		}
215		err = internal.ErrASCIIReplacement
216		break
217
218	writeGB:
219		c0 := uint8(r>>8) - 0x80
220		c1 := uint8(r) - 0x80
221		if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
222			goto terminateInASCIIState
223		}
224		if *e == asciiState {
225			if nDst+4 > len(dst) {
226				err = transform.ErrShortDst
227				break
228			}
229			*e = gbState
230			dst[nDst+0] = '~'
231			dst[nDst+1] = '{'
232			nDst += 2
233		} else if nDst+2 > len(dst) {
234			err = transform.ErrShortDst
235			break
236		}
237		dst[nDst+0] = c0
238		dst[nDst+1] = c1
239		nDst += 2
240		continue
241	}
242	// TODO: should one always terminate in ASCII state to make it safe to
243	// concatenate two HZ-GB2312-encoded strings?
244	return nDst, nSrc, err
245}