iso2022jp.go

  1// Copyright 2013 The Go Authors. All rights reserved.
  2// Use of this source code is governed by a BSD-style
  3// license that can be found in the LICENSE file.
  4
  5package japanese
  6
  7import (
  8	"unicode/utf8"
  9
 10	"golang.org/x/text/encoding"
 11	"golang.org/x/text/encoding/internal"
 12	"golang.org/x/text/encoding/internal/identifier"
 13	"golang.org/x/text/transform"
 14)
 15
 16// ISO2022JP is the ISO-2022-JP encoding.
 17var ISO2022JP encoding.Encoding = &iso2022JP
 18
 19var iso2022JP = internal.Encoding{
 20	internal.FuncEncoding{iso2022JPNewDecoder, iso2022JPNewEncoder},
 21	"ISO-2022-JP",
 22	identifier.ISO2022JP,
 23}
 24
 25func iso2022JPNewDecoder() transform.Transformer {
 26	return new(iso2022JPDecoder)
 27}
 28
 29func iso2022JPNewEncoder() transform.Transformer {
 30	return new(iso2022JPEncoder)
 31}
 32
 33const (
 34	asciiState = iota
 35	katakanaState
 36	jis0208State
 37	jis0212State
 38)
 39
 40const asciiEsc = 0x1b
 41
 42type iso2022JPDecoder int
 43
 44func (d *iso2022JPDecoder) Reset() {
 45	*d = asciiState
 46}
 47
// Transform decodes ISO-2022-JP bytes from src into UTF-8 in dst.
//
// The receiver holds the current shift state, which persists between calls so
// that input may be fed in arbitrary chunks. nDst and nSrc are the number of
// bytes written to dst and consumed from src.
//
// The returned error is transform.ErrShortDst when dst has no room for the
// next rune, transform.ErrShortSrc when src ends inside an escape sequence or
// a two-byte character and atEOF is false, and nil otherwise. Undecodable
// input is not an error: it is replaced by U+FFFD.
func (d *iso2022JPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	// size is the number of source bytes the current iteration consumes. The
	// loop's post statement adds it to nSrc, so it must be set on every path
	// that reaches "continue" or the write label. The early returns bypass
	// the post statement and therefore leave the current input unconsumed.
	r, size := rune(0), 0
	for ; nSrc < len(src); nSrc += size {
		c0 := src[nSrc]
		// Bytes with the high bit set are rejected in every state; each one
		// becomes a single U+FFFD.
		if c0 >= utf8.RuneSelf {
			r, size = '\ufffd', 1
			goto write
		}

		// Escape sequences are recognized in every state. They change the
		// shift state and produce no output.
		if c0 == asciiEsc {
			// At least two more bytes are needed to identify the sequence.
			if nSrc+2 >= len(src) {
				if !atEOF {
					return nDst, nSrc, transform.ErrShortSrc
				}
				// Truncated escape at end of input: replace the ESC byte
				// alone; any following byte is decoded on its own in the
				// next iteration.
				// TODO: is it correct to only skip 1??
				r, size = '\ufffd', 1
				goto write
			}
			size = 3
			c1 := src[nSrc+1]
			c2 := src[nSrc+2]
			switch {
			case c1 == '$' && (c2 == '@' || c2 == 'B'): // 0x24 {0x40, 0x42}
				*d = jis0208State
				continue
			case c1 == '$' && c2 == '(': // 0x24 0x28
				// Four-byte sequence; the final byte selects the set.
				if nSrc+3 >= len(src) {
					if !atEOF {
						return nDst, nSrc, transform.ErrShortSrc
					}
					r, size = '\ufffd', 1
					goto write
				}
				size = 4
				if src[nSrc+3] == 'D' {
					*d = jis0212State
					continue
				}
				// Any other final byte leaves the switch and is handled as
				// an unrecognized escape below.
			case c1 == '(' && (c2 == 'B' || c2 == 'J'): // 0x28 {0x42, 0x4A}
				*d = asciiState
				continue
			case c1 == '(' && c2 == 'I': // 0x28 0x49
				*d = katakanaState
				continue
			}
			// Unrecognized escape sequence: replace only the ESC byte and
			// leave the state unchanged. The bytes after it are decoded
			// normally.
			r, size = '\ufffd', 1
			goto write
		}

		switch *d {
		case asciiState:
			// One byte maps to the rune with the same value.
			r, size = rune(c0), 1

		case katakanaState:
			// Only 0x21..0x5f are accepted; they map linearly onto
			// U+FF61..U+FF9F.
			if c0 < 0x21 || 0x60 <= c0 {
				r, size = '\ufffd', 1
				goto write
			}
			r, size = rune(c0)+(0xff61-0x21), 1

		default:
			// Two-byte states (jis0208State and jis0212State).

			// A newline drops back to ASCII and is emitted unchanged.
			if c0 == 0x0a {
				*d = asciiState
				r, size = rune(c0), 1
				goto write
			}
			if nSrc+1 >= len(src) {
				if !atEOF {
					return nDst, nSrc, transform.ErrShortSrc
				}
				// Dangling lead byte at end of input.
				r, size = '\ufffd', 1
				goto write
			}
			// From here on both bytes are consumed, whether or not they
			// decode to a character.
			size = 2
			c1 := src[nSrc+1]
			// Row-major index into a table with 94 entries per row. The
			// subtractions are done in byte arithmetic, so a byte below 0x21
			// wraps around to a large value instead of going negative; i is
			// therefore never negative and only the upper bound is checked.
			// NOTE(review): c1 is not range-checked, so a trail byte outside
			// 0x21..0x7e can land on an entry of a different row when the
			// resulting index is still inside the table — confirm that this
			// leniency is intended.
			i := int(c0-0x21)*94 + int(c1-0x21)
			if *d == jis0208State && i < len(jis0208Decode) {
				r = rune(jis0208Decode[i])
			} else if *d == jis0212State && i < len(jis0212Decode) {
				r = rune(jis0212Decode[i])
			} else {
				r = '\ufffd'
				goto write
			}
			// A zero table entry is treated as an unassigned code point.
			if r == 0 {
				r = '\ufffd'
			}
		}

	write:
		// Emit r as UTF-8. If it does not fit, return without consuming the
		// current input so that the caller can retry with more room.
		if nDst+utf8.RuneLen(r) > len(dst) {
			return nDst, nSrc, transform.ErrShortDst
		}
		nDst += utf8.EncodeRune(dst[nDst:], r)
	}
	// err is never assigned above, so this returns a nil error.
	return nDst, nSrc, err
}
145
146type iso2022JPEncoder int
147
148func (e *iso2022JPEncoder) Reset() {
149	*e = asciiState
150}
151
// Transform encodes UTF-8 from src into ISO-2022-JP in dst.
//
// The receiver holds the shift state most recently written to dst. Whenever
// the next character needs a different state, the matching escape sequence
// is written first. Room for the escape sequence and the character is
// checked together, so an escape is never emitted without the character that
// required it. nDst and nSrc are the number of bytes written to dst and
// consumed from src.
//
// The returned error is transform.ErrShortDst when dst is too small,
// transform.ErrShortSrc when src ends in an incomplete UTF-8 sequence and
// atEOF is false, and internal.ErrASCIIReplacement when the next rune cannot
// be encoded. In all three cases the offending rune is left unconsumed.
func (e *iso2022JPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	// size is the UTF-8 length of the current rune; the loop's post statement
	// adds it to nSrc. Every "break" below leaves the loop without running
	// the post statement, so the current rune stays unconsumed.
	r, size := rune(0), 0
	for ; nSrc < len(src); nSrc += size {
		r = rune(src[nSrc])

		// Decode a 1-byte rune.
		if r < utf8.RuneSelf {
			size = 1

		} else {
			// Decode a multi-byte rune.
			r, size = utf8.DecodeRune(src[nSrc:])
			if size == 1 {
				// All valid runes of size 1 (those below utf8.RuneSelf) were
				// handled above. We have invalid UTF-8 or we haven't seen the
				// full character yet.
				if !atEOF && !utf8.FullRune(src[nSrc:]) {
					err = transform.ErrShortSrc
					break
				}
				// Otherwise the input is invalid and r is utf8.RuneError; it
				// goes through the lookup below like any other rune.
			}

			// func init checks that the switch covers all tables.
			//
			// http://encoding.spec.whatwg.org/#iso-2022-jp says that "the index jis0212
			// is not used by the iso-2022-jp encoder due to lack of widespread support".
			//
			// TODO: do we have to special-case U+00A5 and U+203E, as per
			// http://encoding.spec.whatwg.org/#iso-2022-jp
			// Doing so would mean that "\u00a5" would not be preserved
			// after an encode-decode round trip.
			//
			// Each case looks r up in the table covering its range and
			// overwrites r with the table entry. The encodeN tables and
			// their bounds, tableShift and jis0208 are defined elsewhere in
			// the package; presumably each entry carries a table identifier
			// in its high bits and the code in its low bits — confirm
			// against the table generator. Only entries tagged jis0208 are
			// encodable here; anything else falls out of the switch.
			switch {
			case encode0Low <= r && r < encode0High:
				if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
					goto writeJIS
				}
			case encode1Low <= r && r < encode1High:
				if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
					goto writeJIS
				}
			case encode2Low <= r && r < encode2High:
				if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
					goto writeJIS
				}
			case encode3Low <= r && r < encode3High:
				if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
					goto writeJIS
				}
			case encode4Low <= r && r < encode4High:
				if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
					goto writeJIS
				}
			case encode5Low <= r && r < encode5High:
				// U+FF61..U+FF9F are written as single bytes in the katakana
				// state. This test runs before r is overwritten, so
				// writeKatakana still sees the original rune.
				if 0xff61 <= r && r < 0xffa0 {
					goto writeKatakana
				}
				if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
					goto writeJIS
				}
			}

			// Switch back to ASCII state in case of error so that an ASCII
			// replacement character can be written in the correct state.
			if *e != asciiState {
				if nDst+3 > len(dst) {
					err = transform.ErrShortDst
					break
				}
				*e = asciiState
				dst[nDst+0] = asciiEsc
				dst[nDst+1] = '('
				dst[nDst+2] = 'B'
				nDst += 3
			}
			err = internal.ErrASCIIReplacement
			break
		}

		// Single-byte (below utf8.RuneSelf) path: make sure the output is in
		// ASCII state, then copy the byte through unchanged.
		if *e != asciiState {
			// 3 bytes of escape sequence plus 1 byte of payload.
			if nDst+4 > len(dst) {
				err = transform.ErrShortDst
				break
			}
			*e = asciiState
			dst[nDst+0] = asciiEsc
			dst[nDst+1] = '('
			dst[nDst+2] = 'B'
			nDst += 3
		} else if nDst >= len(dst) {
			err = transform.ErrShortDst
			break
		}
		dst[nDst] = uint8(r)
		nDst++
		continue

	writeJIS:
		// r is a table entry here, not a Unicode code point.
		if *e != jis0208State {
			// 3 bytes of escape sequence plus 2 bytes of payload.
			if nDst+5 > len(dst) {
				err = transform.ErrShortDst
				break
			}
			*e = jis0208State
			dst[nDst+0] = asciiEsc
			dst[nDst+1] = '$'
			dst[nDst+2] = 'B'
			nDst += 3
		} else if nDst+2 > len(dst) {
			err = transform.ErrShortDst
			break
		}
		// In Go, & binds tighter than +, so the mask is applied to each code
		// component before the 0x21 offset is added.
		dst[nDst+0] = 0x21 + uint8(r>>codeShift)&codeMask
		dst[nDst+1] = 0x21 + uint8(r)&codeMask
		nDst += 2
		continue

	writeKatakana:
		// r is still the original rune in U+FF61..U+FF9F.
		if *e != katakanaState {
			// 3 bytes of escape sequence plus 1 byte of payload.
			if nDst+4 > len(dst) {
				err = transform.ErrShortDst
				break
			}
			*e = katakanaState
			dst[nDst+0] = asciiEsc
			dst[nDst+1] = '('
			dst[nDst+2] = 'I'
			nDst += 3
		} else if nDst >= len(dst) {
			err = transform.ErrShortDst
			break
		}
		// Map U+FF61..U+FF9F onto the bytes 0x21..0x5f.
		dst[nDst] = uint8(r - (0xff61 - 0x21))
		nDst++
		continue
	}
	// At the end of the input, and only if nothing went wrong, return the
	// output to ASCII state so that the encoded stream ends in ASCII. If the
	// escape sequence does not fit, report ErrShortDst so the caller retries.
	if atEOF && err == nil && *e != asciiState {
		if nDst+3 > len(dst) {
			err = transform.ErrShortDst
		} else {
			*e = asciiState
			dst[nDst+0] = asciiEsc
			dst[nDst+1] = '('
			dst[nDst+2] = 'B'
			nDst += 3
		}
	}
	return nDst, nSrc, err
}
299}