1package jsonparser
2
3import (
4 "bytes"
5 "unicode/utf8"
6)
7
8// JSON Unicode stuff: see https://tools.ietf.org/html/rfc7159#section-7
9
// Code point boundaries used when decoding \uXXXX escapes and UTF-16 surrogate pairs.
const (
	// supplementalPlanesOffset is added when combining a surrogate pair (U+10000).
	supplementalPlanesOffset = 0x10000

	// highSurrogateOffset is the lower bound of the UTF-16 surrogate range (U+D800).
	highSurrogateOffset = 0xD800

	// lowSurrogateOffset is the smallest value accepted as a UTF-16 low surrogate (U+DC00).
	lowSurrogateOffset = 0xDC00

	// basicMultilingualPlaneReservedOffset is the upper bound of the UTF-16 surrogate range (U+DFFF).
	basicMultilingualPlaneReservedOffset = 0xDFFF

	// basicMultilingualPlaneOffset is the largest code point of the Basic Multilingual Plane (U+FFFF).
	basicMultilingualPlaneOffset = 0xFFFF
)
16
17func combineUTF16Surrogates(high, low rune) rune {
18 return supplementalPlanesOffset + (high-highSurrogateOffset)<<10 + (low - lowSurrogateOffset)
19}
20
// badHex is the value h2I returns for a byte that is not a hexadecimal digit.
const badHex = -1
22
23func h2I(c byte) int {
24 switch {
25 case c >= '0' && c <= '9':
26 return int(c - '0')
27 case c >= 'A' && c <= 'F':
28 return int(c - 'A' + 10)
29 case c >= 'a' && c <= 'f':
30 return int(c - 'a' + 10)
31 }
32 return badHex
33}
34
35// decodeSingleUnicodeEscape decodes a single \uXXXX escape sequence. The prefix \u is assumed to be present and
36// is not checked.
37// In JSON, these escapes can either come alone or as part of "UTF16 surrogate pairs" that must be handled together.
38// This function only handles one; decodeUnicodeEscape handles this more complex case.
39func decodeSingleUnicodeEscape(in []byte) (rune, bool) {
40 // We need at least 6 characters total
41 if len(in) < 6 {
42 return utf8.RuneError, false
43 }
44
45 // Convert hex to decimal
46 h1, h2, h3, h4 := h2I(in[2]), h2I(in[3]), h2I(in[4]), h2I(in[5])
47 if h1 == badHex || h2 == badHex || h3 == badHex || h4 == badHex {
48 return utf8.RuneError, false
49 }
50
51 // Compose the hex digits
52 return rune(h1<<12 + h2<<8 + h3<<4 + h4), true
53}
54
55// isUTF16EncodedRune checks if a rune is in the range for non-BMP characters,
56// which is used to describe UTF16 chars.
57// Source: https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane
58func isUTF16EncodedRune(r rune) bool {
59 return highSurrogateOffset <= r && r <= basicMultilingualPlaneReservedOffset
60}
61
62func decodeUnicodeEscape(in []byte) (rune, int) {
63 if r, ok := decodeSingleUnicodeEscape(in); !ok {
64 // Invalid Unicode escape
65 return utf8.RuneError, -1
66 } else if r <= basicMultilingualPlaneOffset && !isUTF16EncodedRune(r) {
67 // Valid Unicode escape in Basic Multilingual Plane
68 return r, 6
69 } else if r2, ok := decodeSingleUnicodeEscape(in[6:]); !ok { // Note: previous decodeSingleUnicodeEscape success guarantees at least 6 bytes remain
70 // UTF16 "high surrogate" without manditory valid following Unicode escape for the "low surrogate"
71 return utf8.RuneError, -1
72 } else if r2 < lowSurrogateOffset {
73 // Invalid UTF16 "low surrogate"
74 return utf8.RuneError, -1
75 } else {
76 // Valid UTF16 surrogate pair
77 return combineUTF16Surrogates(r, r2), 12
78 }
79}
80
// backslashCharEscapeTable maps the byte X of a two-character escape '\X' to the byte that replaces the escape:
// when '\X' is found, it is to be replaced with backslashCharEscapeTable[X].
// Only the escapes listed below have an entry; every other index holds zero.
var backslashCharEscapeTable = [...]byte{
	't':  '\t',
	'r':  '\r',
	'n':  '\n',
	'f':  '\f',
	'b':  '\b',
	'/':  '/',
	'\\': '\\',
	'"':  '"',
}
92
93// unescapeToUTF8 unescapes the single escape sequence starting at 'in' into 'out' and returns
94// how many characters were consumed from 'in' and emitted into 'out'.
95// If a valid escape sequence does not appear as a prefix of 'in', (-1, -1) to signal the error.
96func unescapeToUTF8(in, out []byte) (inLen int, outLen int) {
97 if len(in) < 2 || in[0] != '\\' {
98 // Invalid escape due to insufficient characters for any escape or no initial backslash
99 return -1, -1
100 }
101
102 // https://tools.ietf.org/html/rfc7159#section-7
103 switch e := in[1]; e {
104 case '"', '\\', '/', 'b', 'f', 'n', 'r', 't':
105 // Valid basic 2-character escapes (use lookup table)
106 out[0] = backslashCharEscapeTable[e]
107 return 2, 1
108 case 'u':
109 // Unicode escape
110 if r, inLen := decodeUnicodeEscape(in); inLen == -1 {
111 // Invalid Unicode escape
112 return -1, -1
113 } else {
114 // Valid Unicode escape; re-encode as UTF8
115 outLen := utf8.EncodeRune(out, r)
116 return inLen, outLen
117 }
118 }
119
120 return -1, -1
121}
122
123// unescape unescapes the string contained in 'in' and returns it as a slice.
124// If 'in' contains no escaped characters:
125// Returns 'in'.
126// Else, if 'out' is of sufficient capacity (guaranteed if cap(out) >= len(in)):
127// 'out' is used to build the unescaped string and is returned with no extra allocation
128// Else:
129// A new slice is allocated and returned.
130func Unescape(in, out []byte) ([]byte, error) {
131 firstBackslash := bytes.IndexByte(in, '\\')
132 if firstBackslash == -1 {
133 return in, nil
134 }
135
136 // Get a buffer of sufficient size (allocate if needed)
137 if cap(out) < len(in) {
138 out = make([]byte, len(in))
139 } else {
140 out = out[0:len(in)]
141 }
142
143 // Copy the first sequence of unescaped bytes to the output and obtain a buffer pointer (subslice)
144 copy(out, in[:firstBackslash])
145 in = in[firstBackslash:]
146 buf := out[firstBackslash:]
147
148 for len(in) > 0 {
149 // Unescape the next escaped character
150 inLen, bufLen := unescapeToUTF8(in, buf)
151 if inLen == -1 {
152 return nil, MalformedStringEscapeError
153 }
154
155 in = in[inLen:]
156 buf = buf[bufLen:]
157
158 // Copy everything up until the next backslash
159 nextBackslash := bytes.IndexByte(in, '\\')
160 if nextBackslash == -1 {
161 copy(buf, in)
162 buf = buf[len(in):]
163 break
164 } else {
165 copy(buf, in[:nextBackslash])
166 buf = buf[nextBackslash:]
167 in = in[nextBackslash:]
168 }
169 }
170
171 // Trim the out buffer to the amount that was actually emitted
172 return out[:len(out)-len(buf)], nil
173}