1// Copyright 2010 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package json
6
7// JSON value parser state machine.
8// Just about at the limit of what is reasonable to write by hand.
9// Some parts are a bit tedious, but overall it nicely factors out the
10// otherwise common code from the multiple scanning functions
11// in this package (Compact, Indent, checkValid, etc).
12//
13// This file starts with two simple examples using the scanner
14// before diving into the scanner itself.
15
16import (
17 "strconv"
18 "sync"
19)
20
21// Valid reports whether data is a valid JSON encoding.
22func Valid(data []byte) bool {
23 scan := newScanner()
24 defer freeScanner(scan)
25 return checkValid(data, scan) == nil
26}
27
28// checkValid verifies that data is valid JSON-encoded data.
29// scan is passed in for use by checkValid to avoid an allocation.
30// checkValid returns nil or a SyntaxError.
31func checkValid(data []byte, scan *scanner) error {
32 scan.reset()
33 for _, c := range data {
34 scan.bytes++
35 if scan.step(scan, c) == scanError {
36 return scan.err
37 }
38 }
39 if scan.eof() == scanError {
40 return scan.err
41 }
42 return nil
43}
44
45// A SyntaxError is a description of a JSON syntax error.
46// [Unmarshal] will return a SyntaxError if the JSON can't be parsed.
47type SyntaxError struct {
48 msg string // description of error
49 Offset int64 // error occurred after reading Offset bytes
50}
51
52func (e *SyntaxError) Error() string { return e.msg }
53
54// A scanner is a JSON scanning state machine.
55// Callers call scan.reset and then pass bytes in one at a time
56// by calling scan.step(&scan, c) for each byte.
57// The return value, referred to as an opcode, tells the
58// caller about significant parsing events like beginning
59// and ending literals, objects, and arrays, so that the
60// caller can follow along if it wishes.
61// The return value scanEnd indicates that a single top-level
62// JSON value has been completed, *before* the byte that
63// just got passed in. (The indication must be delayed in order
64// to recognize the end of numbers: is 123 a whole value or
65// the beginning of 12345e+6?).
66type scanner struct {
67 // The step is a func to be called to execute the next transition.
68 // Also tried using an integer constant and a single func
69 // with a switch, but using the func directly was 10% faster
70 // on a 64-bit Mac Mini, and it's nicer to read.
71 step func(*scanner, byte) int
72
73 // Reached end of top-level value.
74 endTop bool
75
76 // Stack of what we're in the middle of - array values, object keys, object values.
77 parseState []int
78
79 // Error that happened, if any.
80 err error
81
82 // total bytes consumed, updated by decoder.Decode (and deliberately
83 // not set to zero by scan.reset)
84 bytes int64
85}
86
87var scannerPool = sync.Pool{
88 New: func() any {
89 return &scanner{}
90 },
91}
92
93func newScanner() *scanner {
94 scan := scannerPool.Get().(*scanner)
95 // scan.reset by design doesn't set bytes to zero
96 scan.bytes = 0
97 scan.reset()
98 return scan
99}
100
101func freeScanner(scan *scanner) {
102 // Avoid hanging on to too much memory in extreme cases.
103 if len(scan.parseState) > 1024 {
104 scan.parseState = nil
105 }
106 scannerPool.Put(scan)
107}
108
109// These values are returned by the state transition functions
110// assigned to scanner.state and the method scanner.eof.
111// They give details about the current state of the scan that
112// callers might be interested to know about.
113// It is okay to ignore the return value of any particular
114// call to scanner.state: if one call returns scanError,
115// every subsequent call will return scanError too.
116const (
117 // Continue.
118 scanContinue = iota // uninteresting byte
119 scanBeginLiteral // end implied by next result != scanContinue
120 scanBeginObject // begin object
121 scanObjectKey // just finished object key (string)
122 scanObjectValue // just finished non-last object value
123 scanEndObject // end object (implies scanObjectValue if possible)
124 scanBeginArray // begin array
125 scanArrayValue // just finished array value
126 scanEndArray // end array (implies scanArrayValue if possible)
127 scanSkipSpace // space byte; can skip; known to be last "continue" result
128
129 // Stop.
130 scanEnd // top-level value ended *before* this byte; known to be first "stop" result
131 scanError // hit an error, scanner.err.
132)
133
134// These values are stored in the parseState stack.
135// They give the current state of a composite value
136// being scanned. If the parser is inside a nested value
137// the parseState describes the nested state, outermost at entry 0.
138const (
139 parseObjectKey = iota // parsing object key (before colon)
140 parseObjectValue // parsing object value (after colon)
141 parseArrayValue // parsing array value
142)
143
144// This limits the max nesting depth to prevent stack overflow.
145// This is permitted by https://tools.ietf.org/html/rfc7159#section-9
146const maxNestingDepth = 10000
147
148// reset prepares the scanner for use.
149// It must be called before calling s.step.
150func (s *scanner) reset() {
151 s.step = stateBeginValue
152 s.parseState = s.parseState[0:0]
153 s.err = nil
154 s.endTop = false
155}
156
157// eof tells the scanner that the end of input has been reached.
158// It returns a scan status just as s.step does.
159func (s *scanner) eof() int {
160 if s.err != nil {
161 return scanError
162 }
163 if s.endTop {
164 return scanEnd
165 }
166 s.step(s, ' ')
167 if s.endTop {
168 return scanEnd
169 }
170 if s.err == nil {
171 s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
172 }
173 return scanError
174}
175
176// pushParseState pushes a new parse state p onto the parse stack.
177// an error state is returned if maxNestingDepth was exceeded, otherwise successState is returned.
178func (s *scanner) pushParseState(c byte, newParseState int, successState int) int {
179 s.parseState = append(s.parseState, newParseState)
180 if len(s.parseState) <= maxNestingDepth {
181 return successState
182 }
183 return s.error(c, "exceeded max depth")
184}
185
186// popParseState pops a parse state (already obtained) off the stack
187// and updates s.step accordingly.
188func (s *scanner) popParseState() {
189 n := len(s.parseState) - 1
190 s.parseState = s.parseState[0:n]
191 if n == 0 {
192 s.step = stateEndTop
193 s.endTop = true
194 } else {
195 s.step = stateEndValue
196 }
197}
198
199func isSpace(c byte) bool {
200 return c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n')
201}
202
203// stateBeginValueOrEmpty is the state after reading `[`.
204func stateBeginValueOrEmpty(s *scanner, c byte) int {
205 if isSpace(c) {
206 return scanSkipSpace
207 }
208 if c == ']' {
209 return stateEndValue(s, c)
210 }
211 return stateBeginValue(s, c)
212}
213
214// stateBeginValue is the state at the beginning of the input.
215func stateBeginValue(s *scanner, c byte) int {
216 if isSpace(c) {
217 return scanSkipSpace
218 }
219 switch c {
220 case '{':
221 s.step = stateBeginStringOrEmpty
222 return s.pushParseState(c, parseObjectKey, scanBeginObject)
223 case '[':
224 s.step = stateBeginValueOrEmpty
225 return s.pushParseState(c, parseArrayValue, scanBeginArray)
226 case '"':
227 s.step = stateInString
228 return scanBeginLiteral
229 case '-':
230 s.step = stateNeg
231 return scanBeginLiteral
232 case '0': // beginning of 0.123
233 s.step = state0
234 return scanBeginLiteral
235 case 't': // beginning of true
236 s.step = stateT
237 return scanBeginLiteral
238 case 'f': // beginning of false
239 s.step = stateF
240 return scanBeginLiteral
241 case 'n': // beginning of null
242 s.step = stateN
243 return scanBeginLiteral
244 }
245 if '1' <= c && c <= '9' { // beginning of 1234.5
246 s.step = state1
247 return scanBeginLiteral
248 }
249 return s.error(c, "looking for beginning of value")
250}
251
252// stateBeginStringOrEmpty is the state after reading `{`.
253func stateBeginStringOrEmpty(s *scanner, c byte) int {
254 if isSpace(c) {
255 return scanSkipSpace
256 }
257 if c == '}' {
258 n := len(s.parseState)
259 s.parseState[n-1] = parseObjectValue
260 return stateEndValue(s, c)
261 }
262 return stateBeginString(s, c)
263}
264
265// stateBeginString is the state after reading `{"key": value,`.
266func stateBeginString(s *scanner, c byte) int {
267 if isSpace(c) {
268 return scanSkipSpace
269 }
270 if c == '"' {
271 s.step = stateInString
272 return scanBeginLiteral
273 }
274 return s.error(c, "looking for beginning of object key string")
275}
276
277// stateEndValue is the state after completing a value,
278// such as after reading `{}` or `true` or `["x"`.
279func stateEndValue(s *scanner, c byte) int {
280 n := len(s.parseState)
281 if n == 0 {
282 // Completed top-level before the current byte.
283 s.step = stateEndTop
284 s.endTop = true
285 return stateEndTop(s, c)
286 }
287 if isSpace(c) {
288 s.step = stateEndValue
289 return scanSkipSpace
290 }
291 ps := s.parseState[n-1]
292 switch ps {
293 case parseObjectKey:
294 if c == ':' {
295 s.parseState[n-1] = parseObjectValue
296 s.step = stateBeginValue
297 return scanObjectKey
298 }
299 return s.error(c, "after object key")
300 case parseObjectValue:
301 if c == ',' {
302 s.parseState[n-1] = parseObjectKey
303 s.step = stateBeginString
304 return scanObjectValue
305 }
306 if c == '}' {
307 s.popParseState()
308 return scanEndObject
309 }
310 return s.error(c, "after object key:value pair")
311 case parseArrayValue:
312 if c == ',' {
313 s.step = stateBeginValue
314 return scanArrayValue
315 }
316 if c == ']' {
317 s.popParseState()
318 return scanEndArray
319 }
320 return s.error(c, "after array element")
321 }
322 return s.error(c, "")
323}
324
325// stateEndTop is the state after finishing the top-level value,
326// such as after reading `{}` or `[1,2,3]`.
327// Only space characters should be seen now.
328func stateEndTop(s *scanner, c byte) int {
329 if !isSpace(c) {
330 // Complain about non-space byte on next call.
331 s.error(c, "after top-level value")
332 }
333 return scanEnd
334}
335
336// stateInString is the state after reading `"`.
337func stateInString(s *scanner, c byte) int {
338 if c == '"' {
339 s.step = stateEndValue
340 return scanContinue
341 }
342 if c == '\\' {
343 s.step = stateInStringEsc
344 return scanContinue
345 }
346 if c < 0x20 {
347 return s.error(c, "in string literal")
348 }
349 return scanContinue
350}
351
352// stateInStringEsc is the state after reading `"\` during a quoted string.
353func stateInStringEsc(s *scanner, c byte) int {
354 switch c {
355 case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
356 s.step = stateInString
357 return scanContinue
358 case 'u':
359 s.step = stateInStringEscU
360 return scanContinue
361 }
362 return s.error(c, "in string escape code")
363}
364
365// stateInStringEscU is the state after reading `"\u` during a quoted string.
366func stateInStringEscU(s *scanner, c byte) int {
367 if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
368 s.step = stateInStringEscU1
369 return scanContinue
370 }
371 // numbers
372 return s.error(c, "in \\u hexadecimal character escape")
373}
374
375// stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
376func stateInStringEscU1(s *scanner, c byte) int {
377 if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
378 s.step = stateInStringEscU12
379 return scanContinue
380 }
381 // numbers
382 return s.error(c, "in \\u hexadecimal character escape")
383}
384
385// stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
386func stateInStringEscU12(s *scanner, c byte) int {
387 if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
388 s.step = stateInStringEscU123
389 return scanContinue
390 }
391 // numbers
392 return s.error(c, "in \\u hexadecimal character escape")
393}
394
395// stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
396func stateInStringEscU123(s *scanner, c byte) int {
397 if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
398 s.step = stateInString
399 return scanContinue
400 }
401 // numbers
402 return s.error(c, "in \\u hexadecimal character escape")
403}
404
405// stateNeg is the state after reading `-` during a number.
406func stateNeg(s *scanner, c byte) int {
407 if c == '0' {
408 s.step = state0
409 return scanContinue
410 }
411 if '1' <= c && c <= '9' {
412 s.step = state1
413 return scanContinue
414 }
415 return s.error(c, "in numeric literal")
416}
417
418// state1 is the state after reading a non-zero integer during a number,
419// such as after reading `1` or `100` but not `0`.
420func state1(s *scanner, c byte) int {
421 if '0' <= c && c <= '9' {
422 s.step = state1
423 return scanContinue
424 }
425 return state0(s, c)
426}
427
428// state0 is the state after reading `0` during a number.
429func state0(s *scanner, c byte) int {
430 if c == '.' {
431 s.step = stateDot
432 return scanContinue
433 }
434 if c == 'e' || c == 'E' {
435 s.step = stateE
436 return scanContinue
437 }
438 return stateEndValue(s, c)
439}
440
441// stateDot is the state after reading the integer and decimal point in a number,
442// such as after reading `1.`.
443func stateDot(s *scanner, c byte) int {
444 if '0' <= c && c <= '9' {
445 s.step = stateDot0
446 return scanContinue
447 }
448 return s.error(c, "after decimal point in numeric literal")
449}
450
451// stateDot0 is the state after reading the integer, decimal point, and subsequent
452// digits of a number, such as after reading `3.14`.
453func stateDot0(s *scanner, c byte) int {
454 if '0' <= c && c <= '9' {
455 return scanContinue
456 }
457 if c == 'e' || c == 'E' {
458 s.step = stateE
459 return scanContinue
460 }
461 return stateEndValue(s, c)
462}
463
464// stateE is the state after reading the mantissa and e in a number,
465// such as after reading `314e` or `0.314e`.
466func stateE(s *scanner, c byte) int {
467 if c == '+' || c == '-' {
468 s.step = stateESign
469 return scanContinue
470 }
471 return stateESign(s, c)
472}
473
474// stateESign is the state after reading the mantissa, e, and sign in a number,
475// such as after reading `314e-` or `0.314e+`.
476func stateESign(s *scanner, c byte) int {
477 if '0' <= c && c <= '9' {
478 s.step = stateE0
479 return scanContinue
480 }
481 return s.error(c, "in exponent of numeric literal")
482}
483
484// stateE0 is the state after reading the mantissa, e, optional sign,
485// and at least one digit of the exponent in a number,
486// such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
487func stateE0(s *scanner, c byte) int {
488 if '0' <= c && c <= '9' {
489 return scanContinue
490 }
491 return stateEndValue(s, c)
492}
493
494// stateT is the state after reading `t`.
495func stateT(s *scanner, c byte) int {
496 if c == 'r' {
497 s.step = stateTr
498 return scanContinue
499 }
500 return s.error(c, "in literal true (expecting 'r')")
501}
502
503// stateTr is the state after reading `tr`.
504func stateTr(s *scanner, c byte) int {
505 if c == 'u' {
506 s.step = stateTru
507 return scanContinue
508 }
509 return s.error(c, "in literal true (expecting 'u')")
510}
511
512// stateTru is the state after reading `tru`.
513func stateTru(s *scanner, c byte) int {
514 if c == 'e' {
515 s.step = stateEndValue
516 return scanContinue
517 }
518 return s.error(c, "in literal true (expecting 'e')")
519}
520
521// stateF is the state after reading `f`.
522func stateF(s *scanner, c byte) int {
523 if c == 'a' {
524 s.step = stateFa
525 return scanContinue
526 }
527 return s.error(c, "in literal false (expecting 'a')")
528}
529
530// stateFa is the state after reading `fa`.
531func stateFa(s *scanner, c byte) int {
532 if c == 'l' {
533 s.step = stateFal
534 return scanContinue
535 }
536 return s.error(c, "in literal false (expecting 'l')")
537}
538
539// stateFal is the state after reading `fal`.
540func stateFal(s *scanner, c byte) int {
541 if c == 's' {
542 s.step = stateFals
543 return scanContinue
544 }
545 return s.error(c, "in literal false (expecting 's')")
546}
547
548// stateFals is the state after reading `fals`.
549func stateFals(s *scanner, c byte) int {
550 if c == 'e' {
551 s.step = stateEndValue
552 return scanContinue
553 }
554 return s.error(c, "in literal false (expecting 'e')")
555}
556
557// stateN is the state after reading `n`.
558func stateN(s *scanner, c byte) int {
559 if c == 'u' {
560 s.step = stateNu
561 return scanContinue
562 }
563 return s.error(c, "in literal null (expecting 'u')")
564}
565
566// stateNu is the state after reading `nu`.
567func stateNu(s *scanner, c byte) int {
568 if c == 'l' {
569 s.step = stateNul
570 return scanContinue
571 }
572 return s.error(c, "in literal null (expecting 'l')")
573}
574
575// stateNul is the state after reading `nul`.
576func stateNul(s *scanner, c byte) int {
577 if c == 'l' {
578 s.step = stateEndValue
579 return scanContinue
580 }
581 return s.error(c, "in literal null (expecting 'l')")
582}
583
584// stateError is the state after reaching a syntax error,
585// such as after reading `[1}` or `5.1.2`.
586func stateError(s *scanner, c byte) int {
587 return scanError
588}
589
590// error records an error and switches to the error state.
591func (s *scanner) error(c byte, context string) int {
592 s.step = stateError
593 s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
594 return scanError
595}
596
597// quoteChar formats c as a quoted character literal.
598func quoteChar(c byte) string {
599 // special cases - different from quoted strings
600 if c == '\'' {
601 return `'\''`
602 }
603 if c == '"' {
604 return `'"'`
605 }
606
607 // use quoted string with different quotation marks
608 s := strconv.Quote(string(c))
609 return "'" + s[1:len(s)-1] + "'"
610}