1package lexer
2
3import (
4 "bytes"
5 "fmt"
6 "regexp"
7 "strings"
8 "unicode/utf8"
9
10 "github.com/graphql-go/graphql/gqlerrors"
11 "github.com/graphql-go/graphql/language/source"
12)
13
14const (
15 EOF = iota + 1
16 BANG
17 DOLLAR
18 PAREN_L
19 PAREN_R
20 SPREAD
21 COLON
22 EQUALS
23 AT
24 BRACKET_L
25 BRACKET_R
26 BRACE_L
27 PIPE
28 BRACE_R
29 NAME
30 INT
31 FLOAT
32 STRING
33 BLOCK_STRING
34)
35
36var TokenKind map[int]int
37var tokenDescription map[int]string
38
39func init() {
40 TokenKind = make(map[int]int)
41 tokenDescription = make(map[int]string)
42 TokenKind[EOF] = EOF
43 TokenKind[BANG] = BANG
44 TokenKind[DOLLAR] = DOLLAR
45 TokenKind[PAREN_L] = PAREN_L
46 TokenKind[PAREN_R] = PAREN_R
47 TokenKind[SPREAD] = SPREAD
48 TokenKind[COLON] = COLON
49 TokenKind[EQUALS] = EQUALS
50 TokenKind[AT] = AT
51 TokenKind[BRACKET_L] = BRACKET_L
52 TokenKind[BRACKET_R] = BRACKET_R
53 TokenKind[BRACE_L] = BRACE_L
54 TokenKind[PIPE] = PIPE
55 TokenKind[BRACE_R] = BRACE_R
56 TokenKind[NAME] = NAME
57 TokenKind[INT] = INT
58 TokenKind[FLOAT] = FLOAT
59 TokenKind[STRING] = STRING
60 TokenKind[BLOCK_STRING] = BLOCK_STRING
61 tokenDescription[TokenKind[EOF]] = "EOF"
62 tokenDescription[TokenKind[BANG]] = "!"
63 tokenDescription[TokenKind[DOLLAR]] = "$"
64 tokenDescription[TokenKind[PAREN_L]] = "("
65 tokenDescription[TokenKind[PAREN_R]] = ")"
66 tokenDescription[TokenKind[SPREAD]] = "..."
67 tokenDescription[TokenKind[COLON]] = ":"
68 tokenDescription[TokenKind[EQUALS]] = "="
69 tokenDescription[TokenKind[AT]] = "@"
70 tokenDescription[TokenKind[BRACKET_L]] = "["
71 tokenDescription[TokenKind[BRACKET_R]] = "]"
72 tokenDescription[TokenKind[BRACE_L]] = "{"
73 tokenDescription[TokenKind[PIPE]] = "|"
74 tokenDescription[TokenKind[BRACE_R]] = "}"
75 tokenDescription[TokenKind[NAME]] = "Name"
76 tokenDescription[TokenKind[INT]] = "Int"
77 tokenDescription[TokenKind[FLOAT]] = "Float"
78 tokenDescription[TokenKind[STRING]] = "String"
79 tokenDescription[TokenKind[BLOCK_STRING]] = "BlockString"
80}
81
82// Token is a representation of a lexed Token. Value only appears for non-punctuation
83// tokens: NAME, INT, FLOAT, and STRING.
84type Token struct {
85 Kind int
86 Start int
87 End int
88 Value string
89}
90
91type Lexer func(resetPosition int) (Token, error)
92
93func Lex(s *source.Source) Lexer {
94 var prevPosition int
95 return func(resetPosition int) (Token, error) {
96 if resetPosition == 0 {
97 resetPosition = prevPosition
98 }
99 token, err := readToken(s, resetPosition)
100 if err != nil {
101 return token, err
102 }
103 prevPosition = token.End
104 return token, nil
105 }
106}
107
108// Reads an alphanumeric + underscore name from the source.
109// [_A-Za-z][_0-9A-Za-z]*
110// position: Points to the byte position in the byte array
111// runePosition: Points to the rune position in the byte array
112func readName(source *source.Source, position, runePosition int) Token {
113 body := source.Body
114 bodyLength := len(body)
115 endByte := position + 1
116 endRune := runePosition + 1
117 for {
118 code, _ := runeAt(body, endByte)
119 if (endByte != bodyLength) &&
120 (code == '_' || // _
121 code >= '0' && code <= '9' || // 0-9
122 code >= 'A' && code <= 'Z' || // A-Z
123 code >= 'a' && code <= 'z') { // a-z
124 endByte++
125 endRune++
126 continue
127 } else {
128 break
129 }
130 }
131 return makeToken(TokenKind[NAME], runePosition, endRune, string(body[position:endByte]))
132}
133
134// Reads a number token from the source file, either a float
135// or an int depending on whether a decimal point appears.
136// Int: -?(0|[1-9][0-9]*)
137// Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
138func readNumber(s *source.Source, start int, firstCode rune, codeLength int) (Token, error) {
139 code := firstCode
140 body := s.Body
141 position := start
142 isFloat := false
143 if code == '-' { // -
144 position += codeLength
145 code, codeLength = runeAt(body, position)
146 }
147 if code == '0' { // 0
148 position += codeLength
149 code, codeLength = runeAt(body, position)
150 if code >= '0' && code <= '9' {
151 description := fmt.Sprintf("Invalid number, unexpected digit after 0: %v.", printCharCode(code))
152 return Token{}, gqlerrors.NewSyntaxError(s, position, description)
153 }
154 } else {
155 p, err := readDigits(s, position, code, codeLength)
156 if err != nil {
157 return Token{}, err
158 }
159 position = p
160 code, codeLength = runeAt(body, position)
161 }
162 if code == '.' { // .
163 isFloat = true
164 position += codeLength
165 code, codeLength = runeAt(body, position)
166 p, err := readDigits(s, position, code, codeLength)
167 if err != nil {
168 return Token{}, err
169 }
170 position = p
171 code, codeLength = runeAt(body, position)
172 }
173 if code == 'E' || code == 'e' { // E e
174 isFloat = true
175 position += codeLength
176 code, codeLength = runeAt(body, position)
177 if code == '+' || code == '-' { // + -
178 position += codeLength
179 code, codeLength = runeAt(body, position)
180 }
181 p, err := readDigits(s, position, code, codeLength)
182 if err != nil {
183 return Token{}, err
184 }
185 position = p
186 }
187 kind := TokenKind[INT]
188 if isFloat {
189 kind = TokenKind[FLOAT]
190 }
191
192 return makeToken(kind, start, position, string(body[start:position])), nil
193}
194
195// Returns the new position in the source after reading digits.
196func readDigits(s *source.Source, start int, firstCode rune, codeLength int) (int, error) {
197 body := s.Body
198 position := start
199 code := firstCode
200 if code >= '0' && code <= '9' { // 0 - 9
201 for {
202 if code >= '0' && code <= '9' { // 0 - 9
203 position += codeLength
204 code, codeLength = runeAt(body, position)
205 continue
206 } else {
207 break
208 }
209 }
210 return position, nil
211 }
212 var description string
213 description = fmt.Sprintf("Invalid number, expected digit but got: %v.", printCharCode(code))
214 return position, gqlerrors.NewSyntaxError(s, position, description)
215}
216
217func readString(s *source.Source, start int) (Token, error) {
218 body := s.Body
219 position := start + 1
220 runePosition := start + 1
221 chunkStart := position
222 var code rune
223 var n int
224 var valueBuffer bytes.Buffer
225 for {
226 code, n = runeAt(body, position)
227 if position < len(body) &&
228 // not LineTerminator
229 code != 0x000A && code != 0x000D &&
230 // not Quote (")
231 code != '"' {
232
233 // SourceCharacter
234 if code < 0x0020 && code != 0x0009 {
235 return Token{}, gqlerrors.NewSyntaxError(s, runePosition, fmt.Sprintf(`Invalid character within String: %v.`, printCharCode(code)))
236 }
237 position += n
238 runePosition++
239 if code == '\\' { // \
240 valueBuffer.Write(body[chunkStart : position-1])
241 code, n = runeAt(body, position)
242 switch code {
243 case '"':
244 valueBuffer.WriteRune('"')
245 break
246 case '/':
247 valueBuffer.WriteRune('/')
248 break
249 case '\\':
250 valueBuffer.WriteRune('\\')
251 break
252 case 'b':
253 valueBuffer.WriteRune('\b')
254 break
255 case 'f':
256 valueBuffer.WriteRune('\f')
257 break
258 case 'n':
259 valueBuffer.WriteRune('\n')
260 break
261 case 'r':
262 valueBuffer.WriteRune('\r')
263 break
264 case 't':
265 valueBuffer.WriteRune('\t')
266 break
267 case 'u':
268 // Check if there are at least 4 bytes available
269 if len(body) <= position+4 {
270 return Token{}, gqlerrors.NewSyntaxError(s, runePosition,
271 fmt.Sprintf("Invalid character escape sequence: "+
272 "\\u%v", string(body[position+1:])))
273 }
274 charCode := uniCharCode(
275 rune(body[position+1]),
276 rune(body[position+2]),
277 rune(body[position+3]),
278 rune(body[position+4]),
279 )
280 if charCode < 0 {
281 return Token{}, gqlerrors.NewSyntaxError(s, runePosition,
282 fmt.Sprintf("Invalid character escape sequence: "+
283 "\\u%v", string(body[position+1:position+5])))
284 }
285 valueBuffer.WriteRune(charCode)
286 position += 4
287 runePosition += 4
288 break
289 default:
290 return Token{}, gqlerrors.NewSyntaxError(s, runePosition,
291 fmt.Sprintf(`Invalid character escape sequence: \\%c.`, code))
292 }
293 position += n
294 runePosition++
295 chunkStart = position
296 }
297 continue
298 } else {
299 break
300 }
301 }
302 if code != '"' { // quote (")
303 return Token{}, gqlerrors.NewSyntaxError(s, runePosition, "Unterminated string.")
304 }
305 stringContent := body[chunkStart:position]
306 valueBuffer.Write(stringContent)
307 value := valueBuffer.String()
308 return makeToken(TokenKind[STRING], start, position+1, value), nil
309}
310
311// readBlockString reads a block string token from the source file.
312//
313// """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
314func readBlockString(s *source.Source, start int) (Token, error) {
315 body := s.Body
316 position := start + 3
317 runePosition := start + 3
318 chunkStart := position
319 var valueBuffer bytes.Buffer
320
321 for {
322 // Stop if we've reached the end of the buffer
323 if position >= len(body) {
324 break
325 }
326
327 code, n := runeAt(body, position)
328
329 // Closing Triple-Quote (""")
330 if code == '"' {
331 x, _ := runeAt(body, position+1)
332 y, _ := runeAt(body, position+2)
333 if x == '"' && y == '"' {
334 stringContent := body[chunkStart:position]
335 valueBuffer.Write(stringContent)
336 value := blockStringValue(valueBuffer.String())
337 return makeToken(TokenKind[BLOCK_STRING], start, position+3, value), nil
338 }
339 }
340
341 // SourceCharacter
342 if code < 0x0020 &&
343 code != 0x0009 &&
344 code != 0x000a &&
345 code != 0x000d {
346 return Token{}, gqlerrors.NewSyntaxError(s, runePosition, fmt.Sprintf(`Invalid character within String: %v.`, printCharCode(code)))
347 }
348
349 // Escape Triple-Quote (\""")
350 if code == '\\' { // \
351 x, _ := runeAt(body, position+1)
352 y, _ := runeAt(body, position+2)
353 z, _ := runeAt(body, position+3)
354 if x == '"' && y == '"' && z == '"' {
355 stringContent := append(body[chunkStart:position], []byte(`"""`)...)
356 valueBuffer.Write(stringContent)
357 position += 4 // account for `"""` characters
358 runePosition += 4 // " " " "
359 chunkStart = position
360 continue
361 }
362 }
363
364 position += n
365 runePosition++
366 }
367
368 return Token{}, gqlerrors.NewSyntaxError(s, runePosition, "Unterminated string.")
369}
370
371var splitLinesRegex = regexp.MustCompile("\r\n|[\n\r]")
372
373// This implements the GraphQL spec's BlockStringValue() static algorithm.
374//
375// Produces the value of a block string from its parsed raw value, similar to
376// Coffeescript's block string, Python's docstring trim or Ruby's strip_heredoc.
377//
378// Spec: http://facebook.github.io/graphql/draft/#BlockStringValue()
379// Heavily borrows from: https://github.com/graphql/graphql-js/blob/8e0c599ceccfa8c40d6edf3b72ee2a71490b10e0/src/language/blockStringValue.js
380func blockStringValue(in string) string {
381 // Expand a block string's raw value into independent lines.
382 lines := splitLinesRegex.Split(in, -1)
383
384 // Remove common indentation from all lines but first
385 commonIndent := -1
386 for i := 1; i < len(lines); i++ {
387 line := lines[i]
388 indent := leadingWhitespaceLen(line)
389 if indent < len(line) && (commonIndent == -1 || indent < commonIndent) {
390 commonIndent = indent
391 if commonIndent == 0 {
392 break
393 }
394 }
395 }
396 if commonIndent > 0 {
397 for i, line := range lines {
398 if commonIndent > len(line) {
399 continue
400 }
401 lines[i] = line[commonIndent:]
402 }
403 }
404
405 // Remove leading blank lines.
406 for {
407 if isBlank := lineIsBlank(lines[0]); !isBlank {
408 break
409 }
410 lines = lines[1:]
411 }
412
413 // Remove trailing blank lines.
414 for {
415 i := len(lines) - 1
416 if isBlank := lineIsBlank(lines[i]); !isBlank {
417 break
418 }
419 lines = append(lines[:i], lines[i+1:]...)
420 }
421
422 // Return a string of the lines joined with U+000A.
423 return strings.Join(lines, "\n")
424}
425
426// leadingWhitespaceLen returns count of whitespace characters on given line.
427func leadingWhitespaceLen(in string) (n int) {
428 for _, ch := range in {
429 if ch == ' ' || ch == '\t' {
430 n++
431 } else {
432 break
433 }
434 }
435 return
436}
437
438// lineIsBlank returns true when given line has no content.
439func lineIsBlank(in string) bool {
440 return leadingWhitespaceLen(in) == len(in)
441}
442
443// Converts four hexidecimal chars to the integer that the
444// string represents. For example, uniCharCode('0','0','0','f')
445// will return 15, and uniCharCode('0','0','f','f') returns 255.
446// Returns a negative number on error, if a char was invalid.
447// This is implemented by noting that char2hex() returns -1 on error,
448// which means the result of ORing the char2hex() will also be negative.
449func uniCharCode(a, b, c, d rune) rune {
450 return rune(char2hex(a)<<12 | char2hex(b)<<8 | char2hex(c)<<4 | char2hex(d))
451}
452
453// Converts a hex character to its integer value.
454// '0' becomes 0, '9' becomes 9
455// 'A' becomes 10, 'F' becomes 15
456// 'a' becomes 10, 'f' becomes 15
457// Returns -1 on error.
458func char2hex(a rune) int {
459 if a >= 48 && a <= 57 { // 0-9
460 return int(a) - 48
461 } else if a >= 65 && a <= 70 { // A-F
462 return int(a) - 55
463 } else if a >= 97 && a <= 102 {
464 // a-f
465 return int(a) - 87
466 }
467 return -1
468}
469
470func makeToken(kind int, start int, end int, value string) Token {
471 return Token{Kind: kind, Start: start, End: end, Value: value}
472}
473
474func printCharCode(code rune) string {
475 // NaN/undefined represents access beyond the end of the file.
476 if code < 0 {
477 return "<EOF>"
478 }
479 // print as ASCII for printable range
480 if code >= 0x0020 && code < 0x007F {
481 return fmt.Sprintf(`"%c"`, code)
482 }
483 // Otherwise print the escaped form. e.g. `"\\u0007"`
484 return fmt.Sprintf(`"\\u%04X"`, code)
485}
486
487func readToken(s *source.Source, fromPosition int) (Token, error) {
488 body := s.Body
489 bodyLength := len(body)
490 position, runePosition := positionAfterWhitespace(body, fromPosition)
491 if position >= bodyLength {
492 return makeToken(TokenKind[EOF], position, position, ""), nil
493 }
494 code, codeLength := runeAt(body, position)
495
496 // SourceCharacter
497 if code < 0x0020 && code != 0x0009 && code != 0x000A && code != 0x000D {
498 return Token{}, gqlerrors.NewSyntaxError(s, runePosition, fmt.Sprintf(`Invalid character %v`, printCharCode(code)))
499 }
500
501 switch code {
502 // !
503 case '!':
504 return makeToken(TokenKind[BANG], position, position+1, ""), nil
505 // $
506 case '$':
507 return makeToken(TokenKind[DOLLAR], position, position+1, ""), nil
508 // (
509 case '(':
510 return makeToken(TokenKind[PAREN_L], position, position+1, ""), nil
511 // )
512 case ')':
513 return makeToken(TokenKind[PAREN_R], position, position+1, ""), nil
514 // .
515 case '.':
516 next1, _ := runeAt(body, position+1)
517 next2, _ := runeAt(body, position+2)
518 if next1 == '.' && next2 == '.' {
519 return makeToken(TokenKind[SPREAD], position, position+3, ""), nil
520 }
521 break
522 // :
523 case ':':
524 return makeToken(TokenKind[COLON], position, position+1, ""), nil
525 // =
526 case '=':
527 return makeToken(TokenKind[EQUALS], position, position+1, ""), nil
528 // @
529 case '@':
530 return makeToken(TokenKind[AT], position, position+1, ""), nil
531 // [
532 case '[':
533 return makeToken(TokenKind[BRACKET_L], position, position+1, ""), nil
534 // ]
535 case ']':
536 return makeToken(TokenKind[BRACKET_R], position, position+1, ""), nil
537 // {
538 case '{':
539 return makeToken(TokenKind[BRACE_L], position, position+1, ""), nil
540 // |
541 case '|':
542 return makeToken(TokenKind[PIPE], position, position+1, ""), nil
543 // }
544 case '}':
545 return makeToken(TokenKind[BRACE_R], position, position+1, ""), nil
546 // A-Z
547 case 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
548 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z':
549 return readName(s, position, runePosition), nil
550 // _
551 // a-z
552 case '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
553 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z':
554 return readName(s, position, runePosition), nil
555 // -
556 // 0-9
557 case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
558 token, err := readNumber(s, position, code, codeLength)
559 if err != nil {
560 return token, err
561 }
562 return token, nil
563 // "
564 case '"':
565 var token Token
566 var err error
567 x, _ := runeAt(body, position+1)
568 y, _ := runeAt(body, position+2)
569 if x == '"' && y == '"' {
570 token, err = readBlockString(s, position)
571 } else {
572 token, err = readString(s, position)
573 }
574 return token, err
575 }
576 description := fmt.Sprintf("Unexpected character %v.", printCharCode(code))
577 return Token{}, gqlerrors.NewSyntaxError(s, runePosition, description)
578}
579
580// Gets the rune from the byte array at given byte position and it's width in bytes
581func runeAt(body []byte, position int) (code rune, charWidth int) {
582 if len(body) <= position {
583 // <EOF>
584 return -1, utf8.RuneError
585 }
586
587 c := body[position]
588 if c < utf8.RuneSelf {
589 return rune(c), 1
590 }
591
592 r, n := utf8.DecodeRune(body[position:])
593 return r, n
594}
595
596// Reads from body starting at startPosition until it finds a non-whitespace
597// or commented character, then returns the position of that character for lexing.
598// lexing.
599// Returns both byte positions and rune position
600func positionAfterWhitespace(body []byte, startPosition int) (position int, runePosition int) {
601 bodyLength := len(body)
602 position = startPosition
603 runePosition = startPosition
604 for {
605 if position < bodyLength {
606 code, n := runeAt(body, position)
607
608 // Skip Ignored
609 if code == 0xFEFF || // BOM
610 // White Space
611 code == 0x0009 || // tab
612 code == 0x0020 || // space
613 // Line Terminator
614 code == 0x000A || // new line
615 code == 0x000D || // carriage return
616 // Comma
617 code == 0x002C {
618 position += n
619 runePosition++
620 } else if code == 35 { // #
621 position += n
622 runePosition++
623 for {
624 code, n := runeAt(body, position)
625 if position < bodyLength &&
626 code != 0 &&
627 // SourceCharacter but not LineTerminator
628 (code > 0x001F || code == 0x0009) && code != 0x000A && code != 0x000D {
629 position += n
630 runePosition++
631 continue
632 } else {
633 break
634 }
635 }
636 } else {
637 break
638 }
639 continue
640 } else {
641 break
642 }
643 }
644 return position, runePosition
645}
646
647func GetTokenDesc(token Token) string {
648 if token.Value == "" {
649 return GetTokenKindDesc(token.Kind)
650 }
651 return fmt.Sprintf("%s \"%s\"", GetTokenKindDesc(token.Kind), token.Value)
652}
653
654func GetTokenKindDesc(kind int) string {
655 return tokenDescription[kind]
656}