lexer.go

  1package lexer
  2
  3import (
  4	"bytes"
  5	"fmt"
  6	"regexp"
  7	"strings"
  8	"unicode/utf8"
  9
 10	"github.com/graphql-go/graphql/gqlerrors"
 11	"github.com/graphql-go/graphql/language/source"
 12)
 13
 14const (
 15	EOF = iota + 1
 16	BANG
 17	DOLLAR
 18	PAREN_L
 19	PAREN_R
 20	SPREAD
 21	COLON
 22	EQUALS
 23	AT
 24	BRACKET_L
 25	BRACKET_R
 26	BRACE_L
 27	PIPE
 28	BRACE_R
 29	NAME
 30	INT
 31	FLOAT
 32	STRING
 33	BLOCK_STRING
 34)
 35
 36var TokenKind map[int]int
 37var tokenDescription map[int]string
 38
 39func init() {
 40	TokenKind = make(map[int]int)
 41	tokenDescription = make(map[int]string)
 42	TokenKind[EOF] = EOF
 43	TokenKind[BANG] = BANG
 44	TokenKind[DOLLAR] = DOLLAR
 45	TokenKind[PAREN_L] = PAREN_L
 46	TokenKind[PAREN_R] = PAREN_R
 47	TokenKind[SPREAD] = SPREAD
 48	TokenKind[COLON] = COLON
 49	TokenKind[EQUALS] = EQUALS
 50	TokenKind[AT] = AT
 51	TokenKind[BRACKET_L] = BRACKET_L
 52	TokenKind[BRACKET_R] = BRACKET_R
 53	TokenKind[BRACE_L] = BRACE_L
 54	TokenKind[PIPE] = PIPE
 55	TokenKind[BRACE_R] = BRACE_R
 56	TokenKind[NAME] = NAME
 57	TokenKind[INT] = INT
 58	TokenKind[FLOAT] = FLOAT
 59	TokenKind[STRING] = STRING
 60	TokenKind[BLOCK_STRING] = BLOCK_STRING
 61	tokenDescription[TokenKind[EOF]] = "EOF"
 62	tokenDescription[TokenKind[BANG]] = "!"
 63	tokenDescription[TokenKind[DOLLAR]] = "$"
 64	tokenDescription[TokenKind[PAREN_L]] = "("
 65	tokenDescription[TokenKind[PAREN_R]] = ")"
 66	tokenDescription[TokenKind[SPREAD]] = "..."
 67	tokenDescription[TokenKind[COLON]] = ":"
 68	tokenDescription[TokenKind[EQUALS]] = "="
 69	tokenDescription[TokenKind[AT]] = "@"
 70	tokenDescription[TokenKind[BRACKET_L]] = "["
 71	tokenDescription[TokenKind[BRACKET_R]] = "]"
 72	tokenDescription[TokenKind[BRACE_L]] = "{"
 73	tokenDescription[TokenKind[PIPE]] = "|"
 74	tokenDescription[TokenKind[BRACE_R]] = "}"
 75	tokenDescription[TokenKind[NAME]] = "Name"
 76	tokenDescription[TokenKind[INT]] = "Int"
 77	tokenDescription[TokenKind[FLOAT]] = "Float"
 78	tokenDescription[TokenKind[STRING]] = "String"
 79	tokenDescription[TokenKind[BLOCK_STRING]] = "BlockString"
 80}
 81
 82// Token is a representation of a lexed Token. Value only appears for non-punctuation
 83// tokens: NAME, INT, FLOAT, and STRING.
 84type Token struct {
 85	Kind  int
 86	Start int
 87	End   int
 88	Value string
 89}
 90
 91type Lexer func(resetPosition int) (Token, error)
 92
 93func Lex(s *source.Source) Lexer {
 94	var prevPosition int
 95	return func(resetPosition int) (Token, error) {
 96		if resetPosition == 0 {
 97			resetPosition = prevPosition
 98		}
 99		token, err := readToken(s, resetPosition)
100		if err != nil {
101			return token, err
102		}
103		prevPosition = token.End
104		return token, nil
105	}
106}
107
108// Reads an alphanumeric + underscore name from the source.
109// [_A-Za-z][_0-9A-Za-z]*
110// position: Points to the byte position in the byte array
111// runePosition: Points to the rune position in the byte array
112func readName(source *source.Source, position, runePosition int) Token {
113	body := source.Body
114	bodyLength := len(body)
115	endByte := position + 1
116	endRune := runePosition + 1
117	for {
118		code, _ := runeAt(body, endByte)
119		if (endByte != bodyLength) &&
120			(code == '_' || // _
121				code >= '0' && code <= '9' || // 0-9
122				code >= 'A' && code <= 'Z' || // A-Z
123				code >= 'a' && code <= 'z') { // a-z
124			endByte++
125			endRune++
126			continue
127		} else {
128			break
129		}
130	}
131	return makeToken(TokenKind[NAME], runePosition, endRune, string(body[position:endByte]))
132}
133
134// Reads a number token from the source file, either a float
135// or an int depending on whether a decimal point appears.
136// Int:   -?(0|[1-9][0-9]*)
137// Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
138func readNumber(s *source.Source, start int, firstCode rune, codeLength int) (Token, error) {
139	code := firstCode
140	body := s.Body
141	position := start
142	isFloat := false
143	if code == '-' { // -
144		position += codeLength
145		code, codeLength = runeAt(body, position)
146	}
147	if code == '0' { // 0
148		position += codeLength
149		code, codeLength = runeAt(body, position)
150		if code >= '0' && code <= '9' {
151			description := fmt.Sprintf("Invalid number, unexpected digit after 0: %v.", printCharCode(code))
152			return Token{}, gqlerrors.NewSyntaxError(s, position, description)
153		}
154	} else {
155		p, err := readDigits(s, position, code, codeLength)
156		if err != nil {
157			return Token{}, err
158		}
159		position = p
160		code, codeLength = runeAt(body, position)
161	}
162	if code == '.' { // .
163		isFloat = true
164		position += codeLength
165		code, codeLength = runeAt(body, position)
166		p, err := readDigits(s, position, code, codeLength)
167		if err != nil {
168			return Token{}, err
169		}
170		position = p
171		code, codeLength = runeAt(body, position)
172	}
173	if code == 'E' || code == 'e' { // E e
174		isFloat = true
175		position += codeLength
176		code, codeLength = runeAt(body, position)
177		if code == '+' || code == '-' { // + -
178			position += codeLength
179			code, codeLength = runeAt(body, position)
180		}
181		p, err := readDigits(s, position, code, codeLength)
182		if err != nil {
183			return Token{}, err
184		}
185		position = p
186	}
187	kind := TokenKind[INT]
188	if isFloat {
189		kind = TokenKind[FLOAT]
190	}
191
192	return makeToken(kind, start, position, string(body[start:position])), nil
193}
194
195// Returns the new position in the source after reading digits.
196func readDigits(s *source.Source, start int, firstCode rune, codeLength int) (int, error) {
197	body := s.Body
198	position := start
199	code := firstCode
200	if code >= '0' && code <= '9' { // 0 - 9
201		for {
202			if code >= '0' && code <= '9' { // 0 - 9
203				position += codeLength
204				code, codeLength = runeAt(body, position)
205				continue
206			} else {
207				break
208			}
209		}
210		return position, nil
211	}
212	var description string
213	description = fmt.Sprintf("Invalid number, expected digit but got: %v.", printCharCode(code))
214	return position, gqlerrors.NewSyntaxError(s, position, description)
215}
216
217func readString(s *source.Source, start int) (Token, error) {
218	body := s.Body
219	position := start + 1
220	runePosition := start + 1
221	chunkStart := position
222	var code rune
223	var n int
224	var valueBuffer bytes.Buffer
225	for {
226		code, n = runeAt(body, position)
227		if position < len(body) &&
228			// not LineTerminator
229			code != 0x000A && code != 0x000D &&
230			// not Quote (")
231			code != '"' {
232
233			// SourceCharacter
234			if code < 0x0020 && code != 0x0009 {
235				return Token{}, gqlerrors.NewSyntaxError(s, runePosition, fmt.Sprintf(`Invalid character within String: %v.`, printCharCode(code)))
236			}
237			position += n
238			runePosition++
239			if code == '\\' { // \
240				valueBuffer.Write(body[chunkStart : position-1])
241				code, n = runeAt(body, position)
242				switch code {
243				case '"':
244					valueBuffer.WriteRune('"')
245					break
246				case '/':
247					valueBuffer.WriteRune('/')
248					break
249				case '\\':
250					valueBuffer.WriteRune('\\')
251					break
252				case 'b':
253					valueBuffer.WriteRune('\b')
254					break
255				case 'f':
256					valueBuffer.WriteRune('\f')
257					break
258				case 'n':
259					valueBuffer.WriteRune('\n')
260					break
261				case 'r':
262					valueBuffer.WriteRune('\r')
263					break
264				case 't':
265					valueBuffer.WriteRune('\t')
266					break
267				case 'u':
268					// Check if there are at least 4 bytes available
269					if len(body) <= position+4 {
270						return Token{}, gqlerrors.NewSyntaxError(s, runePosition,
271							fmt.Sprintf("Invalid character escape sequence: "+
272								"\\u%v", string(body[position+1:])))
273					}
274					charCode := uniCharCode(
275						rune(body[position+1]),
276						rune(body[position+2]),
277						rune(body[position+3]),
278						rune(body[position+4]),
279					)
280					if charCode < 0 {
281						return Token{}, gqlerrors.NewSyntaxError(s, runePosition,
282							fmt.Sprintf("Invalid character escape sequence: "+
283								"\\u%v", string(body[position+1:position+5])))
284					}
285					valueBuffer.WriteRune(charCode)
286					position += 4
287					runePosition += 4
288					break
289				default:
290					return Token{}, gqlerrors.NewSyntaxError(s, runePosition,
291						fmt.Sprintf(`Invalid character escape sequence: \\%c.`, code))
292				}
293				position += n
294				runePosition++
295				chunkStart = position
296			}
297			continue
298		} else {
299			break
300		}
301	}
302	if code != '"' { // quote (")
303		return Token{}, gqlerrors.NewSyntaxError(s, runePosition, "Unterminated string.")
304	}
305	stringContent := body[chunkStart:position]
306	valueBuffer.Write(stringContent)
307	value := valueBuffer.String()
308	return makeToken(TokenKind[STRING], start, position+1, value), nil
309}
310
311// readBlockString reads a block string token from the source file.
312//
313// """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
314func readBlockString(s *source.Source, start int) (Token, error) {
315	body := s.Body
316	position := start + 3
317	runePosition := start + 3
318	chunkStart := position
319	var valueBuffer bytes.Buffer
320
321	for {
322		// Stop if we've reached the end of the buffer
323		if position >= len(body) {
324			break
325		}
326
327		code, n := runeAt(body, position)
328
329		// Closing Triple-Quote (""")
330		if code == '"' {
331			x, _ := runeAt(body, position+1)
332			y, _ := runeAt(body, position+2)
333			if x == '"' && y == '"' {
334				stringContent := body[chunkStart:position]
335				valueBuffer.Write(stringContent)
336				value := blockStringValue(valueBuffer.String())
337				return makeToken(TokenKind[BLOCK_STRING], start, position+3, value), nil
338			}
339		}
340
341		// SourceCharacter
342		if code < 0x0020 &&
343			code != 0x0009 &&
344			code != 0x000a &&
345			code != 0x000d {
346			return Token{}, gqlerrors.NewSyntaxError(s, runePosition, fmt.Sprintf(`Invalid character within String: %v.`, printCharCode(code)))
347		}
348
349		// Escape Triple-Quote (\""")
350		if code == '\\' { // \
351			x, _ := runeAt(body, position+1)
352			y, _ := runeAt(body, position+2)
353			z, _ := runeAt(body, position+3)
354			if x == '"' && y == '"' && z == '"' {
355				stringContent := append(body[chunkStart:position], []byte(`"""`)...)
356				valueBuffer.Write(stringContent)
357				position += 4     // account for `"""` characters
358				runePosition += 4 // "       "   "     "
359				chunkStart = position
360				continue
361			}
362		}
363
364		position += n
365		runePosition++
366	}
367
368	return Token{}, gqlerrors.NewSyntaxError(s, runePosition, "Unterminated string.")
369}
370
371var splitLinesRegex = regexp.MustCompile("\r\n|[\n\r]")
372
373// This implements the GraphQL spec's BlockStringValue() static algorithm.
374//
375// Produces the value of a block string from its parsed raw value, similar to
376// Coffeescript's block string, Python's docstring trim or Ruby's strip_heredoc.
377//
378// Spec: http://facebook.github.io/graphql/draft/#BlockStringValue()
379// Heavily borrows from: https://github.com/graphql/graphql-js/blob/8e0c599ceccfa8c40d6edf3b72ee2a71490b10e0/src/language/blockStringValue.js
380func blockStringValue(in string) string {
381	// Expand a block string's raw value into independent lines.
382	lines := splitLinesRegex.Split(in, -1)
383
384	// Remove common indentation from all lines but first
385	commonIndent := -1
386	for i := 1; i < len(lines); i++ {
387		line := lines[i]
388		indent := leadingWhitespaceLen(line)
389		if indent < len(line) && (commonIndent == -1 || indent < commonIndent) {
390			commonIndent = indent
391			if commonIndent == 0 {
392				break
393			}
394		}
395	}
396	if commonIndent > 0 {
397		for i, line := range lines {
398			if commonIndent > len(line) {
399				continue
400			}
401			lines[i] = line[commonIndent:]
402		}
403	}
404
405	// Remove leading blank lines.
406	for {
407		if isBlank := lineIsBlank(lines[0]); !isBlank {
408			break
409		}
410		lines = lines[1:]
411	}
412
413	// Remove trailing blank lines.
414	for {
415		i := len(lines) - 1
416		if isBlank := lineIsBlank(lines[i]); !isBlank {
417			break
418		}
419		lines = append(lines[:i], lines[i+1:]...)
420	}
421
422	// Return a string of the lines joined with U+000A.
423	return strings.Join(lines, "\n")
424}
425
426// leadingWhitespaceLen returns count of whitespace characters on given line.
427func leadingWhitespaceLen(in string) (n int) {
428	for _, ch := range in {
429		if ch == ' ' || ch == '\t' {
430			n++
431		} else {
432			break
433		}
434	}
435	return
436}
437
438// lineIsBlank returns true when given line has no content.
439func lineIsBlank(in string) bool {
440	return leadingWhitespaceLen(in) == len(in)
441}
442
443// Converts four hexidecimal chars to the integer that the
444// string represents. For example, uniCharCode('0','0','0','f')
445// will return 15, and uniCharCode('0','0','f','f') returns 255.
446// Returns a negative number on error, if a char was invalid.
447// This is implemented by noting that char2hex() returns -1 on error,
448// which means the result of ORing the char2hex() will also be negative.
449func uniCharCode(a, b, c, d rune) rune {
450	return rune(char2hex(a)<<12 | char2hex(b)<<8 | char2hex(c)<<4 | char2hex(d))
451}
452
453// Converts a hex character to its integer value.
454// '0' becomes 0, '9' becomes 9
455// 'A' becomes 10, 'F' becomes 15
456// 'a' becomes 10, 'f' becomes 15
457// Returns -1 on error.
458func char2hex(a rune) int {
459	if a >= 48 && a <= 57 { // 0-9
460		return int(a) - 48
461	} else if a >= 65 && a <= 70 { // A-F
462		return int(a) - 55
463	} else if a >= 97 && a <= 102 {
464		// a-f
465		return int(a) - 87
466	}
467	return -1
468}
469
470func makeToken(kind int, start int, end int, value string) Token {
471	return Token{Kind: kind, Start: start, End: end, Value: value}
472}
473
474func printCharCode(code rune) string {
475	// NaN/undefined represents access beyond the end of the file.
476	if code < 0 {
477		return "<EOF>"
478	}
479	// print as ASCII for printable range
480	if code >= 0x0020 && code < 0x007F {
481		return fmt.Sprintf(`"%c"`, code)
482	}
483	// Otherwise print the escaped form. e.g. `"\\u0007"`
484	return fmt.Sprintf(`"\\u%04X"`, code)
485}
486
487func readToken(s *source.Source, fromPosition int) (Token, error) {
488	body := s.Body
489	bodyLength := len(body)
490	position, runePosition := positionAfterWhitespace(body, fromPosition)
491	if position >= bodyLength {
492		return makeToken(TokenKind[EOF], position, position, ""), nil
493	}
494	code, codeLength := runeAt(body, position)
495
496	// SourceCharacter
497	if code < 0x0020 && code != 0x0009 && code != 0x000A && code != 0x000D {
498		return Token{}, gqlerrors.NewSyntaxError(s, runePosition, fmt.Sprintf(`Invalid character %v`, printCharCode(code)))
499	}
500
501	switch code {
502	// !
503	case '!':
504		return makeToken(TokenKind[BANG], position, position+1, ""), nil
505	// $
506	case '$':
507		return makeToken(TokenKind[DOLLAR], position, position+1, ""), nil
508	// (
509	case '(':
510		return makeToken(TokenKind[PAREN_L], position, position+1, ""), nil
511	// )
512	case ')':
513		return makeToken(TokenKind[PAREN_R], position, position+1, ""), nil
514	// .
515	case '.':
516		next1, _ := runeAt(body, position+1)
517		next2, _ := runeAt(body, position+2)
518		if next1 == '.' && next2 == '.' {
519			return makeToken(TokenKind[SPREAD], position, position+3, ""), nil
520		}
521		break
522	// :
523	case ':':
524		return makeToken(TokenKind[COLON], position, position+1, ""), nil
525	// =
526	case '=':
527		return makeToken(TokenKind[EQUALS], position, position+1, ""), nil
528	// @
529	case '@':
530		return makeToken(TokenKind[AT], position, position+1, ""), nil
531	// [
532	case '[':
533		return makeToken(TokenKind[BRACKET_L], position, position+1, ""), nil
534	// ]
535	case ']':
536		return makeToken(TokenKind[BRACKET_R], position, position+1, ""), nil
537	// {
538	case '{':
539		return makeToken(TokenKind[BRACE_L], position, position+1, ""), nil
540	// |
541	case '|':
542		return makeToken(TokenKind[PIPE], position, position+1, ""), nil
543	// }
544	case '}':
545		return makeToken(TokenKind[BRACE_R], position, position+1, ""), nil
546	// A-Z
547	case 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
548		'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z':
549		return readName(s, position, runePosition), nil
550	// _
551	// a-z
552	case '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
553		'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z':
554		return readName(s, position, runePosition), nil
555	// -
556	// 0-9
557	case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
558		token, err := readNumber(s, position, code, codeLength)
559		if err != nil {
560			return token, err
561		}
562		return token, nil
563	// "
564	case '"':
565		var token Token
566		var err error
567		x, _ := runeAt(body, position+1)
568		y, _ := runeAt(body, position+2)
569		if x == '"' && y == '"' {
570			token, err = readBlockString(s, position)
571		} else {
572			token, err = readString(s, position)
573		}
574		return token, err
575	}
576	description := fmt.Sprintf("Unexpected character %v.", printCharCode(code))
577	return Token{}, gqlerrors.NewSyntaxError(s, runePosition, description)
578}
579
580// Gets the rune from the byte array at given byte position and it's width in bytes
581func runeAt(body []byte, position int) (code rune, charWidth int) {
582	if len(body) <= position {
583		// <EOF>
584		return -1, utf8.RuneError
585	}
586
587	c := body[position]
588	if c < utf8.RuneSelf {
589		return rune(c), 1
590	}
591
592	r, n := utf8.DecodeRune(body[position:])
593	return r, n
594}
595
596// Reads from body starting at startPosition until it finds a non-whitespace
597// or commented character, then returns the position of that character for lexing.
598// lexing.
599// Returns both byte positions and rune position
600func positionAfterWhitespace(body []byte, startPosition int) (position int, runePosition int) {
601	bodyLength := len(body)
602	position = startPosition
603	runePosition = startPosition
604	for {
605		if position < bodyLength {
606			code, n := runeAt(body, position)
607
608			// Skip Ignored
609			if code == 0xFEFF || // BOM
610				// White Space
611				code == 0x0009 || // tab
612				code == 0x0020 || // space
613				// Line Terminator
614				code == 0x000A || // new line
615				code == 0x000D || // carriage return
616				// Comma
617				code == 0x002C {
618				position += n
619				runePosition++
620			} else if code == 35 { // #
621				position += n
622				runePosition++
623				for {
624					code, n := runeAt(body, position)
625					if position < bodyLength &&
626						code != 0 &&
627						// SourceCharacter but not LineTerminator
628						(code > 0x001F || code == 0x0009) && code != 0x000A && code != 0x000D {
629						position += n
630						runePosition++
631						continue
632					} else {
633						break
634					}
635				}
636			} else {
637				break
638			}
639			continue
640		} else {
641			break
642		}
643	}
644	return position, runePosition
645}
646
647func GetTokenDesc(token Token) string {
648	if token.Value == "" {
649		return GetTokenKindDesc(token.Kind)
650	}
651	return fmt.Sprintf("%s \"%s\"", GetTokenKindDesc(token.Kind), token.Value)
652}
653
654func GetTokenKindDesc(kind int) string {
655	return tokenDescription[kind]
656}