lexer.go

package lexer

import (
	"bytes"
	"unicode/utf8"

	"github.com/vektah/gqlparser/ast"
	"github.com/vektah/gqlparser/gqlerror"
)

// Lexer turns GraphQL request and schema strings into tokens
type Lexer struct {
	*ast.Source
	// An offset into the string in bytes
	start int
	// An offset into the string in runes
	startRunes int
	// An offset into the string in bytes
	end int
	// An offset into the string in runes
	endRunes int
	// the current line number
	line int
	// The rune offset at which the current line starts
	lineStartRunes int
}

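// New returns a Lexer positioned at the start of the given source.
//
// The loop below is an illustrative sketch (not part of the original file) of
// how the lexer is typically driven: pull tokens with ReadToken until an EOF
// token or an error is returned.
//
//	src := &ast.Source{Name: "query.graphql", Input: `{ hello }`}
//	lex := New(src)
//	for {
//		tok, err := lex.ReadToken()
//		if err != nil || tok.Kind == EOF {
//			break
//		}
//		// tok.Kind, tok.Value and tok.Pos describe the token just read
//	}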
func New(src *ast.Source) Lexer {
	return Lexer{
		Source: src,
		line:   1,
	}
}

// peek at the next rune in the input without advancing end
func (s *Lexer) peek() (rune, int) {
	return utf8.DecodeRuneInString(s.Input[s.end:])
}

func (s *Lexer) makeToken(kind Type) (Token, *gqlerror.Error) {
	return s.makeValueToken(kind, s.Input[s.start:s.end])
}

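// makeValueToken builds a token of the given kind and value; positions are
// reported in rune offsets, with a 1-based column derived from lineStartRunes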
func (s *Lexer) makeValueToken(kind Type, value string) (Token, *gqlerror.Error) {
	return Token{
		Kind:  kind,
		Value: value,
		Pos: ast.Position{
			Start:  s.startRunes,
			End:    s.endRunes,
			Line:   s.line,
			Column: s.startRunes - s.lineStartRunes + 1,
			Src:    s.Source,
		},
	}, nil
}

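// makeError returns an Invalid token together with a gqlerror pointing at the
// lexer's current position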
func (s *Lexer) makeError(format string, args ...interface{}) (Token, *gqlerror.Error) {
	column := s.endRunes - s.lineStartRunes + 1
	return Token{
		Kind: Invalid,
		Pos: ast.Position{
			Start:  s.startRunes,
			End:    s.endRunes,
			Line:   s.line,
			Column: column,
			Src:    s.Source,
		},
	}, gqlerror.ErrorLocf(s.Source.Name, s.line, column, format, args...)
}

// ReadToken gets the next token from the source starting at the given position.
//
// This skips over whitespace and comments until it finds the next lexable
// token, then lexes punctuators immediately or calls the appropriate helper
// function for more complicated tokens.
func (s *Lexer) ReadToken() (token Token, err *gqlerror.Error) {

	s.ws()
	s.start = s.end
	s.startRunes = s.endRunes

	if s.end >= len(s.Input) {
		return s.makeToken(EOF)
	}
	r := s.Input[s.start]
	s.end++
	s.endRunes++
	switch r {
	case '!':
		return s.makeValueToken(Bang, "")

	case '$':
		return s.makeValueToken(Dollar, "")
	case '&':
		return s.makeValueToken(Amp, "")
	case '(':
		return s.makeValueToken(ParenL, "")
	case ')':
		return s.makeValueToken(ParenR, "")
	case '.':
		if len(s.Input) > s.start+2 && s.Input[s.start:s.start+3] == "..." {
			s.end += 2
			s.endRunes += 2
			return s.makeValueToken(Spread, "")
		}
	case ':':
		return s.makeValueToken(Colon, "")
	case '=':
		return s.makeValueToken(Equals, "")
	case '@':
		return s.makeValueToken(At, "")
	case '[':
		return s.makeValueToken(BracketL, "")
	case ']':
		return s.makeValueToken(BracketR, "")
	case '{':
		return s.makeValueToken(BraceL, "")
	case '}':
		return s.makeValueToken(BraceR, "")
	case '|':
		return s.makeValueToken(Pipe, "")
	case '#':
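		// comments are lexed but discarded; recurse to return the next significant token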
		s.readComment()
		return s.ReadToken()

	case '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z':
		return s.readName()

	case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		return s.readNumber()

	case '"':
		if len(s.Input) > s.start+2 && s.Input[s.start:s.start+3] == `"""` {
			return s.readBlockString()
		}

		return s.readString()
	}

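	// no case matched: undo the one-byte consume above so the error position
	// points at the unexpected character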
	s.end--
	s.endRunes--

	if r < 0x0020 && r != 0x0009 && r != 0x000a && r != 0x000d {
		return s.makeError(`Cannot contain the invalid character "\u%04x"`, r)
	}

	if r == '\'' {
		return s.makeError(`Unexpected single quote character ('), did you mean to use a double quote (")?`)
	}

	return s.makeError(`Cannot parse the unexpected character "%s".`, string(r))
}

// ws advances end past whitespace, commas, line terminators and byte order
// marks, keeping the line and line-start bookkeeping up to date
func (s *Lexer) ws() {
	for s.end < len(s.Input) {
		switch s.Input[s.end] {
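		// per the GraphQL spec, commas are insignificant and skipped like whitespace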
		case '\t', ' ', ',':
			s.end++
			s.endRunes++
		case '\n':
			s.end++
			s.endRunes++
			s.line++
			s.lineStartRunes = s.endRunes
		case '\r':
			s.end++
			s.endRunes++
			s.line++
			s.lineStartRunes = s.endRunes
			// skip the following newline if it's there
			if s.end < len(s.Input) && s.Input[s.end] == '\n' {
				s.end++
				s.endRunes++
			}
		// byte order mark; ws is a hot path, so we avoid the unicode package here
		case 0xef:
			if s.end+2 < len(s.Input) && s.Input[s.end+1] == 0xBB && s.Input[s.end+2] == 0xBF {
				s.end += 3
				s.endRunes++
			} else {
				return
			}
		default:
			return
		}
	}
}

// readComment from the input
//
// #[\u0009\u0020-\uFFFF]*
func (s *Lexer) readComment() (Token, *gqlerror.Error) {
	for s.end < len(s.Input) {
		r, w := s.peek()

		// SourceCharacter but not LineTerminator
		if r > 0x001f || r == '\t' {
			s.end += w
			s.endRunes++
		} else {
			break
		}
	}

	return s.makeToken(Comment)
}

// readNumber from the input, either a float
// or an int depending on whether a decimal point or exponent appears.
//
// Int:   -?(0|[1-9][0-9]*)
// Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
func (s *Lexer) readNumber() (Token, *gqlerror.Error) {
	float := false

	// back up to the first character of the number; ReadToken already consumed it
	s.end--
	s.endRunes--

	s.acceptByte('-')

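	// the GraphQL int grammar forbids leading zeros, so a 0 may not be followed by another digit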
	if s.acceptByte('0') {
		if consumed := s.acceptDigits(); consumed != 0 {
			s.end -= consumed
			s.endRunes -= consumed
			return s.makeError("Invalid number, unexpected digit after 0: %s.", s.describeNext())
		}
	} else {
		if consumed := s.acceptDigits(); consumed == 0 {
			return s.makeError("Invalid number, expected digit but got: %s.", s.describeNext())
		}
	}

	if s.acceptByte('.') {
		float = true

		if consumed := s.acceptDigits(); consumed == 0 {
			return s.makeError("Invalid number, expected digit but got: %s.", s.describeNext())
		}
	}

	if s.acceptByte('e', 'E') {
		float = true

		s.acceptByte('-', '+')

		if consumed := s.acceptDigits(); consumed == 0 {
			return s.makeError("Invalid number, expected digit but got: %s.", s.describeNext())
		}
	}

	if float {
		return s.makeToken(Float)
	} else {
		return s.makeToken(Int)
	}
}

// acceptByte consumes the next byte if it matches any of the given bytes,
// returning true if one matched
func (s *Lexer) acceptByte(bytes ...uint8) bool {
	if s.end >= len(s.Input) {
		return false
	}

	for _, accepted := range bytes {
		if s.Input[s.end] == accepted {
			s.end++
			s.endRunes++
			return true
		}
	}
	return false
}

// acceptDigits from the input, returning the number of digits it found
func (s *Lexer) acceptDigits() int {
	consumed := 0
	for s.end < len(s.Input) && s.Input[s.end] >= '0' && s.Input[s.end] <= '9' {
		s.end++
		s.endRunes++
		consumed++
	}

	return consumed
}

// describeNext peeks at the input and returns a human readable string. It
// allocates, so it should only be used when building errors
func (s *Lexer) describeNext() string {
	if s.end < len(s.Input) {
		return `"` + string(s.Input[s.end]) + `"`
	}
	return "<EOF>"
}

// readString from the input
//
// "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
func (s *Lexer) readString() (Token, *gqlerror.Error) {
	inputLen := len(s.Input)

	// this buffer is lazily created only if there are escape characters.
	var buf *bytes.Buffer

	// skip the opening quote so it is not included in the token value
	s.start++
	s.startRunes++

	for s.end < inputLen {
		r := s.Input[s.end]
		if r == '\n' || r == '\r' {
			break
		}
		if r < 0x0020 && r != '\t' {
			return s.makeError(`Invalid character within String: "\u%04x".`, r)
		}
		switch r {
		default:
			var char = rune(r)
			var w = 1

			// skip unicode overhead if we are in the ASCII range
			if r >= 127 {
				char, w = utf8.DecodeRuneInString(s.Input[s.end:])
			}
			s.end += w
			s.endRunes++

			if buf != nil {
				buf.WriteRune(char)
			}

		case '"':
			t, err := s.makeToken(String)
			// the token should not include the quotes in its value, but should cover them in its position
			t.Pos.Start--
			t.Pos.End++

			if buf != nil {
				t.Value = buf.String()
			}

			// skip the closing quote
			s.end++
			s.endRunes++

			return t, err

		case '\\':
			if s.end+1 >= inputLen {
				s.end++
				s.endRunes++
				return s.makeError(`Invalid character escape sequence.`)
			}

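			// first escape seen: seed the buffer with everything scanned so far
			// so the unescaped prefix is preserved in the final value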
			if buf == nil {
				buf = bytes.NewBufferString(s.Input[s.start:s.end])
			}

			escape := s.Input[s.end+1]

			if escape == 'u' {
				if s.end+6 >= inputLen {
					s.end++
					s.endRunes++
					return s.makeError("Invalid character escape sequence: \\%s.", s.Input[s.end:])
				}

				r, ok := unhex(s.Input[s.end+2 : s.end+6])
				if !ok {
					s.end++
					s.endRunes++
					return s.makeError("Invalid character escape sequence: \\%s.", s.Input[s.end:s.end+5])
				}
				buf.WriteRune(r)
				s.end += 6
				s.endRunes += 6
			} else {
				switch escape {
				case '"', '/', '\\':
					buf.WriteByte(escape)
				case 'b':
					buf.WriteByte('\b')
				case 'f':
					buf.WriteByte('\f')
				case 'n':
					buf.WriteByte('\n')
				case 'r':
					buf.WriteByte('\r')
				case 't':
					buf.WriteByte('\t')
				default:
					s.end += 1
					s.endRunes += 1
					return s.makeError("Invalid character escape sequence: \\%s.", string(escape))
				}
				s.end += 2
				s.endRunes += 2
			}
		}
	}

	return s.makeError("Unterminated string.")
}

// readBlockString from the input
//
// """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
func (s *Lexer) readBlockString() (Token, *gqlerror.Error) {
	inputLen := len(s.Input)

	var buf bytes.Buffer

	// skip the opening triple quote; ReadToken has already consumed the first
	// quote, so end only needs to advance by two more bytes
	s.start += 3
	s.startRunes += 3
	s.end += 2
	s.endRunes += 2

	for s.end < inputLen {
		r := s.Input[s.end]

		// Closing triple quote (""")
		if r == '"' && s.end+3 <= inputLen && s.Input[s.end:s.end+3] == `"""` {
			t, err := s.makeValueToken(BlockString, blockStringValue(buf.String()))

			// the token should not include the quotes in its value, but should cover them in its position
			t.Pos.Start -= 3
			t.Pos.End += 3

			// skip the closing triple quote
			s.end += 3
			s.endRunes += 3

			return t, err
		}

		// SourceCharacter
		if r < 0x0020 && r != '\t' && r != '\n' && r != '\r' {
			return s.makeError(`Invalid character within String: "\u%04x".`, r)
		}

		if r == '\\' && s.end+4 <= inputLen && s.Input[s.end:s.end+4] == `\"""` {
			buf.WriteString(`"""`)
			s.end += 4
			s.endRunes += 4
		} else if r == '\r' {
			// normalize \r and \r\n line endings to \n in the block string value
			if s.end+1 < inputLen && s.Input[s.end+1] == '\n' {
				s.end++
				s.endRunes++
			}

			buf.WriteByte('\n')
			s.end++
			s.endRunes++
		} else {
			var char = rune(r)
			var w = 1

			// skip unicode overhead if we are in the ASCII range
			if r >= 127 {
				char, w = utf8.DecodeRuneInString(s.Input[s.end:])
			}
			s.end += w
			s.endRunes++
			buf.WriteRune(char)
		}
	}

	return s.makeError("Unterminated string.")
}

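// unhex decodes a string of hex digits (the XXXX in a \uXXXX escape) into a rune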
func unhex(b string) (v rune, ok bool) {
	for _, c := range b {
		v <<= 4
		switch {
		case '0' <= c && c <= '9':
			v |= c - '0'
		case 'a' <= c && c <= 'f':
			v |= c - 'a' + 10
		case 'A' <= c && c <= 'F':
			v |= c - 'A' + 10
		default:
			return 0, false
		}
	}

	return v, true
}

// readName from the input
//
// [_A-Za-z][_0-9A-Za-z]*
func (s *Lexer) readName() (Token, *gqlerror.Error) {
	for s.end < len(s.Input) {
		r, w := s.peek()

		if (r >= '0' && r <= '9') || (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') || r == '_' {
			s.end += w
			s.endRunes++
		} else {
			break
		}
	}

	return s.makeToken(Name)
}