scanner.go

// Copyright 2012 The Gorilla Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scanner

import (
	"fmt"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// tokenType identifies the type of lexical tokens.
type tokenType int

// String returns a string representation of the token type.
func (t tokenType) String() string {
	return tokenNames[t]
}

// Token represents a token and the corresponding string.
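//
// Line and Column are 1-based.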
type Token struct {
	Type   tokenType
	Value  string
	Line   int
	Column int
}

// String returns a string representation of the token.
func (t *Token) String() string {
	if len(t.Value) > 10 {
		return fmt.Sprintf("%s (line: %d, column: %d): %.10q...",
			t.Type, t.Line, t.Column, t.Value)
	}
	return fmt.Sprintf("%s (line: %d, column: %d): %q",
		t.Type, t.Line, t.Column, t.Value)
}

// All tokens -----------------------------------------------------------------

// The complete list of tokens in CSS3.
const (
	// Scanner flags.
	TokenError tokenType = iota
	TokenEOF
	// From now on, only tokens from the CSS specification.
	TokenIdent
	TokenAtKeyword
	TokenString
	TokenHash
	TokenNumber
	TokenPercentage
	TokenDimension
	TokenURI
	TokenUnicodeRange
	TokenCDO
	TokenCDC
	TokenS
	TokenComment
	TokenFunction
	TokenIncludes
	TokenDashMatch
	TokenPrefixMatch
	TokenSuffixMatch
	TokenSubstringMatch
	TokenChar
	TokenBOM
)

// tokenNames maps tokenType values to their names. Used for conversion to string.
var tokenNames = map[tokenType]string{
	TokenError:          "error",
	TokenEOF:            "EOF",
	TokenIdent:          "IDENT",
	TokenAtKeyword:      "ATKEYWORD",
	TokenString:         "STRING",
	TokenHash:           "HASH",
	TokenNumber:         "NUMBER",
	TokenPercentage:     "PERCENTAGE",
	TokenDimension:      "DIMENSION",
	TokenURI:            "URI",
	TokenUnicodeRange:   "UNICODE-RANGE",
	TokenCDO:            "CDO",
	TokenCDC:            "CDC",
	TokenS:              "S",
	TokenComment:        "COMMENT",
	TokenFunction:       "FUNCTION",
	TokenIncludes:       "INCLUDES",
	TokenDashMatch:      "DASHMATCH",
	TokenPrefixMatch:    "PREFIXMATCH",
	TokenSuffixMatch:    "SUFFIXMATCH",
	TokenSubstringMatch: "SUBSTRINGMATCH",
	TokenChar:           "CHAR",
	TokenBOM:            "BOM",
}

// Macros and productions -----------------------------------------------------
// http://www.w3.org/TR/css3-syntax/#tokenization

var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`)

// macros maps macro names to patterns to be expanded.
var macros = map[string]string{
	// must be escaped: `\.+*?()|[]{}^$`
	"ident":      `-?{nmstart}{nmchar}*`,
	"name":       `{nmchar}+`,
	"nmstart":    `[a-zA-Z_]|{nonascii}|{escape}`,
	"nonascii":   "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"unicode":    `\\[0-9a-fA-F]{1,6}{wc}?`,
	"escape":     "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"nmchar":     `[a-zA-Z0-9_-]|{nonascii}|{escape}`,
	"num":        `[0-9]*\.[0-9]+|[0-9]+`,
	"string":     `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`,
	"stringchar": `{urlchar}|[ ]|\\{nl}`,
	"nl":         `[\n\r\f]|\r\n`,
	"w":          `{wc}*`,
	"wc":         `[\t\n\f\r ]`,

	// urlchar should accept [(ascii characters minus those that need escaping)|{nonascii}|{escape}]
	// ASCII characters range = `[\u0020-\u007e]`
	// Skip space \u0020 = `[\u0021-\u007e]`
	// Skip quotation mark \u0022 = `[\u0021\u0023-\u007e]`
	// Skip apostrophe \u0027 = `[\u0021\u0023-\u0026\u0028-\u007e]`
	// Skip reverse solidus \u005c = `[\u0021\u0023-\u0026\u0028-\u005b\u005d-\u007e]`
	// Finally, the left (\u005b) and right (\u005d) square brackets need escaping themselves
	"urlchar": "[\u0021\u0023-\u0026\u0028-\\\u005b\\\u005d-\u007E]|{nonascii}|{escape}",
}

// productions maps the list of tokens to patterns to be expanded.
var productions = map[tokenType]string{
	// Unused regexps (matched using other methods) are commented out.
	TokenIdent:        `{ident}`,
	TokenAtKeyword:    `@{ident}`,
	TokenString:       `{string}`,
	TokenHash:         `#{name}`,
	TokenNumber:       `{num}`,
	TokenPercentage:   `{num}%`,
	TokenDimension:    `{num}{ident}`,
	TokenURI:          `url\({w}(?:{string}|{urlchar}*?){w}\)`,
	TokenUnicodeRange: `U\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`,
	//TokenCDO:            `<!--`,
	TokenCDC:      `-->`,
	TokenS:        `{wc}+`,
	TokenComment:  `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`,
	TokenFunction: `{ident}\(`,
	//TokenIncludes:       `~=`,
	//TokenDashMatch:      `\|=`,
	//TokenPrefixMatch:    `\^=`,
	//TokenSuffixMatch:    `\$=`,
	//TokenSubstringMatch: `\*=`,
	//TokenChar:           `[^"']`,
	//TokenBOM:            "\uFEFF",
}

// matchers maps the list of tokens to compiled regular expressions.
//
// The map is filled on init() using the macros and productions defined in
// the CSS specification.
var matchers = map[tokenType]*regexp.Regexp{}

// matchOrder is the order to test regexps when first-char shortcuts
// can't be used.
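//
// The patterns overlap, so the more specific ones come first: for example,
// URI and Function are tried before Ident, and Dimension and Percentage
// before Number.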
var matchOrder = []tokenType{
	TokenURI,
	TokenFunction,
	TokenUnicodeRange,
	TokenIdent,
	TokenDimension,
	TokenPercentage,
	TokenNumber,
	TokenCDC,
}

func init() {
	// Replace macros and compile regexps for productions.
	replaceMacro := func(s string) string {
		return "(?:" + macros[s[1:len(s)-1]] + ")"
	}
	for t, s := range productions {
		for macroRegexp.MatchString(s) {
			s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro)
		}
		matchers[t] = regexp.MustCompile("^(?:" + s + ")")
	}
}
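
// As an illustration of the expansion above (a sketch, not used by the code):
// the TokenPercentage production `{num}%` expands to
// `(?:[0-9]*\.[0-9]+|[0-9]+)%` and is compiled as the anchored regexp
// `^(?:(?:[0-9]*\.[0-9]+|[0-9]+)%)`.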

// Scanner --------------------------------------------------------------------

// New returns a new CSS scanner for the given input.
func New(input string) *Scanner {
	// Normalize newlines and replace NUL characters.
	// https://www.w3.org/TR/css-syntax-3/#input-preprocessing
	input = strings.Replace(input, "\r\n", "\n", -1)
	input = strings.Replace(input, "\r", "\n", -1)
	input = strings.Replace(input, "\f", "\n", -1)
	input = strings.Replace(input, "\u0000", "\ufffd", -1)
	return &Scanner{
		input: input,
		row:   1,
		col:   1,
	}
}

// Scanner scans an input and emits tokens following the CSS3 specification.
type Scanner struct {
	input string
	pos   int
	row   int
	col   int
	err   *Token
}

// Next returns the next token from the input.
//
// At the end of the input the token type is TokenEOF.
//
// If the input can't be tokenized the token type is TokenError. This occurs
// when a quotation mark or a comment is left unclosed.
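//
// A typical loop (a sketch; css stands for any input string) calls Next
// until one of those two token types is returned:
//
//	s := New(css)
//	for {
//		token := s.Next()
//		if token.Type == TokenEOF || token.Type == TokenError {
//			break
//		}
//		// Use token.Type and token.Value here.
//	}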
func (s *Scanner) Next() *Token {
	if s.err != nil {
		return s.err
	}
	if s.pos >= len(s.input) {
		s.err = &Token{TokenEOF, "", s.row, s.col}
		return s.err
	}
	if s.pos == 0 {
		// Test BOM only once, at the beginning of the file.
		if strings.HasPrefix(s.input, "\uFEFF") {
			return s.emitSimple(TokenBOM, "\uFEFF")
		}
	}
	// There's a lot we can guess based on the first byte, so we take a
	// shortcut before testing multiple regexps.
	input := s.input[s.pos:]
	switch input[0] {
	case '\t', '\n', ' ':
		// Whitespace.
		return s.emitToken(TokenS, matchers[TokenS].FindString(input))
	case '.':
		// Dot is too common not to have a quick check: if it is not followed
		// by a digit it is a Char; otherwise it starts a number, percentage
		// or dimension, which will be matched by the regexps below.
		if len(input) > 1 && !unicode.IsDigit(rune(input[1])) {
			return s.emitSimple(TokenChar, ".")
		}
	case '#':
		// Another common one: Hash or Char.
		if match := matchers[TokenHash].FindString(input); match != "" {
			return s.emitToken(TokenHash, match)
		}
		return s.emitSimple(TokenChar, "#")
	case '@':
		// Another common one: AtKeyword or Char.
		if match := matchers[TokenAtKeyword].FindString(input); match != "" {
			// Use emitToken: an at-keyword may contain non-ASCII runes.
			return s.emitToken(TokenAtKeyword, match)
		}
		return s.emitSimple(TokenChar, "@")
	case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}':
		// More common chars.
		return s.emitSimple(TokenChar, string(input[0]))
	case '"', '\'':
		// String or error.
		match := matchers[TokenString].FindString(input)
		if match != "" {
			return s.emitToken(TokenString, match)
		}

		s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col}
		return s.err
	case '/':
		// Comment, error or Char.
		if len(input) > 1 && input[1] == '*' {
			if match := matchers[TokenComment].FindString(input); match != "" {
				return s.emitToken(TokenComment, match)
			}
			s.err = &Token{TokenError, "unclosed comment", s.row, s.col}
			return s.err
		}
		return s.emitSimple(TokenChar, "/")
	case '~':
		// Includes or Char.
		return s.emitPrefixOrChar(TokenIncludes, "~=")
	case '|':
		// DashMatch or Char.
		return s.emitPrefixOrChar(TokenDashMatch, "|=")
	case '^':
		// PrefixMatch or Char.
		return s.emitPrefixOrChar(TokenPrefixMatch, "^=")
	case '$':
		// SuffixMatch or Char.
		return s.emitPrefixOrChar(TokenSuffixMatch, "$=")
	case '*':
		// SubstringMatch or Char.
		return s.emitPrefixOrChar(TokenSubstringMatch, "*=")
	case '<':
		// CDO or Char.
		return s.emitPrefixOrChar(TokenCDO, "<!--")
	}
	// Test all regexps, in order.
	for _, token := range matchOrder {
		if match := matchers[token].FindString(input); match != "" {
			return s.emitToken(token, match)
		}
	}
	// We already handled unclosed quotation marks and comments,
	// so this can only be a Char.
	r, width := utf8.DecodeRuneInString(input)
	token := &Token{TokenChar, string(r), s.row, s.col}
	s.col++ // col counts runes, and exactly one rune was consumed here
	s.pos += width
	return token
}

// updatePosition updates input coordinates based on the consumed text.
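// For example, consuming "a\nbb" advances row by one and leaves col at 3,
// the 1-based column of the next unconsumed character; keeping the "\n" in
// the slice below is what makes the rune count 1-based.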
func (s *Scanner) updatePosition(text string) {
	width := utf8.RuneCountInString(text)
	lines := strings.Count(text, "\n")
	s.row += lines
	if lines == 0 {
		s.col += width
	} else {
		s.col = utf8.RuneCountInString(text[strings.LastIndex(text, "\n"):])
	}
	s.pos += len(text) // while col is a rune index, pos is a byte index
}

// emitToken returns a Token for the string v and updates the scanner position.
func (s *Scanner) emitToken(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.updatePosition(v)
	return token
}

// emitSimple returns a Token for the string v and updates the scanner
// position in a simplified manner.
//
// The string is known to contain only ASCII characters and no newlines.
func (s *Scanner) emitSimple(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.col += len(v)
	s.pos += len(v)
	return token
}

// emitPrefixOrChar returns a Token for type t if the current position
// matches the given prefix. Otherwise it returns a Char token using the
// first character from the prefix.
//
// The prefix is known to contain only ASCII characters and no newlines.
func (s *Scanner) emitPrefixOrChar(t tokenType, prefix string) *Token {
	if strings.HasPrefix(s.input[s.pos:], prefix) {
		return s.emitSimple(t, prefix)
	}
	return s.emitSimple(TokenChar, string(prefix[0]))
}