// Copyright 2012 The Gorilla Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scanner

import (
	"fmt"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// tokenType identifies the type of lexical tokens.
type tokenType int

// String returns a string representation of the token type.
func (t tokenType) String() string {
	return tokenNames[t]
}

// Token represents a token and the corresponding string.
type Token struct {
	Type   tokenType
	Value  string
	Line   int
	Column int
}

// String returns a string representation of the token.
func (t *Token) String() string {
	if len(t.Value) > 10 {
		return fmt.Sprintf("%s (line: %d, column: %d): %.10q...",
			t.Type, t.Line, t.Column, t.Value)
	}
	return fmt.Sprintf("%s (line: %d, column: %d): %q",
		t.Type, t.Line, t.Column, t.Value)
}

// All tokens -----------------------------------------------------------------

// The complete list of tokens in CSS3.
const (
	// Scanner flags.
	TokenError tokenType = iota
	TokenEOF
	// From now on, only tokens from the CSS specification.
	TokenIdent
	TokenAtKeyword
	TokenString
	TokenHash
	TokenNumber
	TokenPercentage
	TokenDimension
	TokenURI
	TokenUnicodeRange
	TokenCDO
	TokenCDC
	TokenS
	TokenComment
	TokenFunction
	TokenIncludes
	TokenDashMatch
	TokenPrefixMatch
	TokenSuffixMatch
	TokenSubstringMatch
	TokenChar
	TokenBOM
)
// tokenNames maps token types to their names. Used for conversion to string.
var tokenNames = map[tokenType]string{
	TokenError:          "error",
	TokenEOF:            "EOF",
	TokenIdent:          "IDENT",
	TokenAtKeyword:      "ATKEYWORD",
	TokenString:         "STRING",
	TokenHash:           "HASH",
	TokenNumber:         "NUMBER",
	TokenPercentage:     "PERCENTAGE",
	TokenDimension:      "DIMENSION",
	TokenURI:            "URI",
	TokenUnicodeRange:   "UNICODE-RANGE",
	TokenCDO:            "CDO",
	TokenCDC:            "CDC",
	TokenS:              "S",
	TokenComment:        "COMMENT",
	TokenFunction:       "FUNCTION",
	TokenIncludes:       "INCLUDES",
	TokenDashMatch:      "DASHMATCH",
	TokenPrefixMatch:    "PREFIXMATCH",
	TokenSuffixMatch:    "SUFFIXMATCH",
	TokenSubstringMatch: "SUBSTRINGMATCH",
	TokenChar:           "CHAR",
	TokenBOM:            "BOM",
}

// Macros and productions -----------------------------------------------------
// http://www.w3.org/TR/css3-syntax/#tokenization

var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`)

// macros maps macro names to patterns to be expanded.
var macros = map[string]string{
	// must be escaped: `\.+*?()|[]{}^$`
	"ident":      `-?{nmstart}{nmchar}*`,
	"name":       `{nmchar}+`,
	"nmstart":    `[a-zA-Z_]|{nonascii}|{escape}`,
	"nonascii":   "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"unicode":    `\\[0-9a-fA-F]{1,6}{wc}?`,
	"escape":     "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"nmchar":     `[a-zA-Z0-9_-]|{nonascii}|{escape}`,
	"num":        `[0-9]*\.[0-9]+|[0-9]+`,
	"string":     `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`,
	"stringchar": `{urlchar}|[ ]|\\{nl}`,
	"nl":         `[\n\r\f]|\r\n`,
	"w":          `{wc}*`,
	"wc":         `[\t\n\f\r ]`,

	// urlchar should accept [(ascii characters minus those that need escaping)|{nonascii}|{escape}]
	// ASCII character range = `[\u0020-\u007e]`
	// Skip space \u0020 = `[\u0021-\u007e]`
	// Skip quotation mark \u0022 = `[\u0021\u0023-\u007e]`
	// Skip apostrophe \u0027 = `[\u0021\u0023-\u0026\u0028-\u007e]`
	// Skip reverse solidus \u005c = `[\u0021\u0023-\u0026\u0028-\u005b\u005d-\u007e]`
	// Finally, the left (\u005b) and right (\u005d) square brackets need escaping themselves.
	"urlchar": "[\u0021\u0023-\u0026\u0028-\\\u005b\\\u005d-\u007E]|{nonascii}|{escape}",
}

// productions maps the list of tokens to patterns to be expanded.
var productions = map[tokenType]string{
	// Unused regexps (matched using other methods) are commented out.
	TokenIdent:        `{ident}`,
	TokenAtKeyword:    `@{ident}`,
	TokenString:       `{string}`,
	TokenHash:         `#{name}`,
	TokenNumber:       `{num}`,
	TokenPercentage:   `{num}%`,
	TokenDimension:    `{num}{ident}`,
	TokenURI:          `url\({w}(?:{string}|{urlchar}*?){w}\)`,
	TokenUnicodeRange: `U\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`,
	//TokenCDO:      `<!--`,
	TokenCDC:      `-->`,
	TokenS:        `{wc}+`,
	TokenComment:  `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`,
	TokenFunction: `{ident}\(`,
	//TokenIncludes:       `~=`,
	//TokenDashMatch:      `\|=`,
	//TokenPrefixMatch:    `\^=`,
	//TokenSuffixMatch:    `\$=`,
	//TokenSubstringMatch: `\*=`,
	//TokenChar:           `[^"']`,
	//TokenBOM:            "\uFEFF",
}
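
// As an illustration, the TokenDimension production `{num}{ident}` matches
// input such as "12px" or "1.5em": {num} consumes the numeric part and
// {ident} consumes the unit.
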
// matchers maps the list of tokens to compiled regular expressions.
//
// The map is filled on init() using the macros and productions defined in
// the CSS specification.
var matchers = map[tokenType]*regexp.Regexp{}

// matchOrder is the order to test regexps when first-char shortcuts
// can't be used.
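// More specific patterns come first: TokenURI is tried before TokenFunction
// because `url(` is itself a valid function head, and TokenDimension and
// TokenPercentage come before TokenNumber because a bare number is a prefix
// of both.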
var matchOrder = []tokenType{
	TokenURI,
	TokenFunction,
	TokenUnicodeRange,
	TokenIdent,
	TokenDimension,
	TokenPercentage,
	TokenNumber,
	TokenCDC,
}

func init() {
	// Replace macros and compile regexps for productions.
	replaceMacro := func(s string) string {
		return "(?:" + macros[s[1:len(s)-1]] + ")"
	}
	for t, s := range productions {
		for macroRegexp.MatchString(s) {
			s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro)
		}
		matchers[t] = regexp.MustCompile("^(?:" + s + ")")
	}
}
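
// As a worked example of the expansion above, the TokenPercentage production
// `{num}%` becomes `(?:[0-9]*\.[0-9]+|[0-9]+)%` after macro substitution and
// is then compiled anchored as `^(?:(?:[0-9]*\.[0-9]+|[0-9]+)%)`.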

// Scanner --------------------------------------------------------------------

// New returns a new CSS scanner for the given input.
func New(input string) *Scanner {
	// Normalize newlines.
	// https://www.w3.org/TR/css-syntax-3/#input-preprocessing
	input = strings.Replace(input, "\r\n", "\n", -1)
	input = strings.Replace(input, "\r", "\n", -1)
	input = strings.Replace(input, "\f", "\n", -1)
	input = strings.Replace(input, "\u0000", "\ufffd", -1)
	return &Scanner{
		input: input,
		row:   1,
		col:   1,
	}
}
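
// A minimal usage sketch, assuming the package is imported as "scanner"
// (the input here is illustrative only):
//
//	s := scanner.New("div{margin:0}")
//	for {
//		tok := s.Next()
//		if tok.Type == scanner.TokenEOF || tok.Type == scanner.TokenError {
//			break
//		}
//		fmt.Println(tok)
//	}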

// Scanner scans an input and emits tokens following the CSS3 specification.
type Scanner struct {
	input string
	pos   int
	row   int
	col   int
	err   *Token
}

// Next returns the next token from the input.
//
// At the end of the input the token type is TokenEOF.
//
// If the input can't be tokenized the token type is TokenError. This occurs
// in case of unclosed quotation marks or comments.
func (s *Scanner) Next() *Token {
	if s.err != nil {
		return s.err
	}
	if s.pos >= len(s.input) {
		s.err = &Token{TokenEOF, "", s.row, s.col}
		return s.err
	}
	if s.pos == 0 {
		// Test BOM only once, at the beginning of the file.
		if strings.HasPrefix(s.input, "\uFEFF") {
			// The BOM is a multi-byte rune, so use emitToken to keep the
			// column count in runes rather than bytes.
			return s.emitToken(TokenBOM, "\uFEFF")
		}
	}
	// There's a lot we can guess based on the first byte, so we'll take a
	// shortcut before testing multiple regexps.
	input := s.input[s.pos:]
	switch input[0] {
	case '\t', '\n', ' ':
		// Whitespace.
		return s.emitToken(TokenS, matchers[TokenS].FindString(input))
	case '.':
		// A dot is too common not to get a quick check. If it is not
		// followed by a digit it is a Char; otherwise it starts a number,
		// percentage or dimension, which is matched by the regexps below.
		if len(input) > 1 && !unicode.IsDigit(rune(input[1])) {
			return s.emitSimple(TokenChar, ".")
		}
	case '#':
		// Another common one: Hash or Char.
		if match := matchers[TokenHash].FindString(input); match != "" {
			return s.emitToken(TokenHash, match)
		}
		return s.emitSimple(TokenChar, "#")
	case '@':
		// Another common one: AtKeyword or Char.
		if match := matchers[TokenAtKeyword].FindString(input); match != "" {
			// Idents may contain non-ASCII characters, so use emitToken
			// rather than emitSimple to track the position correctly.
			return s.emitToken(TokenAtKeyword, match)
		}
		return s.emitSimple(TokenChar, "@")
	case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}':
		// More common chars.
		return s.emitSimple(TokenChar, string(input[0]))
	case '"', '\'':
		// String or error.
		match := matchers[TokenString].FindString(input)
		if match != "" {
			return s.emitToken(TokenString, match)
		}
		s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col}
		return s.err
	case '/':
		// Comment, error or Char.
		if len(input) > 1 && input[1] == '*' {
			match := matchers[TokenComment].FindString(input)
			if match != "" {
				return s.emitToken(TokenComment, match)
			}
			s.err = &Token{TokenError, "unclosed comment", s.row, s.col}
			return s.err
		}
		return s.emitSimple(TokenChar, "/")
	case '~':
		// Includes or Char.
		return s.emitPrefixOrChar(TokenIncludes, "~=")
	case '|':
		// DashMatch or Char.
		return s.emitPrefixOrChar(TokenDashMatch, "|=")
	case '^':
		// PrefixMatch or Char.
		return s.emitPrefixOrChar(TokenPrefixMatch, "^=")
	case '$':
		// SuffixMatch or Char.
		return s.emitPrefixOrChar(TokenSuffixMatch, "$=")
	case '*':
		// SubstringMatch or Char.
		return s.emitPrefixOrChar(TokenSubstringMatch, "*=")
	case '<':
		// CDO or Char.
		return s.emitPrefixOrChar(TokenCDO, "<!--")
	}
	// Test all regexps, in order.
	for _, token := range matchOrder {
		if match := matchers[token].FindString(input); match != "" {
			return s.emitToken(token, match)
		}
	}
	// We already handled unclosed quotation marks and comments,
	// so this can only be a Char.
	r, width := utf8.DecodeRuneInString(input)
	token := &Token{TokenChar, string(r), s.row, s.col}
	// col counts runes while pos counts bytes.
	s.col++
	s.pos += width
	return token
}
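
// As an illustration, scanning the input "a{color:red}" produces the token
// stream IDENT("a"), CHAR("{"), IDENT("color"), CHAR(":"), IDENT("red"),
// CHAR("}") and then TokenEOF.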

// updatePosition updates input coordinates based on the consumed text.
func (s *Scanner) updatePosition(text string) {
	width := utf8.RuneCountInString(text)
	lines := strings.Count(text, "\n")
	s.row += lines
	if lines == 0 {
		s.col += width
	} else {
		// The slice below starts at the last newline itself; counting that
		// newline keeps the column 1-based.
		s.col = utf8.RuneCountInString(text[strings.LastIndex(text, "\n"):])
	}
	s.pos += len(text) // while col is a rune index, pos is a byte index
}

// emitToken returns a Token for the string v and updates the scanner position.
func (s *Scanner) emitToken(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.updatePosition(v)
	return token
}

// emitSimple returns a Token for the string v and updates the scanner
// position in a simplified manner.
//
// The string is known to contain only ASCII characters and no newlines.
func (s *Scanner) emitSimple(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.col += len(v)
	s.pos += len(v)
	return token
}

// emitPrefixOrChar returns a Token of type t if the current position
// matches the given prefix. Otherwise it returns a Char token using the
// first character of the prefix.
//
// The prefix is known to contain only ASCII characters and no newlines.
func (s *Scanner) emitPrefixOrChar(t tokenType, prefix string) *Token {
	if strings.HasPrefix(s.input[s.pos:], prefix) {
		return s.emitSimple(t, prefix)
	}
	return s.emitSimple(TokenChar, string(prefix[0]))
}
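
// For example, with "~=" at the current position, a call to
// emitPrefixOrChar(TokenIncludes, "~=") emits INCLUDES, while with "~x" it
// falls back to a CHAR token for "~".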