1<!--
2Lexer for RFC-4180 compliant CSV subject to the following additions:
3- UTF-8 encoding is accepted (the RFC requires 7-bit ASCII)
4- The line terminator character can be LF or CRLF (the RFC allows CRLF only)
5
6Link to the RFC-4180 specification: https://tools.ietf.org/html/rfc4180
7
8Additions inspired by:
9https://github.com/frictionlessdata/datapackage/issues/204#issuecomment-193242077
10
11Future improvements:
12- Identify non-quoted numbers as LiteralNumber
13- Identify y as an error in "x"y. Currently it's identified as another string
14 literal.
15-->
16
<lexer>
  <!-- Lexer metadata: display name, lookup alias, matching file glob and MIME type. -->
  <config>
    <name>CSV</name>
    <alias>csv</alias>
    <filename>*.csv</filename>
    <mime_type>text/csv</mime_type>
  </config>
  <!--
    NOTE: a literal double quote inside a pattern attribute must be written as
    &quot; because the attribute value itself is delimited by double quotes.
    The XML parser turns &quot; back into a plain quote character before the
    pattern is used as a regular expression.
  -->
  <rules>
    <!-- "root": text outside of a quoted field. -->
    <state name="root">
      <!-- Record separator: LF, optionally preceded by CR. -->
      <rule pattern="\r?\n">
        <token type="Punctuation"/>
      </rule>
      <!-- Field separator. -->
      <rule pattern=",">
        <token type="Punctuation"/>
      </rule>
      <!-- Opening double quote: emit it and switch to the "escaped" state. -->
      <rule pattern="&quot;">
        <token type="LiteralStringDouble"/>
        <push state="escaped"/>
      </rule>
      <!--
        Unquoted field content: a run of anything except CR, LF and comma.
        The class does not exclude the double quote, so text that directly
        follows a closing quote (the y in "x"y) is emitted as a plain string;
        see "Future improvements" in the file header.
      -->
      <rule pattern="[^\r\n,]+">
        <token type="LiteralString"/>
      </rule>
    </state>
    <!-- "escaped": text inside a double-quoted field. -->
    <state name="escaped">
      <!--
        Two consecutive double quotes stand for one literal quote. This rule is
        listed before the single-quote rule below so that, with rules tried in
        order, a doubled quote is not taken as the end of the field.
      -->
      <rule pattern="&quot;&quot;">
        <token type="LiteralStringEscape"/>
      </rule>
      <!-- Closing double quote: emit it and pop one state off the stack. -->
      <rule pattern="&quot;">
        <token type="LiteralStringDouble"/>
        <pop depth="1"/>
      </rule>
      <!-- Quoted field content: a run of anything except a double quote (commas and line breaks included). -->
      <rule pattern="[^&quot;]+">
        <token type="LiteralStringDouble"/>
      </rule>
    </state>
  </rules>
</lexer>