<!--
Lexer for RFC-4180 compliant CSV subject to the following additions:
- UTF-8 encoding is accepted (the RFC requires 7-bit ASCII)
- The line terminator character can be LF or CRLF (the RFC allows CRLF only)

Link to the RFC-4180 specification: https://tools.ietf.org/html/rfc4180

Additions inspired by:
https://github.com/frictionlessdata/datapackage/issues/204#issuecomment-193242077

Future improvements:
- Identify non-quoted numbers as LiteralNumber
- Identify y as an error in "x"y. Currently it's identified as another string
  literal.
-->

<lexer>
    <!-- Lexer metadata: name, alias, filename glob and MIME type. -->
    <config>
        <name>CSV</name>
        <alias>csv</alias>
        <filename>*.csv</filename>
        <mime_type>text/csv</mime_type>
    </config>
    <rules>
        <!-- root: everything outside of a quoted field. -->
        <state name="root">
            <!-- Record terminator: LF, optionally preceded by CR. -->
            <rule pattern="\r?\n">
                <token type="Punctuation" />
            </rule>
            <!-- Field separator. -->
            <rule pattern=",">
                <token type="Punctuation" />
            </rule>
            <!--
            Opening double quote of a quoted field; switches to the "escaped"
            state. The quote is written as &quot; because a raw double quote
            cannot appear inside a double-quoted XML attribute value.
            -->
            <rule pattern="&quot;">
                <token type="LiteralStringDouble" />
                <push state="escaped" />
            </rule>
            <!-- Unquoted field: a run of characters other than CR, LF and comma. -->
            <rule pattern="[^\r\n,]+">
                <token type="LiteralString" />
            </rule>
        </state>
        <!-- escaped: the inside of a quoted field, entered from "root". -->
        <state name="escaped">
            <!--
            Two consecutive double quotes: an escaped quote inside the field.
            Listed before the closing-quote rule so the doubled form is tried
            first (assumes the lexer engine tries rules in document order).
            -->
            <rule pattern="&quot;&quot;">
                <token type="LiteralStringEscape" />
            </rule>
            <!-- A single double quote closes the field and returns to the previous state. -->
            <rule pattern="&quot;">
                <token type="LiteralStringDouble" />
                <pop depth="1" />
            </rule>
            <!-- Field content: any run of non-quote characters, including commas and line breaks. -->
            <rule pattern="[^&quot;]+">
                <token type="LiteralStringDouble" />
            </rule>
        </state>
    </rules>
</lexer>