<lexer>
  <!-- Lexer definition for WebVTT caption/subtitle files (*.vtt, text/vtt). -->
  <config>
    <name>WebVTT</name>
    <alias>vtt</alias>
    <filename>*.vtt</filename>
    <mime_type>text/vtt</mime_type>
  </config>
  <!--
    The WebVTT spec defines a WebVTT line terminator as either CRLF, CR or LF
    (https://www.w3.org/TR/webvtt1/#webvtt-line-terminator). However, with this
    definition it is unclear whether CRLF is one line terminator (CRLF) or two
    line terminators (CR and LF).

    To work around this ambiguity, only CRLF and LF are treated as line terminators.
    To my knowledge only classic Mac OS uses CR as a line terminator, so the lexer should
    still work for most files.

    Authoring note: inside pattern attributes the less-than sign and the ampersand must be
    written as character entities, otherwise this file is not well-formed XML.
  -->
  <rules>
    <!-- https://www.w3.org/TR/webvtt1/#webvtt-file-body -->
    <state name="root">
      <!-- File signature: WEBVTT at the very start of the input, an optional header text
           introduced by a space or tab, then at least two line terminators. -->
      <rule pattern="(\AWEBVTT)((?:[ \t][^\r\n]*)?(?:\r?\n){2,})">
        <bygroups>
          <token type="Keyword" />
          <token type="Text" />
        </bygroups>
      </rule>
      <!-- REGION block header. The end of the line is asserted with a look-ahead that
           tolerates a CR in front of the LF, so CRLF files are handled like LF files
           (a bare end-of-line anchor does not match in front of a CR in common regex
           engines). The settings themselves are lexed in region-settings-list. -->
      <rule pattern="(^REGION)([ \t]*)(?=\r?$)">
        <bygroups>
          <token type="Keyword" />
          <token type="Text" />
        </bygroups>
        <push state="region-settings-list" />
      </rule>
      <!-- STYLE block: header line, then everything up to the first blank line is handed
           to the CSS lexer. The body may not contain the cue arrow. The header line end
           is asserted the same CRLF-tolerant way as for REGION. -->
      <rule
        pattern="(^STYLE)([ \t]*)(?=\r?$)((?:(?!-->)[\s\S])*?)((?:\r?\n){2})">
        <bygroups>
          <token type="Keyword" />
          <token type="Text" />
          <using lexer="CSS" />
          <token type="Text" />
        </bygroups>
      </rule>
      <rule>
        <include state="comment" />
      </rule>
      <!-- Zero-width detection of the first cue: an optional identifier line (a line
           without the cue arrow) followed by a timing line. Nothing is consumed here;
           the cues state does the actual tokenising. Hours are optional and, per the
           WebVTT timestamp definition, may have two or more digits. -->
      <rule
        pattern="(?=((?![^\r\n]*-->)[^\r\n]*\r?\n)?(\d{2,}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3}[ \t]+-->[ \t]+(\d{2,}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3})">
        <push state="cues" />
      </rule>
    </state>

    <!-- https://www.w3.org/TR/webvtt1/#webvtt-region-settings-list -->
    <state name="region-settings-list">
      <!-- Spaces, tabs and single line terminators separate settings. -->
      <rule pattern="(?: |\t|\r?\n(?!\r?\n))+">
        <token type="Text" />
      </rule>
      <!-- A blank line ends the region block and returns to the pushing state. -->
      <rule pattern="(?:\r?\n){2}">
        <token type="Text" />
        <pop depth="1" />
      </rule>
      <!-- id:value; the value must not start with the cue arrow. -->
      <rule pattern="(id)(:)(?!-->)(\S+)">
        <bygroups>
          <token type="Keyword" />
          <token type="Punctuation" />
          <token type="Literal" />
        </bygroups>
      </rule>
      <!-- width:percentage -->
      <rule pattern="(width)(:)((?:[1-9]?\d|100)(?:\.\d+)?)(%)">
        <bygroups>
          <token type="Keyword" />
          <token type="Punctuation" />
          <token type="Literal" />
          <token type="KeywordType" />
        </bygroups>
      </rule>
      <!-- lines:integer -->
      <rule pattern="(lines)(:)(\d+)">
        <bygroups>
          <token type="Keyword" />
          <token type="Punctuation" />
          <token type="Literal" />
        </bygroups>
      </rule>
      <!-- regionanchor / viewportanchor: two comma-separated percentages. -->
      <rule
        pattern="(regionanchor|viewportanchor)(:)((?:[1-9]?\d|100)(?:\.\d+)?)(%)(,)((?:[1-9]?\d|100)(?:\.\d+)?)(%)">
        <bygroups>
          <token type="Keyword" />
          <token type="Punctuation" />
          <token type="Literal" />
          <token type="KeywordType" />
          <token type="Punctuation" />
          <token type="Literal" />
          <token type="KeywordType" />
        </bygroups>
      </rule>
      <!-- scroll:up is the only scroll value recognised here. -->
      <rule pattern="(scroll)(:)(up)">
        <bygroups>
          <token type="Keyword" />
          <token type="Punctuation" />
          <token type="KeywordConstant" />
        </bygroups>
      </rule>
    </state>

    <!-- https://www.w3.org/TR/webvtt1/#webvtt-comment-block -->
    <state name="comment">
      <!-- NOTE followed by a space, tab or line terminator; the comment runs lazily up to
           the first blank line or the end of input and may not contain the cue arrow. -->
      <rule
        pattern="^NOTE( |\t|\r?\n)((?!-->)[\s\S])*?(?:(\r?\n){2}|\Z)">
        <token type="Comment" />
      </rule>
    </state>

    <!--
      "Zero or more WebVTT cue blocks and WebVTT comment blocks separated from each other by one or more
      WebVTT line terminators." (https://www.w3.org/TR/webvtt1/#file-structure)
    -->
    <state name="cues">
      <!-- Optional cue identifier line, then "start arrow end" timings and trailing blanks.
           Groups: identifier, line terminator, start time, blank, arrow, blank, end time,
           blank. Hours are optional and may have two or more digits. The remainder of the
           timing line is lexed in cue-settings-list. -->
      <rule
        pattern="(?:((?!-->)[^\r\n]+)?(\r?\n))?((?:\d{2,}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3})([ \t]+)(-->)([ \t]+)((?:\d{2,}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3})([ \t]*)">
        <bygroups>
          <token type="Name" />
          <token type="Text" />
          <token type="LiteralDate" />
          <token type="Text" />
          <token type="Operator" />
          <token type="Text" />
          <token type="LiteralDate" />
          <token type="Text" />
        </bygroups>
        <push state="cue-settings-list" />
      </rule>
      <rule>
        <include state="comment" />
      </rule>
    </state>

    <!-- https://www.w3.org/TR/webvtt1/#webvtt-cue-settings-list -->
    <state name="cue-settings-list">
      <rule pattern="[ \t]+">
        <token type="Text" />
      </rule>
      <!-- In the setting rules below the colon and the value are optional, so a partially
           typed setting is still highlighted instead of producing an error. -->
      <rule pattern="(vertical)(:)?(rl|lr)?">
        <bygroups>
          <token type="Keyword" />
          <token type="Punctuation" />
          <token type="KeywordConstant" />
        </bygroups>
      </rule>
      <!-- line: percentage or signed integer, optionally followed by an alignment keyword. -->
      <rule
        pattern="(line)(:)?(?:(?:((?:[1-9]?\d|100)(?:\.\d+)?)(%)|(-?\d+))(?:(,)(start|center|end))?)?">
        <bygroups>
          <token type="Keyword" />
          <token type="Punctuation" />
          <token type="Literal" />
          <token type="KeywordType" />
          <token type="Literal" />
          <token type="Punctuation" />
          <token type="KeywordConstant" />
        </bygroups>
      </rule>
      <!-- position: same value shape as line, with its own alignment keywords. -->
      <rule
        pattern="(position)(:)?(?:(?:((?:[1-9]?\d|100)(?:\.\d+)?)(%)|(-?\d+))(?:(,)(line-left|center|line-right))?)?">
        <bygroups>
          <token type="Keyword" />
          <token type="Punctuation" />
          <token type="Literal" />
          <token type="KeywordType" />
          <token type="Literal" />
          <token type="Punctuation" />
          <token type="KeywordConstant" />
        </bygroups>
      </rule>
      <rule pattern="(size)(:)?(?:((?:[1-9]?\d|100)(?:\.\d+)?)(%))?">
        <bygroups>
          <token type="Keyword" />
          <token type="Punctuation" />
          <token type="Literal" />
          <token type="KeywordType" />
        </bygroups>
      </rule>
      <rule pattern="(align)(:)?(start|center|end|left|right)?">
        <bygroups>
          <token type="Keyword" />
          <token type="Punctuation" />
          <token type="KeywordConstant" />
        </bygroups>
      </rule>
      <rule pattern="(region)(:)?((?![^\r\n]*-->(?=[ \t]+?))[^ \t\r\n]+)?">
        <bygroups>
          <token type="Keyword" />
          <token type="Punctuation" />
          <token type="Literal" />
        </bygroups>
      </rule>
      <!-- End of the timing line: switch to the payload without consuming the line
           terminator, so that cue-payload can detect an immediately following blank line. -->
      <rule
        pattern="(?=\r?\n)">
        <push state="cue-payload" />
      </rule>
    </state>

    <!-- https://www.w3.org/TR/webvtt1/#cue-payload -->
    <state name="cue-payload">
      <!-- A blank line ends the cue: pop cue-payload and cue-settings-list, which returns
           to the cues state. -->
      <rule pattern="(\r?\n){2,}">
        <token type="Text" />
        <pop depth="2" />
      </rule>
      <!-- Plain text, one character per match (lazy quantifier) so that the blank-line rule
           above is tried again at every position. -->
      <rule pattern="[^&lt;&amp;]+?">
        <token type="Text" />
      </rule>
      <!-- Character references: decimal, hexadecimal or named. -->
      <rule pattern="&amp;(#\d+|#x[0-9A-Fa-f]+|[a-zA-Z0-9]+);">
        <token type="Text" />
      </rule>
      <!-- A less-than sign starts a cue span tag; it is consumed in cue-span-tag. -->
      <rule pattern="(?=&lt;)">
        <token type="Text" />
        <push state="cue-span-tag" />
      </rule>
    </state>
    <!-- One tag inside the payload: a start tag, an end tag or a timestamp tag. -->
    <state name="cue-span-tag">
      <!-- Start of a start tag or timestamp tag; the name itself is lexed in
           cue-span-start-tag-name. Timestamp hours may have two or more digits. -->
      <rule
        pattern="&lt;(?=c|i|b|u|ruby|rt|v|lang|(?:\d{2,}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3})">
        <token type="Punctuation" />
        <push state="cue-span-start-tag-name" />
      </rule>
      <!-- End tag. -->
      <rule pattern="(&lt;/)(c|i|b|u|ruby|rt|v|lang)">
        <bygroups>
          <token type="Punctuation" />
          <token type="NameTag" />
        </bygroups>
      </rule>
      <!-- Closing angle bracket: back to cue-payload. -->
      <rule pattern=">">
        <token type="Punctuation" />
        <pop depth="1" />
      </rule>
    </state>
    <state name="cue-span-start-tag-name">
      <!-- Tags that take classes only, and timestamp tags. -->
      <rule pattern="(c|i|b|u|ruby|rt)|((?:\d{2,}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3})">
        <bygroups>
          <token type="NameTag" />
          <token type="LiteralDate" />
        </bygroups>
        <push state="cue-span-classes-without-annotations" />
      </rule>
      <!-- Tags that may additionally carry an annotation after the classes. -->
      <rule pattern="v|lang">
        <token type="NameTag" />
        <push state="cue-span-classes-with-annotations" />
      </rule>
    </state>
    <state name="cue-span-classes-without-annotations">
      <rule>
        <include state="cue-span-classes" />
      </rule>
      <!-- End of the tag: pop this state and cue-span-start-tag-name so that cue-span-tag
           consumes the closing angle bracket. -->
      <rule pattern="(?=>)">
        <pop depth="2" />
      </rule>
    </state>
    <state name="cue-span-classes-with-annotations">
      <rule>
        <include state="cue-span-classes" />
      </rule>
      <!-- A space or tab introduces the annotation. -->
      <rule pattern="(?=[ \t])">
        <push state="cue-span-start-tag-annotations" />
      </rule>
      <!-- Tag closed without an annotation: leave the tag the same way as
           cue-span-classes-without-annotations does, instead of staying in this state
           with no matching rule. -->
      <rule pattern="(?=>)">
        <pop depth="2" />
      </rule>
    </state>
    <!-- Shared by both class states: a dot followed by a class name. -->
    <state name="cue-span-classes">
      <rule pattern="(\.)([^ \t\n\r&amp;&lt;>\.]+)">
        <bygroups>
          <token type="Punctuation" />
          <token type="NameTag" />
        </bygroups>
      </rule>
    </state>
    <state name="cue-span-start-tag-annotations">
      <!-- Annotation text: a leading space or tab, then characters or character references
           up to the closing angle bracket. -->
      <rule
        pattern="[ \t](?:[^\n\r&amp;>]|&amp;(?:#\d+|#x[0-9A-Fa-f]+|[a-zA-Z0-9]+);)+">
        <token type="Text" />
      </rule>
      <!-- End of the tag: pop this state, cue-span-classes-with-annotations and
           cue-span-start-tag-name so that cue-span-tag consumes the closing bracket. -->
      <rule pattern="(?=>)">
        <token type="Text" />
        <pop depth="3" />
      </rule>
    </state>
  </rules>
</lexer>