1// Copyright 2010 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package html
6
7import (
8 "errors"
9 "fmt"
10 "io"
11 "strings"
12
13 a "golang.org/x/net/html/atom"
14)
15
// A parser implements the HTML5 parsing algorithm:
// https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
//
// It bundles the tokenizer together with all of the mutable tree-construction
// state: the document being built, the element stacks, the current insertion
// mode and the flags that the insertion mode functions read and update.
type parser struct {
	// tokenizer provides the tokens for the parser.
	tokenizer *Tokenizer
	// tok is the most recently read token.
	tok Token
	// Self-closing tags like <hr/> are treated as start tags, except that
	// hasSelfClosingToken is set while they are being processed.
	// acknowledgeSelfClosingTag clears it again.
	hasSelfClosingToken bool
	// doc is the document root element. It also serves as the current node
	// whenever the stack of open elements has no top element (see top).
	doc *Node
	// The stack of open elements (section 12.2.4.2) and active formatting
	// elements (section 12.2.4.3).
	oe, afe nodeStack
	// Element pointers (section 12.2.4.4).
	head, form *Node
	// Other parsing state flags (section 12.2.4.5).
	scripting, framesetOK bool
	// The stack of template insertion modes. An entry is pushed when a
	// <template> start tag is processed and popped on its end tag.
	templateStack insertionModeStack
	// im is the current insertion mode.
	im insertionMode
	// originalIM is the insertion mode to go back to after completing a text
	// or inTableText insertion mode.
	originalIM insertionMode
	// fosterParenting is whether new elements should be inserted according to
	// the foster parenting rules (section 12.2.6.1).
	fosterParenting bool
	// quirks is whether the parser is operating in "quirks mode."
	quirks bool
	// fragment is whether the parser is parsing an HTML fragment.
	fragment bool
	// context is the context element when parsing an HTML fragment
	// (section 12.4).
	context *Node
}
53
54func (p *parser) top() *Node {
55 if n := p.oe.top(); n != nil {
56 return n
57 }
58 return p.doc
59}
60
// Stop tags for use in popUntil. These come from section 12.2.4.2.
//
// The map is keyed by element namespace ("" is the namespace used for plain
// HTML elements); each value lists the tags in that namespace that terminate
// a search made with defaultScope, listItemScope or buttonScope.
var (
	defaultScopeStopTags = map[string][]a.Atom{
		"":     {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template},
		"math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext},
		"svg":  {a.Desc, a.ForeignObject, a.Title},
	}
)
69
// scope selects which set of stop tags bounds an operation on the stack of
// open elements (see indexOfElementInScope and clearStackToContext).
type scope int

const (
	// defaultScope stops only at the tags in defaultScopeStopTags.
	defaultScope scope = iota
	// listItemScope is defaultScope plus "ol" and "ul".
	listItemScope
	// buttonScope is defaultScope plus "button".
	buttonScope
	// tableScope stops at "html", "table" and "template".
	tableScope
	// tableRowScope is used by clearStackToContext, where it stops at
	// "html", "tr" and "template".
	tableRowScope
	// tableBodyScope is used by clearStackToContext, where it stops at
	// "html", "tbody", "tfoot", "thead" and "template".
	tableBodyScope
	// selectScope stops at every tag other than "optgroup" and "option".
	selectScope
)
81
82// popUntil pops the stack of open elements at the highest element whose tag
83// is in matchTags, provided there is no higher element in the scope's stop
84// tags (as defined in section 12.2.4.2). It returns whether or not there was
85// such an element. If there was not, popUntil leaves the stack unchanged.
86//
87// For example, the set of stop tags for table scope is: "html", "table". If
88// the stack was:
89// ["html", "body", "font", "table", "b", "i", "u"]
90// then popUntil(tableScope, "font") would return false, but
91// popUntil(tableScope, "i") would return true and the stack would become:
92// ["html", "body", "font", "table", "b"]
93//
94// If an element's tag is in both the stop tags and matchTags, then the stack
95// will be popped and the function returns true (provided, of course, there was
96// no higher element in the stack that was also in the stop tags). For example,
97// popUntil(tableScope, "table") returns true and leaves:
98// ["html", "body", "font"]
99func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool {
100 if i := p.indexOfElementInScope(s, matchTags...); i != -1 {
101 p.oe = p.oe[:i]
102 return true
103 }
104 return false
105}
106
107// indexOfElementInScope returns the index in p.oe of the highest element whose
108// tag is in matchTags that is in scope. If no matching element is in scope, it
109// returns -1.
110func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int {
111 for i := len(p.oe) - 1; i >= 0; i-- {
112 tagAtom := p.oe[i].DataAtom
113 if p.oe[i].Namespace == "" {
114 for _, t := range matchTags {
115 if t == tagAtom {
116 return i
117 }
118 }
119 switch s {
120 case defaultScope:
121 // No-op.
122 case listItemScope:
123 if tagAtom == a.Ol || tagAtom == a.Ul {
124 return -1
125 }
126 case buttonScope:
127 if tagAtom == a.Button {
128 return -1
129 }
130 case tableScope:
131 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
132 return -1
133 }
134 case selectScope:
135 if tagAtom != a.Optgroup && tagAtom != a.Option {
136 return -1
137 }
138 default:
139 panic("unreachable")
140 }
141 }
142 switch s {
143 case defaultScope, listItemScope, buttonScope:
144 for _, t := range defaultScopeStopTags[p.oe[i].Namespace] {
145 if t == tagAtom {
146 return -1
147 }
148 }
149 }
150 }
151 return -1
152}
153
154// elementInScope is like popUntil, except that it doesn't modify the stack of
155// open elements.
156func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool {
157 return p.indexOfElementInScope(s, matchTags...) != -1
158}
159
160// clearStackToContext pops elements off the stack of open elements until a
161// scope-defined element is found.
162func (p *parser) clearStackToContext(s scope) {
163 for i := len(p.oe) - 1; i >= 0; i-- {
164 tagAtom := p.oe[i].DataAtom
165 switch s {
166 case tableScope:
167 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
168 p.oe = p.oe[:i+1]
169 return
170 }
171 case tableRowScope:
172 if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template {
173 p.oe = p.oe[:i+1]
174 return
175 }
176 case tableBodyScope:
177 if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template {
178 p.oe = p.oe[:i+1]
179 return
180 }
181 default:
182 panic("unreachable")
183 }
184 }
185}
186
187// parseGenericRawTextElement implements the generic raw text element parsing
188// algorithm defined in 12.2.6.2.
189// https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text
190// TODO: Since both RAWTEXT and RCDATA states are treated as tokenizer's part
191// officially, need to make tokenizer consider both states.
192func (p *parser) parseGenericRawTextElement() {
193 p.addElement()
194 p.originalIM = p.im
195 p.im = textIM
196}
197
198// generateImpliedEndTags pops nodes off the stack of open elements as long as
199// the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc.
200// If exceptions are specified, nodes with that name will not be popped off.
201func (p *parser) generateImpliedEndTags(exceptions ...string) {
202 var i int
203loop:
204 for i = len(p.oe) - 1; i >= 0; i-- {
205 n := p.oe[i]
206 if n.Type != ElementNode {
207 break
208 }
209 switch n.DataAtom {
210 case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc:
211 for _, except := range exceptions {
212 if n.Data == except {
213 break loop
214 }
215 }
216 continue
217 }
218 break
219 }
220
221 p.oe = p.oe[:i+1]
222}
223
224// addChild adds a child node n to the top element, and pushes n onto the stack
225// of open elements if it is an element node.
226func (p *parser) addChild(n *Node) {
227 if p.shouldFosterParent() {
228 p.fosterParent(n)
229 } else {
230 p.top().AppendChild(n)
231 }
232
233 if n.Type == ElementNode {
234 p.oe = append(p.oe, n)
235 }
236}
237
238// shouldFosterParent returns whether the next node to be added should be
239// foster parented.
240func (p *parser) shouldFosterParent() bool {
241 if p.fosterParenting {
242 switch p.top().DataAtom {
243 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
244 return true
245 }
246 }
247 return false
248}
249
// fosterParent adds a child node according to the foster parenting rules.
// Section 12.2.6.1, "foster parenting".
//
// Instead of being appended to the current node, n is placed relative to the
// most recently opened table (or template) on the stack of open elements. A
// text node is merged into an adjacent text node rather than inserted as a
// separate node.
func (p *parser) fosterParent(n *Node) {
	var table, parent, prev, template *Node
	var i int
	// Find the most recently opened table; i is left at its index in the
	// stack, or at -1 if there is none.
	for i = len(p.oe) - 1; i >= 0; i-- {
		if p.oe[i].DataAtom == a.Table {
			table = p.oe[i]
			break
		}
	}

	var j int
	// Likewise for the most recently opened template; j is its index or -1.
	for j = len(p.oe) - 1; j >= 0; j-- {
		if p.oe[j].DataAtom == a.Template {
			template = p.oe[j]
			break
		}
	}

	// A template that was opened after the table (or that exists when no
	// table does) receives the node directly.
	if template != nil && (table == nil || j > i) {
		template.AppendChild(n)
		return
	}

	if table == nil {
		// The foster parent is the html element.
		parent = p.oe[0]
	} else {
		parent = table.Parent
	}
	if parent == nil {
		// The table has no parent node, so fall back to the element just
		// before the table in the stack of open elements.
		parent = p.oe[i-1]
	}

	// prev is the node that n would directly follow: the table's previous
	// sibling, or the parent's last child when there is no table.
	if table != nil {
		prev = table.PrevSibling
	} else {
		prev = parent.LastChild
	}
	if prev != nil && prev.Type == TextNode && n.Type == TextNode {
		// Coalesce adjacent text instead of creating a second text node.
		prev.Data += n.Data
		return
	}

	// NOTE(review): table may be nil here; presumably InsertBefore then
	// appends n to parent — confirm against Node.InsertBefore.
	parent.InsertBefore(n, table)
}
297
298// addText adds text to the preceding node if it is a text node, or else it
299// calls addChild with a new text node.
300func (p *parser) addText(text string) {
301 if text == "" {
302 return
303 }
304
305 if p.shouldFosterParent() {
306 p.fosterParent(&Node{
307 Type: TextNode,
308 Data: text,
309 })
310 return
311 }
312
313 t := p.top()
314 if n := t.LastChild; n != nil && n.Type == TextNode {
315 n.Data += text
316 return
317 }
318 p.addChild(&Node{
319 Type: TextNode,
320 Data: text,
321 })
322}
323
324// addElement adds a child element based on the current token.
325func (p *parser) addElement() {
326 p.addChild(&Node{
327 Type: ElementNode,
328 DataAtom: p.tok.DataAtom,
329 Data: p.tok.Data,
330 Attr: p.tok.Attr,
331 })
332}
333
// addFormattingElement inserts an element for the current token and pushes it
// onto the list of active formatting elements, first pruning older identical
// entries as described below.
// Section 12.2.4.3.
func (p *parser) addFormattingElement() {
	// Capture the token's tag and attributes for the comparison loop below.
	tagAtom, attr := p.tok.DataAtom, p.tok.Attr
	p.addElement()

	// Implement the Noah's Ark clause, but with three per family instead of two.
	identicalElements := 0
findIdenticalElements:
	// Walk the list from the newest entry back to the last scope marker.
	for i := len(p.afe) - 1; i >= 0; i-- {
		n := p.afe[i]
		if n.Type == scopeMarkerNode {
			break
		}
		if n.Type != ElementNode {
			continue
		}
		if n.Namespace != "" {
			continue
		}
		if n.DataAtom != tagAtom {
			continue
		}
		if len(n.Attr) != len(attr) {
			continue
		}
	compareAttributes:
		for _, t0 := range n.Attr {
			for _, t1 := range attr {
				if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
					// Found a match for this attribute, continue with the next attribute.
					continue compareAttributes
				}
			}
			// If we get here, there is no attribute that matches t0.
			// Therefore the element is not identical to the new one.
			continue findIdenticalElements
		}

		// n has the same tag and the same attributes as the new element.
		identicalElements++
		if identicalElements >= 3 {
			p.afe.remove(n)
		}
	}

	// The element just added by addElement is now the current node.
	p.afe = append(p.afe, p.top())
}
380
381// Section 12.2.4.3.
382func (p *parser) clearActiveFormattingElements() {
383 for {
384 if n := p.afe.pop(); len(p.afe) == 0 || n.Type == scopeMarkerNode {
385 return
386 }
387 }
388}
389
// reconstructActiveFormattingElements implements "reconstruct the active
// formatting elements". The trailing entries of the list of active formatting
// elements that are neither scope markers nor on the stack of open elements
// are cloned; each clone is inserted with addChild and replaces the original
// entry in the list.
// Section 12.2.4.3.
func (p *parser) reconstructActiveFormattingElements() {
	n := p.afe.top()
	if n == nil {
		// The list has no last entry, so there is nothing to reconstruct.
		return
	}
	if n.Type == scopeMarkerNode || p.oe.index(n) != -1 {
		// The last entry is a marker or is still open.
		return
	}
	i := len(p.afe) - 1
	// Rewind: step backwards until an entry is found that is a marker or is
	// still open. i ends at that entry's index, or at -1 if the start of the
	// list was reached first.
	for n.Type != scopeMarkerNode && p.oe.index(n) == -1 {
		if i == 0 {
			i = -1
			break
		}
		i--
		n = p.afe[i]
	}
	// Advance: re-create every entry after that point, oldest first.
	for {
		i++
		clone := p.afe[i].clone()
		p.addChild(clone)
		p.afe[i] = clone
		if i == len(p.afe)-1 {
			break
		}
	}
}
418
// acknowledgeSelfClosingTag records that the self-closing flag of the token
// being processed has been handled by clearing hasSelfClosingToken.
// Section 12.2.5.
func (p *parser) acknowledgeSelfClosingTag() {
	p.hasSelfClosingToken = false
}
423
// An insertion mode (section 12.2.4.1) is the state transition function from
// a particular state in the HTML5 parser's state machine. It updates the
// parser's fields depending on parser.tok (where ErrorToken means EOF).
// It returns whether the token was consumed; a false result means the same
// token still has to be processed, typically by the insertion mode that the
// function has just switched to.
type insertionMode func(*parser) bool
429
430// setOriginalIM sets the insertion mode to return to after completing a text or
431// inTableText insertion mode.
432// Section 12.2.4.1, "using the rules for".
433func (p *parser) setOriginalIM() {
434 if p.originalIM != nil {
435 panic("html: bad parser state: originalIM was set twice")
436 }
437 p.originalIM = p.im
438}
439
// resetInsertionMode picks the insertion mode that matches the current stack
// of open elements. It scans the stack from the top down and sets p.im
// according to the first element whose tag determines a mode. If the stack is
// empty, p.im is left unchanged.
// Section 12.2.4.1, "reset the insertion mode".
func (p *parser) resetInsertionMode() {
	for i := len(p.oe) - 1; i >= 0; i-- {
		n := p.oe[i]
		// last is true for the bottom-most element of the stack. When a
		// context element is set, it stands in for that element.
		last := i == 0
		if last && p.context != nil {
			n = p.context
		}

		switch n.DataAtom {
		case a.Select:
			if !last {
				// Look at the elements below the select, nearest first: a
				// template means plain "in select", a table means "in select
				// in table".
				for ancestor, first := n, p.oe[0]; ancestor != first; {
					ancestor = p.oe[p.oe.index(ancestor)-1]
					switch ancestor.DataAtom {
					case a.Template:
						p.im = inSelectIM
						return
					case a.Table:
						p.im = inSelectInTableIM
						return
					}
				}
			}
			p.im = inSelectIM
		case a.Td, a.Th:
			// TODO: remove this divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.im = inCellIM
		case a.Tr:
			p.im = inRowIM
		case a.Tbody, a.Thead, a.Tfoot:
			p.im = inTableBodyIM
		case a.Caption:
			p.im = inCaptionIM
		case a.Colgroup:
			p.im = inColumnGroupIM
		case a.Table:
			p.im = inTableIM
		case a.Template:
			// TODO: remove this divergence from the HTML5 spec.
			if n.Namespace != "" {
				continue
			}
			p.im = p.templateStack.top()
		case a.Head:
			// TODO: remove this divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.im = inHeadIM
		case a.Body:
			p.im = inBodyIM
		case a.Frameset:
			p.im = inFramesetIM
		case a.Html:
			// Before the head element has been seen the mode is "before
			// head"; afterwards it is "after head".
			if p.head == nil {
				p.im = beforeHeadIM
			} else {
				p.im = afterHeadIM
			}
		default:
			// This element does not determine a mode. Default to "in body"
			// once the bottom of the stack has been reached.
			if last {
				p.im = inBodyIM
				return
			}
			continue
		}
		return
	}
}
511
// whitespace is the cutset passed to strings.TrimLeft by the insertion mode
// functions: space, tab, carriage return, line feed and form feed.
const whitespace = " \t\r\n\f"
513
514// Section 12.2.6.4.1.
515func initialIM(p *parser) bool {
516 switch p.tok.Type {
517 case TextToken:
518 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
519 if len(p.tok.Data) == 0 {
520 // It was all whitespace, so ignore it.
521 return true
522 }
523 case CommentToken:
524 p.doc.AppendChild(&Node{
525 Type: CommentNode,
526 Data: p.tok.Data,
527 })
528 return true
529 case DoctypeToken:
530 n, quirks := parseDoctype(p.tok.Data)
531 p.doc.AppendChild(n)
532 p.quirks = quirks
533 p.im = beforeHTMLIM
534 return true
535 }
536 p.quirks = true
537 p.im = beforeHTMLIM
538 return false
539}
540
541// Section 12.2.6.4.2.
542func beforeHTMLIM(p *parser) bool {
543 switch p.tok.Type {
544 case DoctypeToken:
545 // Ignore the token.
546 return true
547 case TextToken:
548 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
549 if len(p.tok.Data) == 0 {
550 // It was all whitespace, so ignore it.
551 return true
552 }
553 case StartTagToken:
554 if p.tok.DataAtom == a.Html {
555 p.addElement()
556 p.im = beforeHeadIM
557 return true
558 }
559 case EndTagToken:
560 switch p.tok.DataAtom {
561 case a.Head, a.Body, a.Html, a.Br:
562 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
563 return false
564 default:
565 // Ignore the token.
566 return true
567 }
568 case CommentToken:
569 p.doc.AppendChild(&Node{
570 Type: CommentNode,
571 Data: p.tok.Data,
572 })
573 return true
574 }
575 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
576 return false
577}
578
579// Section 12.2.6.4.3.
580func beforeHeadIM(p *parser) bool {
581 switch p.tok.Type {
582 case TextToken:
583 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
584 if len(p.tok.Data) == 0 {
585 // It was all whitespace, so ignore it.
586 return true
587 }
588 case StartTagToken:
589 switch p.tok.DataAtom {
590 case a.Head:
591 p.addElement()
592 p.head = p.top()
593 p.im = inHeadIM
594 return true
595 case a.Html:
596 return inBodyIM(p)
597 }
598 case EndTagToken:
599 switch p.tok.DataAtom {
600 case a.Head, a.Body, a.Html, a.Br:
601 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
602 return false
603 default:
604 // Ignore the token.
605 return true
606 }
607 case CommentToken:
608 p.addChild(&Node{
609 Type: CommentNode,
610 Data: p.tok.Data,
611 })
612 return true
613 case DoctypeToken:
614 // Ignore the token.
615 return true
616 }
617
618 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
619 return false
620}
621
// inHeadIM is the "in head" insertion mode. It handles the metadata elements
// that may appear inside <head>, as well as <template>. Any token it does not
// handle causes an implied </head> end tag to be processed, after which the
// token is reprocessed (the function returns false).
// Section 12.2.6.4.4.
func inHeadIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		s := strings.TrimLeft(p.tok.Data, whitespace)
		if len(s) < len(p.tok.Data) {
			// Add the initial whitespace to the current node.
			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
			if s == "" {
				return true
			}
			// Keep only the non-whitespace remainder for reprocessing.
			p.tok.Data = s
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			return inBodyIM(p)
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta:
			// These elements have no content: insert and pop immediately.
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			return true
		case a.Noscript:
			if p.scripting {
				p.parseGenericRawTextElement()
				return true
			}
			p.addElement()
			p.im = inHeadNoscriptIM
			// Don't let the tokenizer go into raw text mode when scripting is disabled.
			p.tokenizer.NextIsNotRawText()
			return true
		case a.Script, a.Title:
			p.addElement()
			p.setOriginalIM()
			p.im = textIM
			return true
		case a.Noframes, a.Style:
			p.parseGenericRawTextElement()
			return true
		case a.Head:
			// Ignore the token.
			return true
		case a.Template:
			// TODO: remove this divergence from the HTML5 spec.
			//
			// We don't handle all of the corner cases when mixing foreign
			// content (i.e. <math> or <svg>) with <template>. Without this
			// early return, we can get into an infinite loop, possibly because
			// of the "TODO... further divergence" a little below.
			//
			// As a workaround, if we are mixing foreign content and templates,
			// just ignore the rest of the HTML. Foreign content is rare and a
			// relatively old HTML feature. Templates are also rare and a
			// relatively new HTML feature. Their combination is very rare.
			for _, e := range p.oe {
				if e.Namespace != "" {
					p.im = ignoreTheRemainingTokens
					return true
				}
			}

			// Open the template: insert it, mark the list of active
			// formatting elements, and push the template insertion mode.
			p.addElement()
			p.afe = append(p.afe, &scopeMarker)
			p.framesetOK = false
			p.im = inTemplateIM
			p.templateStack = append(p.templateStack, inTemplateIM)
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Head:
			p.oe.pop()
			p.im = afterHeadIM
			return true
		case a.Body, a.Html, a.Br:
			p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
			return false
		case a.Template:
			if !p.oe.contains(a.Template) {
				// No template is open, so there is nothing to close.
				return true
			}
			// TODO: remove this further divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.generateImpliedEndTags()
			// Pop everything up to and including the innermost HTML template.
			for i := len(p.oe) - 1; i >= 0; i-- {
				if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
					p.oe = p.oe[:i]
					break
				}
			}
			p.clearActiveFormattingElements()
			p.templateStack.pop()
			p.resetInsertionMode()
			return true
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	}

	// Anything else: act as if </head> had been seen, then reprocess.
	p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
	return false
}
736
737// Section 12.2.6.4.5.
738func inHeadNoscriptIM(p *parser) bool {
739 switch p.tok.Type {
740 case DoctypeToken:
741 // Ignore the token.
742 return true
743 case StartTagToken:
744 switch p.tok.DataAtom {
745 case a.Html:
746 return inBodyIM(p)
747 case a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Style:
748 return inHeadIM(p)
749 case a.Head:
750 // Ignore the token.
751 return true
752 case a.Noscript:
753 // Don't let the tokenizer go into raw text mode even when a <noscript>
754 // tag is in "in head noscript" insertion mode.
755 p.tokenizer.NextIsNotRawText()
756 // Ignore the token.
757 return true
758 }
759 case EndTagToken:
760 switch p.tok.DataAtom {
761 case a.Noscript, a.Br:
762 default:
763 // Ignore the token.
764 return true
765 }
766 case TextToken:
767 s := strings.TrimLeft(p.tok.Data, whitespace)
768 if len(s) == 0 {
769 // It was all whitespace.
770 return inHeadIM(p)
771 }
772 case CommentToken:
773 return inHeadIM(p)
774 }
775 p.oe.pop()
776 if p.top().DataAtom != a.Head {
777 panic("html: the new current node will be a head element.")
778 }
779 p.im = inHeadIM
780 if p.tok.DataAtom == a.Noscript {
781 return true
782 }
783 return false
784}
785
786// Section 12.2.6.4.6.
787func afterHeadIM(p *parser) bool {
788 switch p.tok.Type {
789 case TextToken:
790 s := strings.TrimLeft(p.tok.Data, whitespace)
791 if len(s) < len(p.tok.Data) {
792 // Add the initial whitespace to the current node.
793 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
794 if s == "" {
795 return true
796 }
797 p.tok.Data = s
798 }
799 case StartTagToken:
800 switch p.tok.DataAtom {
801 case a.Html:
802 return inBodyIM(p)
803 case a.Body:
804 p.addElement()
805 p.framesetOK = false
806 p.im = inBodyIM
807 return true
808 case a.Frameset:
809 p.addElement()
810 p.im = inFramesetIM
811 return true
812 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
813 p.oe = append(p.oe, p.head)
814 defer p.oe.remove(p.head)
815 return inHeadIM(p)
816 case a.Head:
817 // Ignore the token.
818 return true
819 }
820 case EndTagToken:
821 switch p.tok.DataAtom {
822 case a.Body, a.Html, a.Br:
823 // Drop down to creating an implied <body> tag.
824 case a.Template:
825 return inHeadIM(p)
826 default:
827 // Ignore the token.
828 return true
829 }
830 case CommentToken:
831 p.addChild(&Node{
832 Type: CommentNode,
833 Data: p.tok.Data,
834 })
835 return true
836 case DoctypeToken:
837 // Ignore the token.
838 return true
839 }
840
841 p.parseImpliedToken(StartTagToken, a.Body, a.Body.String())
842 p.framesetOK = true
843 if p.tok.Type == ErrorToken {
844 // Stop parsing.
845 return true
846 }
847 return false
848}
849
850// copyAttributes copies attributes of src not found on dst to dst.
851func copyAttributes(dst *Node, src Token) {
852 if len(src.Attr) == 0 {
853 return
854 }
855 attr := map[string]string{}
856 for _, t := range dst.Attr {
857 attr[t.Key] = t.Val
858 }
859 for _, t := range src.Attr {
860 if _, ok := attr[t.Key]; !ok {
861 dst.Attr = append(dst.Attr, t)
862 attr[t.Key] = t.Val
863 }
864 }
865}
866
// inBodyIM is the "in body" insertion mode, which handles the bulk of
// ordinary document content. It dispatches on the type of the current token
// and, for tags, on the tag's atom. Most branches fall through to the final
// "return true" (token consumed); a branch returns false when the token has
// been rewritten, or the mode changed, and the token must be reprocessed.
// Section 12.2.6.4.7.
func inBodyIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		d := p.tok.Data
		switch n := p.oe.top(); n.DataAtom {
		case a.Pre, a.Listing:
			if n.FirstChild == nil {
				// Ignore a newline at the start of a <pre> block.
				if d != "" && d[0] == '\r' {
					d = d[1:]
				}
				if d != "" && d[0] == '\n' {
					d = d[1:]
				}
			}
		}
		// NUL characters are dropped from the text.
		d = strings.Replace(d, "\x00", "", -1)
		if d == "" {
			return true
		}
		p.reconstructActiveFormattingElements()
		p.addText(d)
		if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
			// There were non-whitespace characters inserted.
			p.framesetOK = false
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			if p.oe.contains(a.Template) {
				return true
			}
			// Merge the attributes into the root element of the stack.
			copyAttributes(p.oe[0], p.tok)
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
			return inHeadIM(p)
		case a.Body:
			if p.oe.contains(a.Template) {
				return true
			}
			// Merge the attributes into the existing body element, which is
			// expected to be the second element on the stack.
			if len(p.oe) >= 2 {
				body := p.oe[1]
				if body.Type == ElementNode && body.DataAtom == a.Body {
					p.framesetOK = false
					copyAttributes(body, p.tok)
				}
			}
		case a.Frameset:
			if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
				// Ignore the token.
				return true
			}
			// Replace the body element with the frameset.
			body := p.oe[1]
			if body.Parent != nil {
				body.Parent.RemoveChild(body)
			}
			p.oe = p.oe[:1]
			p.addElement()
			p.im = inFramesetIM
			return true
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Main, a.Menu, a.Nav, a.Ol, a.P, a.Search, a.Section, a.Summary, a.Ul:
			// Close any open <p> in button scope before inserting.
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
			p.popUntil(buttonScope, a.P)
			// A heading directly inside another heading closes the outer one.
			switch n := p.top(); n.DataAtom {
			case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
				p.oe.pop()
			}
			p.addElement()
		case a.Pre, a.Listing:
			p.popUntil(buttonScope, a.P)
			p.addElement()
			// The newline, if any, will be dealt with by the TextToken case.
			p.framesetOK = false
		case a.Form:
			if p.form != nil && !p.oe.contains(a.Template) {
				// Ignore the token
				return true
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
			// The form element pointer is only set outside of templates.
			if !p.oe.contains(a.Template) {
				p.form = p.top()
			}
		case a.Li:
			p.framesetOK = false
			// Walk down the stack: close an open <li> if one is found before
			// a special element other than address, div or p.
			for i := len(p.oe) - 1; i >= 0; i-- {
				node := p.oe[i]
				switch node.DataAtom {
				case a.Li:
					p.oe = p.oe[:i]
				case a.Address, a.Div, a.P:
					continue
				default:
					if !isSpecialElement(node) {
						continue
					}
				}
				break
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Dd, a.Dt:
			p.framesetOK = false
			// Same walk as for <li>, but closing an open <dd> or <dt>.
			for i := len(p.oe) - 1; i >= 0; i-- {
				node := p.oe[i]
				switch node.DataAtom {
				case a.Dd, a.Dt:
					p.oe = p.oe[:i]
				case a.Address, a.Div, a.P:
					continue
				default:
					if !isSpecialElement(node) {
						continue
					}
				}
				break
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Plaintext:
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Button:
			// Close any button that is already open in scope.
			p.popUntil(defaultScope, a.Button)
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
		case a.A:
			// If an <a> is already in the list of active formatting elements
			// (after the last marker), close it first and drop it from both
			// the stack and the list.
			for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
				if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
					p.inBodyEndTagFormatting(a.A, "a")
					p.oe.remove(n)
					p.afe.remove(n)
					break
				}
			}
			p.reconstructActiveFormattingElements()
			p.addFormattingElement()
		case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
			p.reconstructActiveFormattingElements()
			p.addFormattingElement()
		case a.Nobr:
			p.reconstructActiveFormattingElements()
			// An open <nobr> in scope is closed before the new one is added.
			if p.elementInScope(defaultScope, a.Nobr) {
				p.inBodyEndTagFormatting(a.Nobr, "nobr")
				p.reconstructActiveFormattingElements()
			}
			p.addFormattingElement()
		case a.Applet, a.Marquee, a.Object:
			p.reconstructActiveFormattingElements()
			p.addElement()
			// These elements start a new formatting scope.
			p.afe = append(p.afe, &scopeMarker)
			p.framesetOK = false
		case a.Table:
			// In quirks mode an open <p> is not closed by <table>.
			if !p.quirks {
				p.popUntil(buttonScope, a.P)
			}
			p.addElement()
			p.framesetOK = false
			p.im = inTableIM
			return true
		case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
			p.reconstructActiveFormattingElements()
			// These elements have no content: insert and pop immediately.
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			if p.tok.DataAtom == a.Input {
				for _, t := range p.tok.Attr {
					if t.Key == "type" {
						if strings.EqualFold(t.Val, "hidden") {
							// Skip setting framesetOK = false
							return true
						}
					}
				}
			}
			p.framesetOK = false
		case a.Param, a.Source, a.Track:
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
		case a.Hr:
			p.popUntil(buttonScope, a.P)
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			p.framesetOK = false
		case a.Image:
			// Rewrite <image> as <img> and reprocess the token.
			p.tok.DataAtom = a.Img
			p.tok.Data = a.Img.String()
			return false
		case a.Textarea:
			p.addElement()
			p.setOriginalIM()
			p.framesetOK = false
			p.im = textIM
		case a.Xmp:
			p.popUntil(buttonScope, a.P)
			p.reconstructActiveFormattingElements()
			p.framesetOK = false
			p.parseGenericRawTextElement()
		case a.Iframe:
			p.framesetOK = false
			p.parseGenericRawTextElement()
		case a.Noembed:
			p.parseGenericRawTextElement()
		case a.Noscript:
			if p.scripting {
				p.parseGenericRawTextElement()
				return true
			}
			p.reconstructActiveFormattingElements()
			p.addElement()
			// Don't let the tokenizer go into raw text mode when scripting is disabled.
			p.tokenizer.NextIsNotRawText()
		case a.Select:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
			p.im = inSelectIM
			return true
		case a.Optgroup, a.Option:
			// A new option or optgroup closes a currently open option.
			if p.top().DataAtom == a.Option {
				p.oe.pop()
			}
			p.reconstructActiveFormattingElements()
			p.addElement()
		case a.Rb, a.Rtc:
			if p.elementInScope(defaultScope, a.Ruby) {
				p.generateImpliedEndTags()
			}
			p.addElement()
		case a.Rp, a.Rt:
			// Unlike rb and rtc, an open rtc is not closed here.
			if p.elementInScope(defaultScope, a.Ruby) {
				p.generateImpliedEndTags("rtc")
			}
			p.addElement()
		case a.Math, a.Svg:
			p.reconstructActiveFormattingElements()
			if p.tok.DataAtom == a.Math {
				adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
			} else {
				adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
			}
			adjustForeignAttributes(p.tok.Attr)
			p.addElement()
			// The element's namespace is its tag name ("math" or "svg").
			p.top().Namespace = p.tok.Data
			if p.hasSelfClosingToken {
				p.oe.pop()
				p.acknowledgeSelfClosingTag()
			}
			return true
		case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
			// Ignore the token.
		default:
			p.reconstructActiveFormattingElements()
			p.addElement()
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Body:
			if p.elementInScope(defaultScope, a.Body) {
				p.im = afterBodyIM
			}
		case a.Html:
			// Treat </html> as </body> followed by </html>, provided a body
			// element is in scope.
			if p.elementInScope(defaultScope, a.Body) {
				p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
				return false
			}
			return true
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Main, a.Menu, a.Nav, a.Ol, a.Pre, a.Search, a.Section, a.Summary, a.Ul:
			p.popUntil(defaultScope, p.tok.DataAtom)
		case a.Form:
			if p.oe.contains(a.Template) {
				// Inside a template the form element pointer is not used.
				i := p.indexOfElementInScope(defaultScope, a.Form)
				if i == -1 {
					// Ignore the token.
					return true
				}
				p.generateImpliedEndTags()
				if p.oe[i].DataAtom != a.Form {
					// Ignore the token.
					return true
				}
				p.popUntil(defaultScope, a.Form)
			} else {
				// Outside a template, close the form that the form element
				// pointer refers to and clear the pointer.
				node := p.form
				p.form = nil
				i := p.indexOfElementInScope(defaultScope, a.Form)
				if node == nil || i == -1 || p.oe[i] != node {
					// Ignore the token.
					return true
				}
				p.generateImpliedEndTags()
				p.oe.remove(node)
			}
		case a.P:
			// A </p> with no <p> in button scope first opens an implied <p>,
			// which is then closed immediately.
			if !p.elementInScope(buttonScope, a.P) {
				p.parseImpliedToken(StartTagToken, a.P, a.P.String())
			}
			p.popUntil(buttonScope, a.P)
		case a.Li:
			p.popUntil(listItemScope, a.Li)
		case a.Dd, a.Dt:
			p.popUntil(defaultScope, p.tok.DataAtom)
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
			// Any heading end tag closes whichever heading is open.
			p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
		case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
			p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data)
		case a.Applet, a.Marquee, a.Object:
			if p.popUntil(defaultScope, p.tok.DataAtom) {
				p.clearActiveFormattingElements()
			}
		case a.Br:
			// Treat </br> as <br> and reprocess the token.
			p.tok.Type = StartTagToken
			return false
		case a.Template:
			return inHeadIM(p)
		default:
			p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data)
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
	case ErrorToken:
		// TODO: remove this divergence from the HTML5 spec.
		if len(p.templateStack) > 0 {
			p.im = inTemplateIM
			return false
		}
		// Scan the open elements; the loop returns at the first element
		// that is not in the list below. Either way the result is true.
		for _, e := range p.oe {
			switch e.DataAtom {
			case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th,
				a.Thead, a.Tr, a.Body, a.Html:
			default:
				return true
			}
		}
	}

	return true
}
1213
// inBodyEndTagFormatting handles an end tag for a formatting element (the
// tags routed here by inBodyIM, such as </a>, </b> or </i>).
//
// tagAtom and tagName identify the end tag being processed. The function
// rewrites the stack of open elements (p.oe), the list of active formatting
// elements (p.afe) and the node tree in place; it returns nothing.
func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) {
	// This is the "adoption agency" algorithm, described at
	// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency

	// TODO: this is a fairly literal line-by-line translation of that algorithm.
	// Once the code successfully parses the comprehensive test suite, we should
	// refactor this code to be more idiomatic.

	// Steps 1-2. Fast path: the current node has the requested tag name and is
	// not an active formatting element, so it is simply popped.
	if current := p.oe.top(); current.Data == tagName && p.afe.index(current) == -1 {
		p.oe.pop()
		return
	}

	// Steps 3-5. The outer loop, which runs at most eight times.
	for i := 0; i < 8; i++ {
		// Step 6. Find the formatting element: the most recent entry in the
		// list of active formatting elements with the given tag, searching no
		// further back than the last scope marker.
		var formattingElement *Node
		for j := len(p.afe) - 1; j >= 0; j-- {
			if p.afe[j].Type == scopeMarkerNode {
				break
			}
			if p.afe[j].DataAtom == tagAtom {
				formattingElement = p.afe[j]
				break
			}
		}
		if formattingElement == nil {
			// No such formatting element: treat the tag as any other end tag.
			p.inBodyEndTagOther(tagAtom, tagName)
			return
		}

		// Step 7. Ignore the tag if formatting element is not in the stack of open elements.
		feIndex := p.oe.index(formattingElement)
		if feIndex == -1 {
			p.afe.remove(formattingElement)
			return
		}
		// Step 8. Ignore the tag if formatting element is not in the scope.
		if !p.elementInScope(defaultScope, tagAtom) {
			// Ignore the tag.
			return
		}

		// Step 9. This step is omitted because it's just a parse error but no need to return.

		// Steps 10-11. Find the furthest block: the first special element at
		// or above the formatting element's position in the stack.
		var furthestBlock *Node
		for _, e := range p.oe[feIndex:] {
			if isSpecialElement(e) {
				furthestBlock = e
				break
			}
		}
		if furthestBlock == nil {
			// No furthest block: pop up to and including the formatting
			// element and drop it from the active formatting elements.
			e := p.oe.pop()
			for e != formattingElement {
				e = p.oe.pop()
			}
			p.afe.remove(e)
			return
		}

		// Steps 12-13. Find the common ancestor and bookmark node.
		commonAncestor := p.oe[feIndex-1]
		bookmark := p.afe.index(formattingElement)

		// Step 14. The inner loop. Find the lastNode to reparent.
		lastNode := furthestBlock
		node := furthestBlock
		x := p.oe.index(node)
		// Step 14.1.
		j := 0
		for {
			// Step 14.2.
			j++
			// Step 14.3. Move one entry down the stack of open elements.
			x--
			node = p.oe[x]
			// Step 14.4. Go to the next step if node is formatting element.
			if node == formattingElement {
				break
			}
			// Step 14.5. Remove node from the list of active formatting elements if
			// inner loop counter is greater than three and node is in the list of
			// active formatting elements.
			if ni := p.afe.index(node); j > 3 && ni > -1 {
				p.afe.remove(node)
				// If any element of the list of active formatting elements is removed,
				// we need to take care whether bookmark should be decremented or not.
				// This is because the value of bookmark may exceed the size of the
				// list by removing elements from the list.
				if ni <= bookmark {
					bookmark--
				}
				continue
			}
			// Step 14.6. Continue the next inner loop if node is not in the list of
			// active formatting elements.
			if p.afe.index(node) == -1 {
				p.oe.remove(node)
				continue
			}
			// Step 14.7. Replace node with a clone in both lists.
			clone := node.clone()
			p.afe[p.afe.index(node)] = clone
			p.oe[p.oe.index(node)] = clone
			node = clone
			// Step 14.8. On the first reparenting, move the bookmark to just
			// after the cloned node.
			if lastNode == furthestBlock {
				bookmark = p.afe.index(node) + 1
			}
			// Step 14.9. Move lastNode under node.
			if lastNode.Parent != nil {
				lastNode.Parent.RemoveChild(lastNode)
			}
			node.AppendChild(lastNode)
			// Step 14.10.
			lastNode = node
		}

		// Step 15. Reparent lastNode to the common ancestor,
		// or for misnested table nodes, to the foster parent.
		if lastNode.Parent != nil {
			lastNode.Parent.RemoveChild(lastNode)
		}
		switch commonAncestor.DataAtom {
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
			p.fosterParent(lastNode)
		default:
			commonAncestor.AppendChild(lastNode)
		}

		// Steps 16-18. Reparent nodes from the furthest block's children
		// to a clone of the formatting element.
		clone := formattingElement.clone()
		reparentChildren(clone, furthestBlock)
		furthestBlock.AppendChild(clone)

		// Step 19. Fix up the list of active formatting elements.
		if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
			// Move the bookmark with the rest of the list.
			bookmark--
		}
		p.afe.remove(formattingElement)
		p.afe.insert(bookmark, clone)

		// Step 20. Fix up the stack of open elements.
		p.oe.remove(formattingElement)
		p.oe.insert(p.oe.index(furthestBlock)+1, clone)
	}
}
1366
1367// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
1368// "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
1369// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
1370func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) {
1371 for i := len(p.oe) - 1; i >= 0; i-- {
1372 // Two element nodes have the same tag if they have the same Data (a
1373 // string-typed field). As an optimization, for common HTML tags, each
1374 // Data string is assigned a unique, non-zero DataAtom (a uint32-typed
1375 // field), since integer comparison is faster than string comparison.
1376 // Uncommon (custom) tags get a zero DataAtom.
1377 //
1378 // The if condition here is equivalent to (p.oe[i].Data == tagName).
1379 if (p.oe[i].DataAtom == tagAtom) &&
1380 ((tagAtom != 0) || (p.oe[i].Data == tagName)) {
1381 p.oe = p.oe[:i]
1382 break
1383 }
1384 if isSpecialElement(p.oe[i]) {
1385 break
1386 }
1387 }
1388}
1389
1390// Section 12.2.6.4.8.
1391func textIM(p *parser) bool {
1392 switch p.tok.Type {
1393 case ErrorToken:
1394 p.oe.pop()
1395 case TextToken:
1396 d := p.tok.Data
1397 if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
1398 // Ignore a newline at the start of a <textarea> block.
1399 if d != "" && d[0] == '\r' {
1400 d = d[1:]
1401 }
1402 if d != "" && d[0] == '\n' {
1403 d = d[1:]
1404 }
1405 }
1406 if d == "" {
1407 return true
1408 }
1409 p.addText(d)
1410 return true
1411 case EndTagToken:
1412 p.oe.pop()
1413 }
1414 p.im = p.originalIM
1415 p.originalIM = nil
1416 return p.tok.Type == EndTagToken
1417}
1418
1419// Section 12.2.6.4.9.
1420func inTableIM(p *parser) bool {
1421 switch p.tok.Type {
1422 case TextToken:
1423 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1)
1424 switch p.oe.top().DataAtom {
1425 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1426 if strings.Trim(p.tok.Data, whitespace) == "" {
1427 p.addText(p.tok.Data)
1428 return true
1429 }
1430 }
1431 case StartTagToken:
1432 switch p.tok.DataAtom {
1433 case a.Caption:
1434 p.clearStackToContext(tableScope)
1435 p.afe = append(p.afe, &scopeMarker)
1436 p.addElement()
1437 p.im = inCaptionIM
1438 return true
1439 case a.Colgroup:
1440 p.clearStackToContext(tableScope)
1441 p.addElement()
1442 p.im = inColumnGroupIM
1443 return true
1444 case a.Col:
1445 p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String())
1446 return false
1447 case a.Tbody, a.Tfoot, a.Thead:
1448 p.clearStackToContext(tableScope)
1449 p.addElement()
1450 p.im = inTableBodyIM
1451 return true
1452 case a.Td, a.Th, a.Tr:
1453 p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String())
1454 return false
1455 case a.Table:
1456 if p.popUntil(tableScope, a.Table) {
1457 p.resetInsertionMode()
1458 return false
1459 }
1460 // Ignore the token.
1461 return true
1462 case a.Style, a.Script, a.Template:
1463 return inHeadIM(p)
1464 case a.Input:
1465 for _, t := range p.tok.Attr {
1466 if t.Key == "type" && strings.EqualFold(t.Val, "hidden") {
1467 p.addElement()
1468 p.oe.pop()
1469 return true
1470 }
1471 }
1472 // Otherwise drop down to the default action.
1473 case a.Form:
1474 if p.oe.contains(a.Template) || p.form != nil {
1475 // Ignore the token.
1476 return true
1477 }
1478 p.addElement()
1479 p.form = p.oe.pop()
1480 case a.Select:
1481 p.reconstructActiveFormattingElements()
1482 switch p.top().DataAtom {
1483 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1484 p.fosterParenting = true
1485 }
1486 p.addElement()
1487 p.fosterParenting = false
1488 p.framesetOK = false
1489 p.im = inSelectInTableIM
1490 return true
1491 }
1492 case EndTagToken:
1493 switch p.tok.DataAtom {
1494 case a.Table:
1495 if p.popUntil(tableScope, a.Table) {
1496 p.resetInsertionMode()
1497 return true
1498 }
1499 // Ignore the token.
1500 return true
1501 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1502 // Ignore the token.
1503 return true
1504 case a.Template:
1505 return inHeadIM(p)
1506 }
1507 case CommentToken:
1508 p.addChild(&Node{
1509 Type: CommentNode,
1510 Data: p.tok.Data,
1511 })
1512 return true
1513 case DoctypeToken:
1514 // Ignore the token.
1515 return true
1516 case ErrorToken:
1517 return inBodyIM(p)
1518 }
1519
1520 p.fosterParenting = true
1521 defer func() { p.fosterParenting = false }()
1522
1523 return inBodyIM(p)
1524}
1525
1526// Section 12.2.6.4.11.
1527func inCaptionIM(p *parser) bool {
1528 switch p.tok.Type {
1529 case StartTagToken:
1530 switch p.tok.DataAtom {
1531 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr:
1532 if !p.popUntil(tableScope, a.Caption) {
1533 // Ignore the token.
1534 return true
1535 }
1536 p.clearActiveFormattingElements()
1537 p.im = inTableIM
1538 return false
1539 case a.Select:
1540 p.reconstructActiveFormattingElements()
1541 p.addElement()
1542 p.framesetOK = false
1543 p.im = inSelectInTableIM
1544 return true
1545 }
1546 case EndTagToken:
1547 switch p.tok.DataAtom {
1548 case a.Caption:
1549 if p.popUntil(tableScope, a.Caption) {
1550 p.clearActiveFormattingElements()
1551 p.im = inTableIM
1552 }
1553 return true
1554 case a.Table:
1555 if !p.popUntil(tableScope, a.Caption) {
1556 // Ignore the token.
1557 return true
1558 }
1559 p.clearActiveFormattingElements()
1560 p.im = inTableIM
1561 return false
1562 case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1563 // Ignore the token.
1564 return true
1565 }
1566 }
1567 return inBodyIM(p)
1568}
1569
1570// Section 12.2.6.4.12.
1571func inColumnGroupIM(p *parser) bool {
1572 switch p.tok.Type {
1573 case TextToken:
1574 s := strings.TrimLeft(p.tok.Data, whitespace)
1575 if len(s) < len(p.tok.Data) {
1576 // Add the initial whitespace to the current node.
1577 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
1578 if s == "" {
1579 return true
1580 }
1581 p.tok.Data = s
1582 }
1583 case CommentToken:
1584 p.addChild(&Node{
1585 Type: CommentNode,
1586 Data: p.tok.Data,
1587 })
1588 return true
1589 case DoctypeToken:
1590 // Ignore the token.
1591 return true
1592 case StartTagToken:
1593 switch p.tok.DataAtom {
1594 case a.Html:
1595 return inBodyIM(p)
1596 case a.Col:
1597 p.addElement()
1598 p.oe.pop()
1599 p.acknowledgeSelfClosingTag()
1600 return true
1601 case a.Template:
1602 return inHeadIM(p)
1603 }
1604 case EndTagToken:
1605 switch p.tok.DataAtom {
1606 case a.Colgroup:
1607 if p.oe.top().DataAtom == a.Colgroup {
1608 p.oe.pop()
1609 p.im = inTableIM
1610 }
1611 return true
1612 case a.Col:
1613 // Ignore the token.
1614 return true
1615 case a.Template:
1616 return inHeadIM(p)
1617 }
1618 case ErrorToken:
1619 return inBodyIM(p)
1620 }
1621 if p.oe.top().DataAtom != a.Colgroup {
1622 return true
1623 }
1624 p.oe.pop()
1625 p.im = inTableIM
1626 return false
1627}
1628
1629// Section 12.2.6.4.13.
1630func inTableBodyIM(p *parser) bool {
1631 switch p.tok.Type {
1632 case StartTagToken:
1633 switch p.tok.DataAtom {
1634 case a.Tr:
1635 p.clearStackToContext(tableBodyScope)
1636 p.addElement()
1637 p.im = inRowIM
1638 return true
1639 case a.Td, a.Th:
1640 p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String())
1641 return false
1642 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
1643 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
1644 p.im = inTableIM
1645 return false
1646 }
1647 // Ignore the token.
1648 return true
1649 }
1650 case EndTagToken:
1651 switch p.tok.DataAtom {
1652 case a.Tbody, a.Tfoot, a.Thead:
1653 if p.elementInScope(tableScope, p.tok.DataAtom) {
1654 p.clearStackToContext(tableBodyScope)
1655 p.oe.pop()
1656 p.im = inTableIM
1657 }
1658 return true
1659 case a.Table:
1660 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
1661 p.im = inTableIM
1662 return false
1663 }
1664 // Ignore the token.
1665 return true
1666 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr:
1667 // Ignore the token.
1668 return true
1669 }
1670 case CommentToken:
1671 p.addChild(&Node{
1672 Type: CommentNode,
1673 Data: p.tok.Data,
1674 })
1675 return true
1676 }
1677
1678 return inTableIM(p)
1679}
1680
1681// Section 12.2.6.4.14.
1682func inRowIM(p *parser) bool {
1683 switch p.tok.Type {
1684 case StartTagToken:
1685 switch p.tok.DataAtom {
1686 case a.Td, a.Th:
1687 p.clearStackToContext(tableRowScope)
1688 p.addElement()
1689 p.afe = append(p.afe, &scopeMarker)
1690 p.im = inCellIM
1691 return true
1692 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1693 if p.popUntil(tableScope, a.Tr) {
1694 p.im = inTableBodyIM
1695 return false
1696 }
1697 // Ignore the token.
1698 return true
1699 }
1700 case EndTagToken:
1701 switch p.tok.DataAtom {
1702 case a.Tr:
1703 if p.popUntil(tableScope, a.Tr) {
1704 p.im = inTableBodyIM
1705 return true
1706 }
1707 // Ignore the token.
1708 return true
1709 case a.Table:
1710 if p.popUntil(tableScope, a.Tr) {
1711 p.im = inTableBodyIM
1712 return false
1713 }
1714 // Ignore the token.
1715 return true
1716 case a.Tbody, a.Tfoot, a.Thead:
1717 if p.elementInScope(tableScope, p.tok.DataAtom) {
1718 p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String())
1719 return false
1720 }
1721 // Ignore the token.
1722 return true
1723 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th:
1724 // Ignore the token.
1725 return true
1726 }
1727 }
1728
1729 return inTableIM(p)
1730}
1731
1732// Section 12.2.6.4.15.
1733func inCellIM(p *parser) bool {
1734 switch p.tok.Type {
1735 case StartTagToken:
1736 switch p.tok.DataAtom {
1737 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1738 if p.popUntil(tableScope, a.Td, a.Th) {
1739 // Close the cell and reprocess.
1740 p.clearActiveFormattingElements()
1741 p.im = inRowIM
1742 return false
1743 }
1744 // Ignore the token.
1745 return true
1746 case a.Select:
1747 p.reconstructActiveFormattingElements()
1748 p.addElement()
1749 p.framesetOK = false
1750 p.im = inSelectInTableIM
1751 return true
1752 }
1753 case EndTagToken:
1754 switch p.tok.DataAtom {
1755 case a.Td, a.Th:
1756 if !p.popUntil(tableScope, p.tok.DataAtom) {
1757 // Ignore the token.
1758 return true
1759 }
1760 p.clearActiveFormattingElements()
1761 p.im = inRowIM
1762 return true
1763 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html:
1764 // Ignore the token.
1765 return true
1766 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1767 if !p.elementInScope(tableScope, p.tok.DataAtom) {
1768 // Ignore the token.
1769 return true
1770 }
1771 // Close the cell and reprocess.
1772 if p.popUntil(tableScope, a.Td, a.Th) {
1773 p.clearActiveFormattingElements()
1774 }
1775 p.im = inRowIM
1776 return false
1777 }
1778 }
1779 return inBodyIM(p)
1780}
1781
1782// Section 12.2.6.4.16.
1783func inSelectIM(p *parser) bool {
1784 switch p.tok.Type {
1785 case TextToken:
1786 p.addText(strings.Replace(p.tok.Data, "\x00", "", -1))
1787 case StartTagToken:
1788 switch p.tok.DataAtom {
1789 case a.Html:
1790 return inBodyIM(p)
1791 case a.Option:
1792 if p.top().DataAtom == a.Option {
1793 p.oe.pop()
1794 }
1795 p.addElement()
1796 case a.Optgroup:
1797 if p.top().DataAtom == a.Option {
1798 p.oe.pop()
1799 }
1800 if p.top().DataAtom == a.Optgroup {
1801 p.oe.pop()
1802 }
1803 p.addElement()
1804 case a.Select:
1805 if !p.popUntil(selectScope, a.Select) {
1806 // Ignore the token.
1807 return true
1808 }
1809 p.resetInsertionMode()
1810 case a.Input, a.Keygen, a.Textarea:
1811 if p.elementInScope(selectScope, a.Select) {
1812 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String())
1813 return false
1814 }
1815 // In order to properly ignore <textarea>, we need to change the tokenizer mode.
1816 p.tokenizer.NextIsNotRawText()
1817 // Ignore the token.
1818 return true
1819 case a.Script, a.Template:
1820 return inHeadIM(p)
1821 case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Style, a.Title, a.Xmp:
1822 // Don't let the tokenizer go into raw text mode when there are raw tags
1823 // to be ignored. These tags should be ignored from the tokenizer
1824 // properly.
1825 p.tokenizer.NextIsNotRawText()
1826 // Ignore the token.
1827 return true
1828 }
1829 case EndTagToken:
1830 switch p.tok.DataAtom {
1831 case a.Option:
1832 if p.top().DataAtom == a.Option {
1833 p.oe.pop()
1834 }
1835 case a.Optgroup:
1836 i := len(p.oe) - 1
1837 if p.oe[i].DataAtom == a.Option {
1838 i--
1839 }
1840 if p.oe[i].DataAtom == a.Optgroup {
1841 p.oe = p.oe[:i]
1842 }
1843 case a.Select:
1844 if !p.popUntil(selectScope, a.Select) {
1845 // Ignore the token.
1846 return true
1847 }
1848 p.resetInsertionMode()
1849 case a.Template:
1850 return inHeadIM(p)
1851 }
1852 case CommentToken:
1853 p.addChild(&Node{
1854 Type: CommentNode,
1855 Data: p.tok.Data,
1856 })
1857 case DoctypeToken:
1858 // Ignore the token.
1859 return true
1860 case ErrorToken:
1861 return inBodyIM(p)
1862 }
1863
1864 return true
1865}
1866
1867// Section 12.2.6.4.17.
1868func inSelectInTableIM(p *parser) bool {
1869 switch p.tok.Type {
1870 case StartTagToken, EndTagToken:
1871 switch p.tok.DataAtom {
1872 case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th:
1873 if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) {
1874 // Ignore the token.
1875 return true
1876 }
1877 // This is like p.popUntil(selectScope, a.Select), but it also
1878 // matches <math select>, not just <select>. Matching the MathML
1879 // tag is arguably incorrect (conceptually), but it mimics what
1880 // Chromium does.
1881 for i := len(p.oe) - 1; i >= 0; i-- {
1882 if n := p.oe[i]; n.DataAtom == a.Select {
1883 p.oe = p.oe[:i]
1884 break
1885 }
1886 }
1887 p.resetInsertionMode()
1888 return false
1889 }
1890 }
1891 return inSelectIM(p)
1892}
1893
1894// Section 12.2.6.4.18.
1895func inTemplateIM(p *parser) bool {
1896 switch p.tok.Type {
1897 case TextToken, CommentToken, DoctypeToken:
1898 return inBodyIM(p)
1899 case StartTagToken:
1900 switch p.tok.DataAtom {
1901 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
1902 return inHeadIM(p)
1903 case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
1904 p.templateStack.pop()
1905 p.templateStack = append(p.templateStack, inTableIM)
1906 p.im = inTableIM
1907 return false
1908 case a.Col:
1909 p.templateStack.pop()
1910 p.templateStack = append(p.templateStack, inColumnGroupIM)
1911 p.im = inColumnGroupIM
1912 return false
1913 case a.Tr:
1914 p.templateStack.pop()
1915 p.templateStack = append(p.templateStack, inTableBodyIM)
1916 p.im = inTableBodyIM
1917 return false
1918 case a.Td, a.Th:
1919 p.templateStack.pop()
1920 p.templateStack = append(p.templateStack, inRowIM)
1921 p.im = inRowIM
1922 return false
1923 default:
1924 p.templateStack.pop()
1925 p.templateStack = append(p.templateStack, inBodyIM)
1926 p.im = inBodyIM
1927 return false
1928 }
1929 case EndTagToken:
1930 switch p.tok.DataAtom {
1931 case a.Template:
1932 return inHeadIM(p)
1933 default:
1934 // Ignore the token.
1935 return true
1936 }
1937 case ErrorToken:
1938 if !p.oe.contains(a.Template) {
1939 // Ignore the token.
1940 return true
1941 }
1942 // TODO: remove this divergence from the HTML5 spec.
1943 //
1944 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
1945 p.generateImpliedEndTags()
1946 for i := len(p.oe) - 1; i >= 0; i-- {
1947 if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
1948 p.oe = p.oe[:i]
1949 break
1950 }
1951 }
1952 p.clearActiveFormattingElements()
1953 p.templateStack.pop()
1954 p.resetInsertionMode()
1955 return false
1956 }
1957 return false
1958}
1959
1960// Section 12.2.6.4.19.
1961func afterBodyIM(p *parser) bool {
1962 switch p.tok.Type {
1963 case ErrorToken:
1964 // Stop parsing.
1965 return true
1966 case TextToken:
1967 s := strings.TrimLeft(p.tok.Data, whitespace)
1968 if len(s) == 0 {
1969 // It was all whitespace.
1970 return inBodyIM(p)
1971 }
1972 case StartTagToken:
1973 if p.tok.DataAtom == a.Html {
1974 return inBodyIM(p)
1975 }
1976 case EndTagToken:
1977 if p.tok.DataAtom == a.Html {
1978 if !p.fragment {
1979 p.im = afterAfterBodyIM
1980 }
1981 return true
1982 }
1983 case CommentToken:
1984 // The comment is attached to the <html> element.
1985 if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html {
1986 panic("html: bad parser state: <html> element not found, in the after-body insertion mode")
1987 }
1988 p.oe[0].AppendChild(&Node{
1989 Type: CommentNode,
1990 Data: p.tok.Data,
1991 })
1992 return true
1993 }
1994 p.im = inBodyIM
1995 return false
1996}
1997
1998// Section 12.2.6.4.20.
1999func inFramesetIM(p *parser) bool {
2000 switch p.tok.Type {
2001 case CommentToken:
2002 p.addChild(&Node{
2003 Type: CommentNode,
2004 Data: p.tok.Data,
2005 })
2006 case TextToken:
2007 // Ignore all text but whitespace.
2008 s := strings.Map(func(c rune) rune {
2009 switch c {
2010 case ' ', '\t', '\n', '\f', '\r':
2011 return c
2012 }
2013 return -1
2014 }, p.tok.Data)
2015 if s != "" {
2016 p.addText(s)
2017 }
2018 case StartTagToken:
2019 switch p.tok.DataAtom {
2020 case a.Html:
2021 return inBodyIM(p)
2022 case a.Frameset:
2023 p.addElement()
2024 case a.Frame:
2025 p.addElement()
2026 p.oe.pop()
2027 p.acknowledgeSelfClosingTag()
2028 case a.Noframes:
2029 return inHeadIM(p)
2030 }
2031 case EndTagToken:
2032 switch p.tok.DataAtom {
2033 case a.Frameset:
2034 if p.oe.top().DataAtom != a.Html {
2035 p.oe.pop()
2036 if p.oe.top().DataAtom != a.Frameset {
2037 p.im = afterFramesetIM
2038 return true
2039 }
2040 }
2041 }
2042 default:
2043 // Ignore the token.
2044 }
2045 return true
2046}
2047
2048// Section 12.2.6.4.21.
2049func afterFramesetIM(p *parser) bool {
2050 switch p.tok.Type {
2051 case CommentToken:
2052 p.addChild(&Node{
2053 Type: CommentNode,
2054 Data: p.tok.Data,
2055 })
2056 case TextToken:
2057 // Ignore all text but whitespace.
2058 s := strings.Map(func(c rune) rune {
2059 switch c {
2060 case ' ', '\t', '\n', '\f', '\r':
2061 return c
2062 }
2063 return -1
2064 }, p.tok.Data)
2065 if s != "" {
2066 p.addText(s)
2067 }
2068 case StartTagToken:
2069 switch p.tok.DataAtom {
2070 case a.Html:
2071 return inBodyIM(p)
2072 case a.Noframes:
2073 return inHeadIM(p)
2074 }
2075 case EndTagToken:
2076 switch p.tok.DataAtom {
2077 case a.Html:
2078 p.im = afterAfterFramesetIM
2079 return true
2080 }
2081 default:
2082 // Ignore the token.
2083 }
2084 return true
2085}
2086
2087// Section 12.2.6.4.22.
2088func afterAfterBodyIM(p *parser) bool {
2089 switch p.tok.Type {
2090 case ErrorToken:
2091 // Stop parsing.
2092 return true
2093 case TextToken:
2094 s := strings.TrimLeft(p.tok.Data, whitespace)
2095 if len(s) == 0 {
2096 // It was all whitespace.
2097 return inBodyIM(p)
2098 }
2099 case StartTagToken:
2100 if p.tok.DataAtom == a.Html {
2101 return inBodyIM(p)
2102 }
2103 case CommentToken:
2104 p.doc.AppendChild(&Node{
2105 Type: CommentNode,
2106 Data: p.tok.Data,
2107 })
2108 return true
2109 case DoctypeToken:
2110 return inBodyIM(p)
2111 }
2112 p.im = inBodyIM
2113 return false
2114}
2115
2116// Section 12.2.6.4.23.
2117func afterAfterFramesetIM(p *parser) bool {
2118 switch p.tok.Type {
2119 case CommentToken:
2120 p.doc.AppendChild(&Node{
2121 Type: CommentNode,
2122 Data: p.tok.Data,
2123 })
2124 case TextToken:
2125 // Ignore all text but whitespace.
2126 s := strings.Map(func(c rune) rune {
2127 switch c {
2128 case ' ', '\t', '\n', '\f', '\r':
2129 return c
2130 }
2131 return -1
2132 }, p.tok.Data)
2133 if s != "" {
2134 p.tok.Data = s
2135 return inBodyIM(p)
2136 }
2137 case StartTagToken:
2138 switch p.tok.DataAtom {
2139 case a.Html:
2140 return inBodyIM(p)
2141 case a.Noframes:
2142 return inHeadIM(p)
2143 }
2144 case DoctypeToken:
2145 return inBodyIM(p)
2146 default:
2147 // Ignore the token.
2148 }
2149 return true
2150}
2151
// ignoreTheRemainingTokens has the same signature as the insertion mode
// functions. It does nothing with the token and always returns true, which
// parseCurrentToken treats as "token consumed".
func ignoreTheRemainingTokens(p *parser) bool {
	return true
}
2155
// whitespaceOrNUL is the whitespace cutset extended with the NUL character.
// parseForeignContent trims with it to decide whether a text token leaves the
// framesetOK flag set.
const whitespaceOrNUL = whitespace + "\x00"
2157
2158// Section 12.2.6.5
2159func parseForeignContent(p *parser) bool {
2160 switch p.tok.Type {
2161 case TextToken:
2162 if p.framesetOK {
2163 p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == ""
2164 }
2165 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1)
2166 p.addText(p.tok.Data)
2167 case CommentToken:
2168 p.addChild(&Node{
2169 Type: CommentNode,
2170 Data: p.tok.Data,
2171 })
2172 case StartTagToken:
2173 if !p.fragment {
2174 b := breakout[p.tok.Data]
2175 if p.tok.DataAtom == a.Font {
2176 loop:
2177 for _, attr := range p.tok.Attr {
2178 switch attr.Key {
2179 case "color", "face", "size":
2180 b = true
2181 break loop
2182 }
2183 }
2184 }
2185 if b {
2186 for i := len(p.oe) - 1; i >= 0; i-- {
2187 n := p.oe[i]
2188 if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) {
2189 p.oe = p.oe[:i+1]
2190 break
2191 }
2192 }
2193 return false
2194 }
2195 }
2196 current := p.adjustedCurrentNode()
2197 switch current.Namespace {
2198 case "math":
2199 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
2200 case "svg":
2201 // Adjust SVG tag names. The tokenizer lower-cases tag names, but
2202 // SVG wants e.g. "foreignObject" with a capital second "O".
2203 if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
2204 p.tok.DataAtom = a.Lookup([]byte(x))
2205 p.tok.Data = x
2206 }
2207 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
2208 default:
2209 panic("html: bad parser state: unexpected namespace")
2210 }
2211 adjustForeignAttributes(p.tok.Attr)
2212 namespace := current.Namespace
2213 p.addElement()
2214 p.top().Namespace = namespace
2215 if namespace != "" {
2216 // Don't let the tokenizer go into raw text mode in foreign content
2217 // (e.g. in an SVG <title> tag).
2218 p.tokenizer.NextIsNotRawText()
2219 }
2220 if p.hasSelfClosingToken {
2221 p.oe.pop()
2222 p.acknowledgeSelfClosingTag()
2223 }
2224 case EndTagToken:
2225 for i := len(p.oe) - 1; i >= 0; i-- {
2226 if p.oe[i].Namespace == "" {
2227 return p.im(p)
2228 }
2229 if strings.EqualFold(p.oe[i].Data, p.tok.Data) {
2230 p.oe = p.oe[:i]
2231 break
2232 }
2233 }
2234 return true
2235 default:
2236 // Ignore the token.
2237 }
2238 return true
2239}
2240
2241// Section 12.2.4.2.
2242func (p *parser) adjustedCurrentNode() *Node {
2243 if len(p.oe) == 1 && p.fragment && p.context != nil {
2244 return p.context
2245 }
2246 return p.oe.top()
2247}
2248
2249// Section 12.2.6.
2250func (p *parser) inForeignContent() bool {
2251 if len(p.oe) == 0 {
2252 return false
2253 }
2254 n := p.adjustedCurrentNode()
2255 if n.Namespace == "" {
2256 return false
2257 }
2258 if mathMLTextIntegrationPoint(n) {
2259 if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark {
2260 return false
2261 }
2262 if p.tok.Type == TextToken {
2263 return false
2264 }
2265 }
2266 if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg {
2267 return false
2268 }
2269 if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) {
2270 return false
2271 }
2272 if p.tok.Type == ErrorToken {
2273 return false
2274 }
2275 return true
2276}
2277
2278// parseImpliedToken parses a token as though it had appeared in the parser's
2279// input.
2280func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) {
2281 realToken, selfClosing := p.tok, p.hasSelfClosingToken
2282 p.tok = Token{
2283 Type: t,
2284 DataAtom: dataAtom,
2285 Data: data,
2286 }
2287 p.hasSelfClosingToken = false
2288 p.parseCurrentToken()
2289 p.tok, p.hasSelfClosingToken = realToken, selfClosing
2290}
2291
2292// parseCurrentToken runs the current token through the parsing routines
2293// until it is consumed.
2294func (p *parser) parseCurrentToken() {
2295 if p.tok.Type == SelfClosingTagToken {
2296 p.hasSelfClosingToken = true
2297 p.tok.Type = StartTagToken
2298 }
2299
2300 consumed := false
2301 for !consumed {
2302 if p.inForeignContent() {
2303 consumed = parseForeignContent(p)
2304 } else {
2305 consumed = p.im(p)
2306 }
2307 }
2308
2309 if p.hasSelfClosingToken {
2310 // This is a parse error, but ignore it.
2311 p.hasSelfClosingToken = false
2312 }
2313}
2314
2315func (p *parser) parse() error {
2316 // Iterate until EOF. Any other error will cause an early return.
2317 var err error
2318 for err != io.EOF {
2319 // CDATA sections are allowed only in foreign content.
2320 n := p.oe.top()
2321 p.tokenizer.AllowCDATA(n != nil && n.Namespace != "")
2322 // Read and parse the next token.
2323 p.tokenizer.Next()
2324 p.tok = p.tokenizer.Token()
2325 if p.tok.Type == ErrorToken {
2326 err = p.tokenizer.Err()
2327 if err != nil && err != io.EOF {
2328 return err
2329 }
2330 }
2331 p.parseCurrentToken()
2332 }
2333 return nil
2334}
2335
2336// Parse returns the parse tree for the HTML from the given Reader.
2337//
2338// It implements the HTML5 parsing algorithm
2339// (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction),
2340// which is very complicated. The resultant tree can contain implicitly created
2341// nodes that have no explicit <tag> listed in r's data, and nodes' parents can
2342// differ from the nesting implied by a naive processing of start and end
2343// <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped,
2344// with no corresponding node in the resulting tree.
2345//
2346// The input is assumed to be UTF-8 encoded.
2347func Parse(r io.Reader) (*Node, error) {
2348 return ParseWithOptions(r)
2349}
2350
2351// ParseFragment parses a fragment of HTML and returns the nodes that were
2352// found. If the fragment is the InnerHTML for an existing element, pass that
2353// element in context.
2354//
2355// It has the same intricacies as Parse.
2356func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
2357 return ParseFragmentWithOptions(r, context)
2358}
2359
// ParseOption configures a parser. ParseWithOptions and
// ParseFragmentWithOptions call each option on the newly created parser
// before parsing starts.
type ParseOption func(p *parser)
2362
2363// ParseOptionEnableScripting configures the scripting flag.
2364// https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting
2365//
2366// By default, scripting is enabled.
2367func ParseOptionEnableScripting(enable bool) ParseOption {
2368 return func(p *parser) {
2369 p.scripting = enable
2370 }
2371}
2372
2373// ParseWithOptions is like Parse, with options.
2374func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) {
2375 p := &parser{
2376 tokenizer: NewTokenizer(r),
2377 doc: &Node{
2378 Type: DocumentNode,
2379 },
2380 scripting: true,
2381 framesetOK: true,
2382 im: initialIM,
2383 }
2384
2385 for _, f := range opts {
2386 f(p)
2387 }
2388
2389 if err := p.parse(); err != nil {
2390 return nil, err
2391 }
2392 return p.doc, nil
2393}
2394
2395// ParseFragmentWithOptions is like ParseFragment, with options.
2396func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) {
2397 contextTag := ""
2398 if context != nil {
2399 if context.Type != ElementNode {
2400 return nil, errors.New("html: ParseFragment of non-element Node")
2401 }
2402 // The next check isn't just context.DataAtom.String() == context.Data because
2403 // it is valid to pass an element whose tag isn't a known atom. For example,
2404 // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent.
2405 if context.DataAtom != a.Lookup([]byte(context.Data)) {
2406 return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
2407 }
2408 contextTag = context.DataAtom.String()
2409 }
2410 p := &parser{
2411 doc: &Node{
2412 Type: DocumentNode,
2413 },
2414 scripting: true,
2415 fragment: true,
2416 context: context,
2417 }
2418 if context != nil && context.Namespace != "" {
2419 p.tokenizer = NewTokenizer(r)
2420 } else {
2421 p.tokenizer = NewTokenizerFragment(r, contextTag)
2422 }
2423
2424 for _, f := range opts {
2425 f(p)
2426 }
2427
2428 root := &Node{
2429 Type: ElementNode,
2430 DataAtom: a.Html,
2431 Data: a.Html.String(),
2432 }
2433 p.doc.AppendChild(root)
2434 p.oe = nodeStack{root}
2435 if context != nil && context.DataAtom == a.Template {
2436 p.templateStack = append(p.templateStack, inTemplateIM)
2437 }
2438 p.resetInsertionMode()
2439
2440 for n := context; n != nil; n = n.Parent {
2441 if n.Type == ElementNode && n.DataAtom == a.Form {
2442 p.form = n
2443 break
2444 }
2445 }
2446
2447 if err := p.parse(); err != nil {
2448 return nil, err
2449 }
2450
2451 parent := p.doc
2452 if context != nil {
2453 parent = root
2454 }
2455
2456 var result []*Node
2457 for c := parent.FirstChild; c != nil; {
2458 next := c.NextSibling
2459 parent.RemoveChild(c)
2460 result = append(result, c)
2461 c = next
2462 }
2463 return result, nil
2464}