html_block.go

  1package parser
  2
  3import (
  4	"bytes"
  5	"regexp"
  6	"strings"
  7
  8	"github.com/yuin/goldmark/ast"
  9	"github.com/yuin/goldmark/text"
 10	"github.com/yuin/goldmark/util"
 11)
 12
 13var allowedBlockTags = map[string]bool{
 14	"address":    true,
 15	"article":    true,
 16	"aside":      true,
 17	"base":       true,
 18	"basefont":   true,
 19	"blockquote": true,
 20	"body":       true,
 21	"caption":    true,
 22	"center":     true,
 23	"col":        true,
 24	"colgroup":   true,
 25	"dd":         true,
 26	"details":    true,
 27	"dialog":     true,
 28	"dir":        true,
 29	"div":        true,
 30	"dl":         true,
 31	"dt":         true,
 32	"fieldset":   true,
 33	"figcaption": true,
 34	"figure":     true,
 35	"footer":     true,
 36	"form":       true,
 37	"frame":      true,
 38	"frameset":   true,
 39	"h1":         true,
 40	"h2":         true,
 41	"h3":         true,
 42	"h4":         true,
 43	"h5":         true,
 44	"h6":         true,
 45	"head":       true,
 46	"header":     true,
 47	"hr":         true,
 48	"html":       true,
 49	"iframe":     true,
 50	"legend":     true,
 51	"li":         true,
 52	"link":       true,
 53	"main":       true,
 54	"menu":       true,
 55	"menuitem":   true,
 56	"meta":       true,
 57	"nav":        true,
 58	"noframes":   true,
 59	"ol":         true,
 60	"optgroup":   true,
 61	"option":     true,
 62	"p":          true,
 63	"param":      true,
 64	"search":     true,
 65	"section":    true,
 66	"summary":    true,
 67	"table":      true,
 68	"tbody":      true,
 69	"td":         true,
 70	"tfoot":      true,
 71	"th":         true,
 72	"thead":      true,
 73	"title":      true,
 74	"tr":         true,
 75	"track":      true,
 76	"ul":         true,
 77}
 78
 79var htmlBlockType1OpenRegexp = regexp.MustCompile(`(?i)^[ ]{0,3}<(script|pre|style|textarea)(?:\s.*|>.*|/>.*|)(?:\r\n|\n)?$`) //nolint:golint,lll
 80var htmlBlockType1CloseRegexp = regexp.MustCompile(`(?i)^.*</(?:script|pre|style|textarea)>.*`)
 81
 82var htmlBlockType2OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<!\-\-`)
 83var htmlBlockType2Close = []byte{'-', '-', '>'}
 84
 85var htmlBlockType3OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<\?`)
 86var htmlBlockType3Close = []byte{'?', '>'}
 87
 88var htmlBlockType4OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<![A-Z]+.*(?:\r\n|\n)?$`)
 89var htmlBlockType4Close = []byte{'>'}
 90
 91var htmlBlockType5OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<\!\[CDATA\[`)
 92var htmlBlockType5Close = []byte{']', ']', '>'}
 93
 94var htmlBlockType6Regexp = regexp.MustCompile(`^[ ]{0,3}<(?:/[ ]*)?([a-zA-Z]+[a-zA-Z0-9\-]*)(?:[ ].*|>.*|/>.*|)(?:\r\n|\n)?$`) //nolint:golint,lll
 95
 96var htmlBlockType7Regexp = regexp.MustCompile(`^[ ]{0,3}<(/[ ]*)?([a-zA-Z]+[a-zA-Z0-9\-]*)(` + attributePattern + `*)[ ]*(?:>|/>)[ ]*(?:\r\n|\n)?$`) //nolint:golint,lll
 97
 98type htmlBlockParser struct {
 99}
100
101var defaultHTMLBlockParser = &htmlBlockParser{}
102
103// NewHTMLBlockParser return a new BlockParser that can parse html
104// blocks.
105func NewHTMLBlockParser() BlockParser {
106	return defaultHTMLBlockParser
107}
108
109func (b *htmlBlockParser) Trigger() []byte {
110	return []byte{'<'}
111}
112
113func (b *htmlBlockParser) Open(parent ast.Node, reader text.Reader, pc Context) (ast.Node, State) {
114	var node *ast.HTMLBlock
115	line, segment := reader.PeekLine()
116	last := pc.LastOpenedBlock().Node
117	if pos := pc.BlockOffset(); pos < 0 || line[pos] != '<' {
118		return nil, NoChildren
119	}
120
121	if m := htmlBlockType1OpenRegexp.FindSubmatchIndex(line); m != nil {
122		node = ast.NewHTMLBlock(ast.HTMLBlockType1)
123	} else if htmlBlockType2OpenRegexp.Match(line) {
124		node = ast.NewHTMLBlock(ast.HTMLBlockType2)
125	} else if htmlBlockType3OpenRegexp.Match(line) {
126		node = ast.NewHTMLBlock(ast.HTMLBlockType3)
127	} else if htmlBlockType4OpenRegexp.Match(line) {
128		node = ast.NewHTMLBlock(ast.HTMLBlockType4)
129	} else if htmlBlockType5OpenRegexp.Match(line) {
130		node = ast.NewHTMLBlock(ast.HTMLBlockType5)
131	} else if match := htmlBlockType7Regexp.FindSubmatchIndex(line); match != nil {
132		isCloseTag := match[2] > -1 && bytes.Equal(line[match[2]:match[3]], []byte("/"))
133		hasAttr := match[6] != match[7]
134		tagName := strings.ToLower(string(line[match[4]:match[5]]))
135		_, ok := allowedBlockTags[tagName]
136		if ok {
137			node = ast.NewHTMLBlock(ast.HTMLBlockType6)
138		} else if tagName != "script" && tagName != "style" &&
139			tagName != "pre" && !ast.IsParagraph(last) && !(isCloseTag && hasAttr) { // type 7 can not interrupt paragraph
140			node = ast.NewHTMLBlock(ast.HTMLBlockType7)
141		}
142	}
143	if node == nil {
144		if match := htmlBlockType6Regexp.FindSubmatchIndex(line); match != nil {
145			tagName := string(line[match[2]:match[3]])
146			_, ok := allowedBlockTags[strings.ToLower(tagName)]
147			if ok {
148				node = ast.NewHTMLBlock(ast.HTMLBlockType6)
149			}
150		}
151	}
152	if node != nil {
153		reader.Advance(segment.Len() - util.TrimRightSpaceLength(line))
154		node.Lines().Append(segment)
155		return node, NoChildren
156	}
157	return nil, NoChildren
158}
159
160func (b *htmlBlockParser) Continue(node ast.Node, reader text.Reader, pc Context) State {
161	htmlBlock := node.(*ast.HTMLBlock)
162	lines := htmlBlock.Lines()
163	line, segment := reader.PeekLine()
164	var closurePattern []byte
165
166	switch htmlBlock.HTMLBlockType {
167	case ast.HTMLBlockType1:
168		if lines.Len() == 1 {
169			firstLine := lines.At(0)
170			if htmlBlockType1CloseRegexp.Match(firstLine.Value(reader.Source())) {
171				return Close
172			}
173		}
174		if htmlBlockType1CloseRegexp.Match(line) {
175			htmlBlock.ClosureLine = segment
176			reader.Advance(segment.Len() - util.TrimRightSpaceLength(line))
177			return Close
178		}
179	case ast.HTMLBlockType2:
180		closurePattern = htmlBlockType2Close
181		fallthrough
182	case ast.HTMLBlockType3:
183		if closurePattern == nil {
184			closurePattern = htmlBlockType3Close
185		}
186		fallthrough
187	case ast.HTMLBlockType4:
188		if closurePattern == nil {
189			closurePattern = htmlBlockType4Close
190		}
191		fallthrough
192	case ast.HTMLBlockType5:
193		if closurePattern == nil {
194			closurePattern = htmlBlockType5Close
195		}
196
197		if lines.Len() == 1 {
198			firstLine := lines.At(0)
199			if bytes.Contains(firstLine.Value(reader.Source()), closurePattern) {
200				return Close
201			}
202		}
203		if bytes.Contains(line, closurePattern) {
204			htmlBlock.ClosureLine = segment
205			reader.Advance(segment.Len())
206			return Close
207		}
208
209	case ast.HTMLBlockType6, ast.HTMLBlockType7:
210		if util.IsBlank(line) {
211			return Close
212		}
213	}
214	node.Lines().Append(segment)
215	reader.Advance(segment.Len() - util.TrimRightSpaceLength(line))
216	return Continue | NoChildren
217}
218
219func (b *htmlBlockParser) Close(node ast.Node, reader text.Reader, pc Context) {
220	// nothing to do
221}
222
223func (b *htmlBlockParser) CanInterruptParagraph() bool {
224	return true
225}
226
227func (b *htmlBlockParser) CanAcceptIndentedLine() bool {
228	return false
229}