1package md
2
3import (
4 "fmt"
5 "unicode"
6
7 "regexp"
8 "strconv"
9 "strings"
10 "unicode/utf8"
11
12 "github.com/JohannesKaufmann/html-to-markdown/escape"
13 "github.com/PuerkitoBio/goquery"
14)
15
// multipleSpacesR matches a run of one or more consecutive space characters.
// The "#text" rule uses it to collapse such runs into a single space.
var multipleSpacesR = regexp.MustCompile(` +`)
17
18var commonmark = []Rule{
19 {
20 Filter: []string{"ul", "ol"},
21 Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
22 parent := selec.Parent()
23
24 // we have a nested list, were the ul/ol is inside a list item
25 // -> based on work done by @requilence from @anytypeio
26 if (parent.Is("li") || parent.Is("ul") || parent.Is("ol")) && parent.Children().Last().IsSelection(selec) {
27 // add a line break prefix if the parent's text node doesn't have it.
28 // that makes sure that every list item is on its on line
29 lastContentTextNode := strings.TrimRight(parent.Nodes[0].FirstChild.Data, " \t")
30 if !strings.HasSuffix(lastContentTextNode, "\n") {
31 content = "\n" + content
32 }
33
34 // remove empty lines between lists
35 trimmedSpaceContent := strings.TrimRight(content, " \t")
36 if strings.HasSuffix(trimmedSpaceContent, "\n") {
37 content = strings.TrimRightFunc(content, unicode.IsSpace)
38 }
39 } else {
40 content = "\n\n" + content + "\n\n"
41 }
42 return &content
43 },
44 },
45 {
46 Filter: []string{"li"},
47 Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
48 if strings.TrimSpace(content) == "" {
49 return nil
50 }
51
52 // remove leading newlines
53 content = leadingNewlinesR.ReplaceAllString(content, "")
54 // replace trailing newlines with just a single one
55 content = trailingNewlinesR.ReplaceAllString(content, "\n")
56 // remove leading spaces
57 content = strings.TrimLeft(content, " ")
58
59 prefix := selec.AttrOr(attrListPrefix, "")
60
61 // `prefixCount` is not nessesarily the length of the empty string `prefix`
62 // but how much space is reserved for the prefixes of the siblings.
63 prefixCount, previousPrefixCounts := countListParents(opt, selec)
64
65 // if the prefix is not needed, balance it by adding the usual prefix spaces
66 if prefix == "" {
67 prefix = strings.Repeat(" ", prefixCount)
68 }
69 // indent the prefix so that the nested links are represented
70 indent := strings.Repeat(" ", previousPrefixCounts)
71 prefix = indent + prefix
72
73 content = IndentMultiLineListItem(opt, content, prefixCount+previousPrefixCounts)
74
75 return String(prefix + content + "\n")
76 },
77 },
78 {
79 Filter: []string{"#text"},
80 Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
81 text := selec.Text()
82 if trimmed := strings.TrimSpace(text); trimmed == "" {
83 return String("")
84 }
85 text = tabR.ReplaceAllString(text, " ")
86
87 // replace multiple spaces by one space: dont accidentally make
88 // normal text be indented and thus be a code block.
89 text = multipleSpacesR.ReplaceAllString(text, " ")
90
91 if opt.EscapeMode == "basic" {
92 text = escape.MarkdownCharacters(text)
93 }
94
95 // if its inside a list, trim the spaces to not mess up the indentation
96 parent := selec.Parent()
97 next := selec.Next()
98 if IndexWithText(selec) == 0 &&
99 (parent.Is("li") || parent.Is("ol") || parent.Is("ul")) &&
100 (next.Is("ul") || next.Is("ol")) {
101 // trim only spaces and not new lines
102 text = strings.Trim(text, ` `)
103 }
104
105 return &text
106 },
107 },
108 {
109 Filter: []string{"p", "div"},
110 Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
111 parent := goquery.NodeName(selec.Parent())
112 if IsInlineElement(parent) || parent == "li" {
113 content = "\n" + content + "\n"
114 return &content
115 }
116
117 // remove unnecessary spaces to have clean markdown
118 content = TrimpLeadingSpaces(content)
119
120 content = "\n\n" + content + "\n\n"
121 return &content
122 },
123 },
124 {
125 Filter: []string{"h1", "h2", "h3", "h4", "h5", "h6"},
126 Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
127 if strings.TrimSpace(content) == "" {
128 return nil
129 }
130
131 content = strings.Replace(content, "\n", " ", -1)
132 content = strings.Replace(content, "\r", " ", -1)
133 content = strings.Replace(content, `#`, `\#`, -1)
134 content = strings.TrimSpace(content)
135
136 insideLink := selec.ParentsFiltered("a").Length() > 0
137 if insideLink {
138 text := opt.StrongDelimiter + content + opt.StrongDelimiter
139 text = AddSpaceIfNessesary(selec, text)
140 return &text
141 }
142
143 node := goquery.NodeName(selec)
144 level, err := strconv.Atoi(node[1:])
145 if err != nil {
146 return nil
147 }
148
149 if opt.HeadingStyle == "setext" && level < 3 {
150 line := "-"
151 if level == 1 {
152 line = "="
153 }
154
155 underline := strings.Repeat(line, len(content))
156 return String("\n\n" + content + "\n" + underline + "\n\n")
157 }
158
159 prefix := strings.Repeat("#", level)
160 text := "\n\n" + prefix + " " + content + "\n\n"
161 return &text
162 },
163 },
164 {
165 Filter: []string{"strong", "b"},
166 Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
167 // only use one bold tag if they are nested
168 parent := selec.Parent()
169 if parent.Is("strong") || parent.Is("b") {
170 return &content
171 }
172
173 trimmed := strings.TrimSpace(content)
174 if trimmed == "" {
175 return &trimmed
176 }
177
178 // If there is a newline character between the start and end delimiter
179 // the delimiters won't be recognized. Either we remove all newline characters
180 // OR on _every_ line we put start & end delimiters.
181 trimmed = delimiterForEveryLine(trimmed, opt.StrongDelimiter)
182
183 // Always have a space to the side to recognize the delimiter
184 trimmed = AddSpaceIfNessesary(selec, trimmed)
185
186 return &trimmed
187 },
188 },
189 {
190 Filter: []string{"i", "em"},
191 Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
192 // only use one italic tag if they are nested
193 parent := selec.Parent()
194 if parent.Is("i") || parent.Is("em") {
195 return &content
196 }
197
198 trimmed := strings.TrimSpace(content)
199 if trimmed == "" {
200 return &trimmed
201 }
202
203 // If there is a newline character between the start and end delimiter
204 // the delimiters won't be recognized. Either we remove all newline characters
205 // OR on _every_ line we put start & end delimiters.
206 trimmed = delimiterForEveryLine(trimmed, opt.EmDelimiter)
207
208 // Always have a space to the side to recognize the delimiter
209 trimmed = AddSpaceIfNessesary(selec, trimmed)
210
211 return &trimmed
212 },
213 },
214 {
215 Filter: []string{"img"},
216 Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
217 src := selec.AttrOr("src", "")
218 src = strings.TrimSpace(src)
219 if src == "" {
220 return String("")
221 }
222
223 src = opt.GetAbsoluteURL(selec, src, opt.domain)
224
225 alt := selec.AttrOr("alt", "")
226 alt = strings.Replace(alt, "\n", " ", -1)
227
228 text := fmt.Sprintf("", alt, src)
229 return &text
230 },
231 },
232 {
233 Filter: []string{"a"},
234 AdvancedReplacement: func(content string, selec *goquery.Selection, opt *Options) (AdvancedResult, bool) {
235 // if there is no href, no link is used. So just return the content inside the link
236 href, ok := selec.Attr("href")
237 if !ok || strings.TrimSpace(href) == "" || strings.TrimSpace(href) == "#" {
238 return AdvancedResult{
239 Markdown: content,
240 }, false
241 }
242
243 href = opt.GetAbsoluteURL(selec, href, opt.domain)
244
245 // having multiline content inside a link is a bit tricky
246 content = EscapeMultiLine(content)
247
248 var title string
249 if t, ok := selec.Attr("title"); ok {
250 t = strings.Replace(t, "\n", " ", -1)
251 // escape all quotes
252 t = strings.Replace(t, `"`, `\"`, -1)
253 title = fmt.Sprintf(` "%s"`, t)
254 }
255
256 // if there is no link content (for example because it contains an svg)
257 // the 'title' or 'aria-label' attribute is used instead.
258 if strings.TrimSpace(content) == "" {
259 content = selec.AttrOr("title", selec.AttrOr("aria-label", ""))
260 }
261
262 // a link without text won't de displayed anyway
263 if content == "" {
264 return AdvancedResult{}, true
265 }
266
267 if opt.LinkStyle == "inlined" {
268 md := fmt.Sprintf("[%s](%s%s)", content, href, title)
269 md = AddSpaceIfNessesary(selec, md)
270
271 return AdvancedResult{
272 Markdown: md,
273 }, false
274 }
275
276 var replacement string
277 var reference string
278
279 switch opt.LinkReferenceStyle {
280 case "collapsed":
281
282 replacement = "[" + content + "][]"
283 reference = "[" + content + "]: " + href + title
284 case "shortcut":
285 replacement = "[" + content + "]"
286 reference = "[" + content + "]: " + href + title
287
288 default:
289 id := selec.AttrOr("data-index", "")
290 replacement = "[" + content + "][" + id + "]"
291 reference = "[" + id + "]: " + href + title
292 }
293
294 replacement = AddSpaceIfNessesary(selec, replacement)
295 return AdvancedResult{Markdown: replacement, Footer: reference}, false
296 },
297 },
298 {
299 Filter: []string{"code", "kbd", "samp", "tt"},
300 Replacement: func(_ string, selec *goquery.Selection, opt *Options) *string {
301 code := getCodeContent(selec)
302
303 // Newlines in the text aren't great, since this is inline code and not a code block.
304 // Newlines will be stripped anyway in the browser, but it won't be recognized as code
305 // from the markdown parser when there is more than one newline.
306 // So limit to
307 code = multipleNewLinesRegex.ReplaceAllString(code, "\n")
308
309 fenceChar := '`'
310 maxCount := calculateCodeFenceOccurrences(fenceChar, code)
311 maxCount++
312
313 fence := strings.Repeat(string(fenceChar), maxCount)
314
315 // code block contains a backtick as first character
316 if strings.HasPrefix(code, "`") {
317 code = " " + code
318 }
319 // code block contains a backtick as last character
320 if strings.HasSuffix(code, "`") {
321 code = code + " "
322 }
323
324 // TODO: configure delimeter in options?
325 text := fence + code + fence
326 text = AddSpaceIfNessesary(selec, text)
327 return &text
328 },
329 },
330 {
331 Filter: []string{"pre"},
332 Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
333 codeElement := selec.Find("code")
334 language := codeElement.AttrOr("class", "")
335 language = strings.Replace(language, "language-", "", 1)
336
337 code := getCodeContent(selec)
338
339 fenceChar, _ := utf8.DecodeRuneInString(opt.Fence)
340 fence := CalculateCodeFence(fenceChar, code)
341
342 text := "\n\n" + fence + language + "\n" +
343 code +
344 "\n" + fence + "\n\n"
345 return &text
346 },
347 },
348 {
349 Filter: []string{"hr"},
350 Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
351 // e.g. `## --- Heading` would look weird, so don't render a divider if inside a heading
352 insideHeading := selec.ParentsFiltered("h1,h2,h3,h4,h5,h6").Length() > 0
353 if insideHeading {
354 return String("")
355 }
356
357 text := "\n\n" + opt.HorizontalRule + "\n\n"
358 return &text
359 },
360 },
361 {
362 Filter: []string{"br"},
363 Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
364 return String("\n\n")
365 },
366 },
367 {
368 Filter: []string{"blockquote"},
369 Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
370 content = strings.TrimSpace(content)
371 if content == "" {
372 return nil
373 }
374
375 content = multipleNewLinesRegex.ReplaceAllString(content, "\n\n")
376
377 var beginningR = regexp.MustCompile(`(?m)^`)
378 content = beginningR.ReplaceAllString(content, "> ")
379
380 text := "\n\n" + content + "\n\n"
381 return &text
382 },
383 },
384 {
385 Filter: []string{"noscript"},
386 Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
387 // for now remove the contents of noscript. But in the future we could
388 // tell goquery to parse the contents of the tag.
389 // -> https://github.com/PuerkitoBio/goquery/issues/139#issuecomment-517526070
390 return nil
391 },
392 },
393}