1package md
  2
  3import (
  4	"fmt"
  5	"unicode"
  6
  7	"regexp"
  8	"strconv"
  9	"strings"
 10	"unicode/utf8"
 11
 12	"github.com/JohannesKaufmann/html-to-markdown/escape"
 13	"github.com/PuerkitoBio/goquery"
 14)
 15
// multipleSpacesR matches a run of two or more consecutive space characters.
var multipleSpacesR = regexp.MustCompile(`  +`)
 17
 18var commonmark = []Rule{
 19	{
 20		Filter: []string{"ul", "ol"},
 21		Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
 22			parent := selec.Parent()
 23
 24			// we have a nested list, were the ul/ol is inside a list item
 25			// -> based on work done by @requilence from @anytypeio
 26			if (parent.Is("li") || parent.Is("ul") || parent.Is("ol")) && parent.Children().Last().IsSelection(selec) {
 27				// add a line break prefix if the parent's text node doesn't have it.
 28				// that makes sure that every list item is on its on line
 29				lastContentTextNode := strings.TrimRight(parent.Nodes[0].FirstChild.Data, " \t")
 30				if !strings.HasSuffix(lastContentTextNode, "\n") {
 31					content = "\n" + content
 32				}
 33
 34				// remove empty lines between lists
 35				trimmedSpaceContent := strings.TrimRight(content, " \t")
 36				if strings.HasSuffix(trimmedSpaceContent, "\n") {
 37					content = strings.TrimRightFunc(content, unicode.IsSpace)
 38				}
 39			} else {
 40				content = "\n\n" + content + "\n\n"
 41			}
 42			return &content
 43		},
 44	},
 45	{
 46		Filter: []string{"li"},
 47		Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
 48			if strings.TrimSpace(content) == "" {
 49				return nil
 50			}
 51
 52			// remove leading newlines
 53			content = leadingNewlinesR.ReplaceAllString(content, "")
 54			// replace trailing newlines with just a single one
 55			content = trailingNewlinesR.ReplaceAllString(content, "\n")
 56			// remove leading spaces
 57			content = strings.TrimLeft(content, " ")
 58
 59			prefix := selec.AttrOr(attrListPrefix, "")
 60
 61			// `prefixCount` is not nessesarily the length of the empty string `prefix`
 62			// but how much space is reserved for the prefixes of the siblings.
 63			prefixCount, previousPrefixCounts := countListParents(opt, selec)
 64
 65			// if the prefix is not needed, balance it by adding the usual prefix spaces
 66			if prefix == "" {
 67				prefix = strings.Repeat(" ", prefixCount)
 68			}
 69			// indent the prefix so that the nested links are represented
 70			indent := strings.Repeat(" ", previousPrefixCounts)
 71			prefix = indent + prefix
 72
 73			content = IndentMultiLineListItem(opt, content, prefixCount+previousPrefixCounts)
 74
 75			return String(prefix + content + "\n")
 76		},
 77	},
 78	{
 79		Filter: []string{"#text"},
 80		Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
 81			text := selec.Text()
 82			if trimmed := strings.TrimSpace(text); trimmed == "" {
 83				return String("")
 84			}
 85			text = tabR.ReplaceAllString(text, " ")
 86
 87			// replace multiple spaces by one space: dont accidentally make
 88			// normal text be indented and thus be a code block.
 89			text = multipleSpacesR.ReplaceAllString(text, " ")
 90
 91			if opt.EscapeMode == "basic" {
 92				text = escape.MarkdownCharacters(text)
 93			}
 94
 95			// if its inside a list, trim the spaces to not mess up the indentation
 96			parent := selec.Parent()
 97			next := selec.Next()
 98			if IndexWithText(selec) == 0 &&
 99				(parent.Is("li") || parent.Is("ol") || parent.Is("ul")) &&
100				(next.Is("ul") || next.Is("ol")) {
101				// trim only spaces and not new lines
102				text = strings.Trim(text, ` `)
103			}
104
105			return &text
106		},
107	},
108	{
109		Filter: []string{"p", "div"},
110		Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
111			parent := goquery.NodeName(selec.Parent())
112			if IsInlineElement(parent) || parent == "li" {
113				content = "\n" + content + "\n"
114				return &content
115			}
116
117			// remove unnecessary spaces to have clean markdown
118			content = TrimpLeadingSpaces(content)
119
120			content = "\n\n" + content + "\n\n"
121			return &content
122		},
123	},
124	{
125		Filter: []string{"h1", "h2", "h3", "h4", "h5", "h6"},
126		Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
127			if strings.TrimSpace(content) == "" {
128				return nil
129			}
130
131			content = strings.Replace(content, "\n", " ", -1)
132			content = strings.Replace(content, "\r", " ", -1)
133			content = strings.Replace(content, `#`, `\#`, -1)
134			content = strings.TrimSpace(content)
135
136			insideLink := selec.ParentsFiltered("a").Length() > 0
137			if insideLink {
138				text := opt.StrongDelimiter + content + opt.StrongDelimiter
139				text = AddSpaceIfNessesary(selec, text)
140				return &text
141			}
142
143			node := goquery.NodeName(selec)
144			level, err := strconv.Atoi(node[1:])
145			if err != nil {
146				return nil
147			}
148
149			if opt.HeadingStyle == "setext" && level < 3 {
150				line := "-"
151				if level == 1 {
152					line = "="
153				}
154
155				underline := strings.Repeat(line, len(content))
156				return String("\n\n" + content + "\n" + underline + "\n\n")
157			}
158
159			prefix := strings.Repeat("#", level)
160			text := "\n\n" + prefix + " " + content + "\n\n"
161			return &text
162		},
163	},
164	{
165		Filter: []string{"strong", "b"},
166		Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
167			// only use one bold tag if they are nested
168			parent := selec.Parent()
169			if parent.Is("strong") || parent.Is("b") {
170				return &content
171			}
172
173			trimmed := strings.TrimSpace(content)
174			if trimmed == "" {
175				return &trimmed
176			}
177
178			// If there is a newline character between the start and end delimiter
179			// the delimiters won't be recognized. Either we remove all newline characters
180			// OR on _every_ line we put start & end delimiters.
181			trimmed = delimiterForEveryLine(trimmed, opt.StrongDelimiter)
182
183			// Always have a space to the side to recognize the delimiter
184			trimmed = AddSpaceIfNessesary(selec, trimmed)
185
186			return &trimmed
187		},
188	},
189	{
190		Filter: []string{"i", "em"},
191		Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
192			// only use one italic tag if they are nested
193			parent := selec.Parent()
194			if parent.Is("i") || parent.Is("em") {
195				return &content
196			}
197
198			trimmed := strings.TrimSpace(content)
199			if trimmed == "" {
200				return &trimmed
201			}
202
203			// If there is a newline character between the start and end delimiter
204			// the delimiters won't be recognized. Either we remove all newline characters
205			// OR on _every_ line we put start & end delimiters.
206			trimmed = delimiterForEveryLine(trimmed, opt.EmDelimiter)
207
208			// Always have a space to the side to recognize the delimiter
209			trimmed = AddSpaceIfNessesary(selec, trimmed)
210
211			return &trimmed
212		},
213	},
214	{
215		Filter: []string{"img"},
216		Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
217			src := selec.AttrOr("src", "")
218			src = strings.TrimSpace(src)
219			if src == "" {
220				return String("")
221			}
222
223			src = opt.GetAbsoluteURL(selec, src, opt.domain)
224
225			alt := selec.AttrOr("alt", "")
226			alt = strings.Replace(alt, "\n", " ", -1)
227
228			text := fmt.Sprintf("", alt, src)
229			return &text
230		},
231	},
232	{
233		Filter: []string{"a"},
234		AdvancedReplacement: func(content string, selec *goquery.Selection, opt *Options) (AdvancedResult, bool) {
235			// if there is no href, no link is used. So just return the content inside the link
236			href, ok := selec.Attr("href")
237			if !ok || strings.TrimSpace(href) == "" || strings.TrimSpace(href) == "#" {
238				return AdvancedResult{
239					Markdown: content,
240				}, false
241			}
242
243			href = opt.GetAbsoluteURL(selec, href, opt.domain)
244
245			// having multiline content inside a link is a bit tricky
246			content = EscapeMultiLine(content)
247
248			var title string
249			if t, ok := selec.Attr("title"); ok {
250				t = strings.Replace(t, "\n", " ", -1)
251				// escape all quotes
252				t = strings.Replace(t, `"`, `\"`, -1)
253				title = fmt.Sprintf(` "%s"`, t)
254			}
255
256			// if there is no link content (for example because it contains an svg)
257			// the 'title' or 'aria-label' attribute is used instead.
258			if strings.TrimSpace(content) == "" {
259				content = selec.AttrOr("title", selec.AttrOr("aria-label", ""))
260			}
261
262			// a link without text won't de displayed anyway
263			if content == "" {
264				return AdvancedResult{}, true
265			}
266
267			if opt.LinkStyle == "inlined" {
268				md := fmt.Sprintf("[%s](%s%s)", content, href, title)
269				md = AddSpaceIfNessesary(selec, md)
270
271				return AdvancedResult{
272					Markdown: md,
273				}, false
274			}
275
276			var replacement string
277			var reference string
278
279			switch opt.LinkReferenceStyle {
280			case "collapsed":
281
282				replacement = "[" + content + "][]"
283				reference = "[" + content + "]: " + href + title
284			case "shortcut":
285				replacement = "[" + content + "]"
286				reference = "[" + content + "]: " + href + title
287
288			default:
289				id := selec.AttrOr("data-index", "")
290				replacement = "[" + content + "][" + id + "]"
291				reference = "[" + id + "]: " + href + title
292			}
293
294			replacement = AddSpaceIfNessesary(selec, replacement)
295			return AdvancedResult{Markdown: replacement, Footer: reference}, false
296		},
297	},
298	{
299		Filter: []string{"code", "kbd", "samp", "tt"},
300		Replacement: func(_ string, selec *goquery.Selection, opt *Options) *string {
301			code := getCodeContent(selec)
302
303			// Newlines in the text aren't great, since this is inline code and not a code block.
304			// Newlines will be stripped anyway in the browser, but it won't be recognized as code
305			// from the markdown parser when there is more than one newline.
306			// So limit to
307			code = multipleNewLinesRegex.ReplaceAllString(code, "\n")
308
309			fenceChar := '`'
310			maxCount := calculateCodeFenceOccurrences(fenceChar, code)
311			maxCount++
312
313			fence := strings.Repeat(string(fenceChar), maxCount)
314
315			// code block contains a backtick as first character
316			if strings.HasPrefix(code, "`") {
317				code = " " + code
318			}
319			// code block contains a backtick as last character
320			if strings.HasSuffix(code, "`") {
321				code = code + " "
322			}
323
324			// TODO: configure delimeter in options?
325			text := fence + code + fence
326			text = AddSpaceIfNessesary(selec, text)
327			return &text
328		},
329	},
330	{
331		Filter: []string{"pre"},
332		Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
333			codeElement := selec.Find("code")
334			language := codeElement.AttrOr("class", "")
335			language = strings.Replace(language, "language-", "", 1)
336
337			code := getCodeContent(selec)
338
339			fenceChar, _ := utf8.DecodeRuneInString(opt.Fence)
340			fence := CalculateCodeFence(fenceChar, code)
341
342			text := "\n\n" + fence + language + "\n" +
343				code +
344				"\n" + fence + "\n\n"
345			return &text
346		},
347	},
348	{
349		Filter: []string{"hr"},
350		Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
351			// e.g. `## --- Heading` would look weird, so don't render a divider if inside a heading
352			insideHeading := selec.ParentsFiltered("h1,h2,h3,h4,h5,h6").Length() > 0
353			if insideHeading {
354				return String("")
355			}
356
357			text := "\n\n" + opt.HorizontalRule + "\n\n"
358			return &text
359		},
360	},
361	{
362		Filter: []string{"br"},
363		Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
364			return String("\n\n")
365		},
366	},
367	{
368		Filter: []string{"blockquote"},
369		Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
370			content = strings.TrimSpace(content)
371			if content == "" {
372				return nil
373			}
374
375			content = multipleNewLinesRegex.ReplaceAllString(content, "\n\n")
376
377			var beginningR = regexp.MustCompile(`(?m)^`)
378			content = beginningR.ReplaceAllString(content, "> ")
379
380			text := "\n\n" + content + "\n\n"
381			return &text
382		},
383	},
384	{
385		Filter: []string{"noscript"},
386		Replacement: func(content string, selec *goquery.Selection, opt *Options) *string {
387			// for now remove the contents of noscript. But in the future we could
388			// tell goquery to parse the contents of the tag.
389			// -> https://github.com/PuerkitoBio/goquery/issues/139#issuecomment-517526070
390			return nil
391		},
392	},
393}