1package md
2
3import (
4 "bytes"
5 "fmt"
6 "regexp"
7 "strconv"
8 "strings"
9 "unicode"
10 "unicode/utf8"
11
12 "github.com/PuerkitoBio/goquery"
13 "golang.org/x/net/html"
14)
15
16/*
17WARNING: The functions from this file can be used externally
but there is no guarantee that they will stay exported.
19*/
20
21// CollectText returns the text of the node and all its children
22func CollectText(n *html.Node) string {
23 text := &bytes.Buffer{}
24 collectText(n, text)
25 return text.String()
26}
27func collectText(n *html.Node, buf *bytes.Buffer) {
28 if n.Type == html.TextNode {
29 buf.WriteString(n.Data)
30 }
31 for c := n.FirstChild; c != nil; c = c.NextSibling {
32 collectText(c, buf)
33 }
34}
35
36func getName(node *html.Node) string {
37 selec := &goquery.Selection{Nodes: []*html.Node{node}}
38 return goquery.NodeName(selec)
39}
40
// isTrimmedElement reports whether name belongs to the set of inline
// elements whose content is trimmed automatically. Callers use this to
// avoid adding another space when the neighboring element is going to
// add one already.
func isTrimmedElement(name string) bool {
	switch name {
	case "a",
		"strong", "b",
		"i", "em",
		"del", "s", "strike",
		"code":
		return true
	default:
		return false
	}
}
60
61func getPrevNodeText(node *html.Node) (string, bool) {
62 if node == nil {
63 return "", false
64 }
65
66 for ; node != nil; node = node.PrevSibling {
67 text := CollectText(node)
68
69 name := getName(node)
70 if name == "br" {
71 return "\n", true
72 }
73
74 // if the content is empty, try our luck with the next node
75 if strings.TrimSpace(text) == "" {
76 continue
77 }
78
79 if isTrimmedElement(name) {
80 text = strings.TrimSpace(text)
81 }
82
83 return text, true
84 }
85 return "", false
86}
87func getNextNodeText(node *html.Node) (string, bool) {
88 if node == nil {
89 return "", false
90 }
91
92 for ; node != nil; node = node.NextSibling {
93 text := CollectText(node)
94
95 name := getName(node)
96 if name == "br" {
97 return "\n", true
98 }
99
100 // if the content is empty, try our luck with the next node
101 if strings.TrimSpace(text) == "" {
102 continue
103 }
104
105 // if you have "a a a", three elements that are trimmed, then only add
106 // a space to one side, since the other's are also adding a space.
107 if isTrimmedElement(name) {
108 text = " "
109 }
110
111 return text, true
112 }
113 return "", false
114}
115
116// AddSpaceIfNessesary adds spaces to the text based on the neighbors.
117// That makes sure that there is always a space to the side, to recognize the delimiter.
118func AddSpaceIfNessesary(selec *goquery.Selection, markdown string) string {
119 if len(selec.Nodes) == 0 {
120 return markdown
121 }
122 rootNode := selec.Nodes[0]
123
124 prev, hasPrev := getPrevNodeText(rootNode.PrevSibling)
125 if hasPrev {
126 lastChar, size := utf8.DecodeLastRuneInString(prev)
127 if size > 0 && !unicode.IsSpace(lastChar) {
128 markdown = " " + markdown
129 }
130 }
131
132 next, hasNext := getNextNodeText(rootNode.NextSibling)
133 if hasNext {
134 firstChar, size := utf8.DecodeRuneInString(next)
135 if size > 0 && !unicode.IsSpace(firstChar) && !unicode.IsPunct(firstChar) {
136 markdown = markdown + " "
137 }
138 }
139
140 return markdown
141}
142
// isLineCodeDelimiter reports whether the line starts with three backticks,
// i.e. whether it opens or closes a fenced code block.
func isLineCodeDelimiter(chars []rune) bool {
	if len(chars) < 3 {
		return false
	}

	// TODO: If it starts with 4 (instead of 3) fence characters, we should only end it
	// if we see the same amount of ending fence characters.
	for _, r := range chars[:3] {
		if r != '`' {
			return false
		}
	}
	return true
}

// TrimpLeadingSpaces removes spaces from the beginning of a line
// but makes sure that list items and code blocks are not affected.
//
// The text is processed line by line (split on "\n"):
//   - lines between two fence lines (see isLineCodeDelimiter), and the
//     opening fence line itself, are left untouched;
//   - lines whose first non-space character is '-' are treated as list
//     items and left untouched;
//   - lines indented by an equivalent of four or more columns (a tab counts
//     as four, every other space character as one) are treated as indented
//     code blocks and left untouched;
//   - lines consisting only of whitespace are left untouched;
//   - every other line has its leading whitespace removed.
func TrimpLeadingSpaces(text string) string {
	var insideCodeBlock bool

	lines := strings.Split(text, "\n")
	for index, line := range lines {
		chars := []rune(line)

		// every fence line toggles between "inside" and "outside"
		if isLineCodeDelimiter(chars) {
			insideCodeBlock = !insideCodeBlock
		}
		if insideCodeBlock {
			// We are inside a code block and don't want to
			// disturb that formatting (e.g. python indentation)
			continue
		}

		var spaces int
		for i, char := range chars {
			if unicode.IsSpace(char) {
				// The escape sequence is used on purpose: a literal tab inside
				// the rune literal is easily turned into a plain space by
				// editors, which makes every single space count as four.
				if char == '\t' {
					spaces += 4
				} else {
					spaces++
				}
				continue
			}

			// this seems to be a list item
			if char == '-' {
				break
			}

			// this seems to be a code block
			if spaces >= 4 {
				break
			}

			// remove the space characters from the string
			chars = chars[i:]
			break
		}
		lines[index] = string(chars)
	}

	return strings.Join(lines, "\n")
}
207
// TrimTrailingSpaces strips all trailing whitespace (as defined by
// unicode.IsSpace) from every line of text. Lines are separated by "\n".
func TrimTrailingSpaces(text string) string {
	lines := strings.Split(text, "\n")
	for idx, line := range lines {
		lines[idx] = strings.TrimRightFunc(line, unicode.IsSpace)
	}
	return strings.Join(lines, "\n")
}
220
// multipleNewLinesInLinkRegex matches one or more consecutive escaped
// new lines, i.e. repetitions of a new line followed by a backslash, as
// they appear inside a link after escaping.
var multipleNewLinesInLinkRegex = regexp.MustCompile(`(\n\\){1,}`)

// EscapeMultiLine deals with multiline content inside a link.
//
// The content is trimmed, every new line is prefixed with a backslash and
// runs of consecutive escaped new lines are collapsed into a single one.
func EscapeMultiLine(content string) string {
	trimmed := strings.TrimSpace(content)
	escaped := strings.ReplaceAll(trimmed, "\n", `\`+"\n")

	return multipleNewLinesInLinkRegex.ReplaceAllString(escaped, "\n\\")
}
233
// calculateCodeFenceOccurrences returns the length of the longest run of
// consecutive fenceChar characters inside content, or 0 if fenceChar does
// not occur at all.
func calculateCodeFenceOccurrences(fenceChar rune, content string) int {
	var longest, current int

	for _, r := range content {
		if r != fenceChar {
			// the run (if there was one) has ended
			current = 0
			continue
		}

		current++
		if current > longest {
			longest = current
		}
	}

	return longest
}
256
257// CalculateCodeFence can be passed the content of a code block and it returns
258// how many fence characters (` or ~) should be used.
259//
260// This is useful if the html content includes the same fence characters
261// for example ```
262// -> https://stackoverflow.com/a/49268657
263func CalculateCodeFence(fenceChar rune, content string) string {
264 repeat := calculateCodeFenceOccurrences(fenceChar, content)
265
266 // the outer fence block always has to have
267 // at least one character more than any content inside
268 repeat++
269
270 // you have to have at least three fence characters
271 // to be recognized as a code block
272 if repeat < 3 {
273 repeat = 3
274 }
275
276 return strings.Repeat(string(fenceChar), repeat)
277}
278
// findMax returns the largest value of a, or 0 if a is empty.
func findMax(a []int) (max int) {
	if len(a) == 0 {
		return 0
	}

	max = a[0]
	for _, candidate := range a[1:] {
		if candidate > max {
			max = candidate
		}
	}
	return max
}
291
292func getCodeWithoutTags(startNode *html.Node) []byte {
293 var buf bytes.Buffer
294
295 var f func(*html.Node)
296 f = func(n *html.Node) {
297 if n.Type == html.ElementNode && (n.Data == "style" || n.Data == "script" || n.Data == "textarea") {
298 return
299 }
300 if n.Type == html.ElementNode && (n.Data == "br" || n.Data == "div") {
301 buf.WriteString("\n")
302 }
303
304 if n.Type == html.TextNode {
305 buf.WriteString(n.Data)
306 return
307 }
308
309 for c := n.FirstChild; c != nil; c = c.NextSibling {
310 f(c)
311 }
312 }
313
314 f(startNode)
315
316 return buf.Bytes()
317}
318
319// getCodeContent gets the content of pre/code and unescapes the encoded characters.
320// Returns "" if there is an error.
321func getCodeContent(selec *goquery.Selection) string {
322 if len(selec.Nodes) == 0 {
323 return ""
324 }
325
326 code := getCodeWithoutTags(selec.Nodes[0])
327
328 return string(code)
329}
330
// delimiterForEveryLine wraps every line of text that has content in the
// delimiter, instead of only putting it at the start and end of the text.
// Lines with content are trimmed before being wrapped; lines that are
// empty or only whitespace are kept exactly as they are.
//
// Otherwise the bold/italic delimiters won't be recognized if the text
// contains new line characters.
func delimiterForEveryLine(text string, delimiter string) string {
	rows := strings.Split(text, "\n")

	for idx := range rows {
		if content := strings.TrimSpace(rows[idx]); content != "" {
			rows[idx] = delimiter + content + delimiter
		}
	}

	return strings.Join(rows, "\n")
}
349
350// isWrapperListItem returns wether the list item has own
351// content or is just a wrapper for another list.
352// e.g. "<li><ul>..."
353func isWrapperListItem(s *goquery.Selection) bool {
354 directText := s.Contents().Not("ul").Not("ol").Text()
355
356 noOwnText := strings.TrimSpace(directText) == ""
357 childIsList := s.ChildrenFiltered("ul").Length() > 0 || s.ChildrenFiltered("ol").Length() > 0
358
359 return noOwnText && childIsList
360}
361
362// getListStart returns the integer from which the counting
363// for for the list items should start from.
364// -> https://developer.mozilla.org/en-US/docs/Web/HTML/Element/ol#start
365func getListStart(parent *goquery.Selection) int {
366 val := parent.AttrOr("start", "")
367 if val == "" {
368 return 1
369 }
370
371 num, err := strconv.Atoi(val)
372 if err != nil {
373 return 1
374 }
375
376 if num < 0 {
377 return 1
378 }
379 return num
380}
381
382// getListPrefix returns the appropriate prefix for the list item.
383// For example "- ", "* ", "1. ", "01. ", ...
384func getListPrefix(opt *Options, s *goquery.Selection) string {
385 if isWrapperListItem(s) {
386 return ""
387 }
388
389 parent := s.Parent()
390 if parent.Is("ul") {
391 return opt.BulletListMarker + " "
392 } else if parent.Is("ol") {
393 start := getListStart(parent)
394 currentIndex := start + s.Index()
395
396 lastIndex := parent.Children().Last().Index() + 1
397 maxLength := len(strconv.Itoa(lastIndex))
398
399 // pad the numbers so that all prefix numbers in the list take up the same space
400 // `%02d.` -> "01. "
401 format := `%0` + strconv.Itoa(maxLength) + `d. `
402 return fmt.Sprintf(format, currentIndex)
403 }
404 // If the HTML is malformed and the list element isn't in a ul or ol, return no prefix
405 return ""
406}
407
408// countListParents counts how much space is reserved for the prefixes at all the parent lists.
409// This is useful to calculate the correct level of indentation for nested lists.
410func countListParents(opt *Options, selec *goquery.Selection) (int, int) {
411 var values []int
412 for n := selec.Parent(); n != nil; n = n.Parent() {
413 if n.Is("li") {
414 continue
415 }
416 if !n.Is("ul") && !n.Is("ol") {
417 break
418 }
419
420 prefix := n.Children().First().AttrOr(attrListPrefix, "")
421
422 values = append(values, len(prefix))
423 }
424
425 // how many spaces are reserved for the prefixes of my siblings
426 var prefixCount int
427
428 // how many spaces are reserved in total for all of the other
429 // list parents up the tree
430 var previousPrefixCounts int
431
432 for i, val := range values {
433 if i == 0 {
434 prefixCount = val
435 continue
436 }
437
438 previousPrefixCounts += val
439 }
440
441 return prefixCount, previousPrefixCounts
442}
443
444// IndentMultiLineListItem makes sure that multiline list items
445// are properly indented.
446func IndentMultiLineListItem(opt *Options, text string, spaces int) string {
447 parts := strings.Split(text, "\n")
448 for i := range parts {
449 // dont touch the first line since its indented through the prefix
450 if i == 0 {
451 continue
452 }
453
454 if isListItem(opt, parts[i]) {
455 return strings.Join(parts, "\n")
456 }
457
458 indent := strings.Repeat(" ", spaces)
459 parts[i] = indent + parts[i]
460 }
461
462 return strings.Join(parts, "\n")
463}
464
465// isListItem checks wether the line is a markdown list item
466func isListItem(opt *Options, line string) bool {
467 b := []rune(line)
468
469 bulletMarker := []rune(opt.BulletListMarker)[0]
470
471 var hasNumber bool
472 var hasMarker bool
473 var hasSpace bool
474
475 for i := 0; i < len(b); i++ {
476 // A marker followed by a space qualifies as a list item
477 if hasMarker && hasSpace {
478 if b[i] == bulletMarker {
479 // But if another BulletListMarker is found, it
480 // might be a HorizontalRule
481 return false
482 }
483
484 if !unicode.IsSpace(b[i]) {
485 // Now we have some text
486 return true
487 }
488 }
489
490 if hasMarker {
491 if unicode.IsSpace(b[i]) {
492 hasSpace = true
493 continue
494 }
495 // A marker like "1." that is not immediately followed by a space
496 // is probably a false positive
497 return false
498 }
499
500 if b[i] == bulletMarker {
501 hasMarker = true
502 continue
503 }
504
505 if hasNumber && b[i] == '.' {
506 hasMarker = true
507 continue
508 }
509 if unicode.IsDigit(b[i]) {
510 hasNumber = true
511 continue
512 }
513
514 if unicode.IsSpace(b[i]) {
515 continue
516 }
517
518 // If we encouter any other character
519 // before finding an indicator, its
520 // not a list item
521 return false
522 }
523 return false
524}
525
526// IndexWithText is similar to goquery's Index function but
527// returns the index of the current element while
528// NOT counting the empty elements beforehand.
529func IndexWithText(s *goquery.Selection) int {
530 return s.PrevAll().FilterFunction(func(i int, s *goquery.Selection) bool {
531 return strings.TrimSpace(s.Text()) != ""
532 }).Length()
533}