utils.go

  1package md
  2
  3import (
  4	"bytes"
  5	"fmt"
  6	"regexp"
  7	"strconv"
  8	"strings"
  9	"unicode"
 10	"unicode/utf8"
 11
 12	"github.com/PuerkitoBio/goquery"
 13	"golang.org/x/net/html"
 14)
 15
 16/*
 17WARNING: The functions from this file can be used externally
but there is no guarantee that they will stay exported.
 19*/
 20
 21// CollectText returns the text of the node and all its children
 22func CollectText(n *html.Node) string {
 23	text := &bytes.Buffer{}
 24	collectText(n, text)
 25	return text.String()
 26}
 27func collectText(n *html.Node, buf *bytes.Buffer) {
 28	if n.Type == html.TextNode {
 29		buf.WriteString(n.Data)
 30	}
 31	for c := n.FirstChild; c != nil; c = c.NextSibling {
 32		collectText(c, buf)
 33	}
 34}
 35
 36func getName(node *html.Node) string {
 37	selec := &goquery.Selection{Nodes: []*html.Node{node}}
 38	return goquery.NodeName(selec)
 39}
 40
 41// What elements automatically trim their content?
 42// Don't add another space if the other element is going to add a
 43// space already.
 44func isTrimmedElement(name string) bool {
 45	nodes := []string{
 46		"a",
 47		"strong", "b",
 48		"i", "em",
 49		"del", "s", "strike",
 50		"code",
 51	}
 52
 53	for _, node := range nodes {
 54		if name == node {
 55			return true
 56		}
 57	}
 58	return false
 59}
 60
 61func getPrevNodeText(node *html.Node) (string, bool) {
 62	if node == nil {
 63		return "", false
 64	}
 65
 66	for ; node != nil; node = node.PrevSibling {
 67		text := CollectText(node)
 68
 69		name := getName(node)
 70		if name == "br" {
 71			return "\n", true
 72		}
 73
 74		// if the content is empty, try our luck with the next node
 75		if strings.TrimSpace(text) == "" {
 76			continue
 77		}
 78
 79		if isTrimmedElement(name) {
 80			text = strings.TrimSpace(text)
 81		}
 82
 83		return text, true
 84	}
 85	return "", false
 86}
 87func getNextNodeText(node *html.Node) (string, bool) {
 88	if node == nil {
 89		return "", false
 90	}
 91
 92	for ; node != nil; node = node.NextSibling {
 93		text := CollectText(node)
 94
 95		name := getName(node)
 96		if name == "br" {
 97			return "\n", true
 98		}
 99
100		// if the content is empty, try our luck with the next node
101		if strings.TrimSpace(text) == "" {
102			continue
103		}
104
105		// if you have "a a a", three elements that are trimmed, then only add
106		// a space to one side, since the other's are also adding a space.
107		if isTrimmedElement(name) {
108			text = " "
109		}
110
111		return text, true
112	}
113	return "", false
114}
115
116// AddSpaceIfNessesary adds spaces to the text based on the neighbors.
117// That makes sure that there is always a space to the side, to recognize the delimiter.
118func AddSpaceIfNessesary(selec *goquery.Selection, markdown string) string {
119	if len(selec.Nodes) == 0 {
120		return markdown
121	}
122	rootNode := selec.Nodes[0]
123
124	prev, hasPrev := getPrevNodeText(rootNode.PrevSibling)
125	if hasPrev {
126		lastChar, size := utf8.DecodeLastRuneInString(prev)
127		if size > 0 && !unicode.IsSpace(lastChar) {
128			markdown = " " + markdown
129		}
130	}
131
132	next, hasNext := getNextNodeText(rootNode.NextSibling)
133	if hasNext {
134		firstChar, size := utf8.DecodeRuneInString(next)
135		if size > 0 && !unicode.IsSpace(firstChar) && !unicode.IsPunct(firstChar) {
136			markdown = markdown + " "
137		}
138	}
139
140	return markdown
141}
142
// leadingFenceRun inspects the start of a line and reports which fence
// character ('`' or '~') it begins with and how many of them directly
// follow each other. It returns (0, 0) when the line does not begin with
// at least three identical fence characters.
func leadingFenceRun(chars []rune) (rune, int) {
	if len(chars) < 3 {
		return 0, 0
	}

	fence := chars[0]
	if fence != '`' && fence != '~' {
		return 0, 0
	}

	length := 0
	for length < len(chars) && chars[length] == fence {
		length++
	}
	if length < 3 {
		return 0, 0
	}
	return fence, length
}

// isLineCodeDelimiter reports whether the line starts with a code fence,
// that is with three or more backticks or three or more tildes.
func isLineCodeDelimiter(chars []rune) bool {
	_, length := leadingFenceRun(chars)
	return length > 0
}

// TrimpLeadingSpaces removes spaces from the beginning of a line
// but makes sure that list items and code blocks are not affected.
//
// Lines are left untouched when they
//   - are inside a fenced code block (including the opening fence line),
//   - start (after the whitespace) with '-', which looks like a list item,
//   - are indented by four or more columns (a tab counts as four), which
//     looks like an indented code block.
//
// A fenced code block is closed only by a line that starts with the same
// fence character as the opening fence and with at least as many of them.
// That way a block opened with "````" is not ended prematurely by a "```"
// line in its content, and a "~~~" block is not ended by backticks.
func TrimpLeadingSpaces(text string) string {
	// openFenceLen == 0 means that we are currently outside a code block.
	var (
		openFence    rune
		openFenceLen int
	)

	lines := strings.Split(text, "\n")
	for index := range lines {
		chars := []rune(lines[index])

		if fence, length := leadingFenceRun(chars); length > 0 {
			switch {
			case openFenceLen == 0:
				// start the code block
				openFence, openFenceLen = fence, length
			case fence == openFence && length >= openFenceLen:
				// end the code block
				openFenceLen = 0
			}
		}
		if openFenceLen != 0 {
			// We are inside a code block and don't want to
			// disturb that formatting (e.g. python indentation)
			continue
		}

		var spaces int
		for i, char := range chars {
			if unicode.IsSpace(char) {
				if char == '\t' {
					spaces += 4
				} else {
					spaces++
				}
				continue
			}

			// this seems to be a list item
			if char == '-' {
				break
			}

			// this seems to be a code block
			if spaces >= 4 {
				break
			}

			// remove the space characters from the string
			chars = chars[i:]
			break
		}
		lines[index] = string(chars)
	}

	return strings.Join(lines, "\n")
}
207
// TrimTrailingSpaces strips every trailing whitespace character (as
// defined by unicode.IsSpace) from each line of text. Lines are split and
// re-joined on "\n", so the number of lines stays the same.
func TrimTrailingSpaces(text string) string {
	lines := strings.Split(text, "\n")
	for idx, line := range lines {
		lines[idx] = strings.TrimRightFunc(line, unicode.IsSpace)
	}
	return strings.Join(lines, "\n")
}
220
// multipleNewLinesInLinkRegex matches a run of one or more escaped line
// breaks, each being a newline that is directly followed by a backslash.
var multipleNewLinesInLinkRegex = regexp.MustCompile(`(\n\\){1,}`) // `([\n\r\s]\\)`

// EscapeMultiLine deals with multiline content inside a link.
//
// The content is trimmed, every newline gets a backslash placed in front
// of it, and runs of consecutive escaped line breaks are then collapsed
// into a single one.
func EscapeMultiLine(content string) string {
	trimmed := strings.TrimSpace(content)
	escaped := strings.ReplaceAll(trimmed, "\n", "\\\n")
	return multipleNewLinesInLinkRegex.ReplaceAllString(escaped, "\n\\")
}
233
// calculateCodeFenceOccurrences returns the length of the longest run of
// directly consecutive fenceChar characters in content, or 0 when the
// content does not contain fenceChar at all.
func calculateCodeFenceOccurrences(fenceChar rune, content string) int {
	longest, current := 0, 0

	for _, r := range content {
		if r != fenceChar {
			// the run (if there was one) is over
			current = 0
			continue
		}

		current++
		if current > longest {
			longest = current
		}
	}

	return longest
}
256
257// CalculateCodeFence can be passed the content of a code block and it returns
258// how many fence characters (` or ~) should be used.
259//
260// This is useful if the html content includes the same fence characters
261// for example ```
262// -> https://stackoverflow.com/a/49268657
263func CalculateCodeFence(fenceChar rune, content string) string {
264	repeat := calculateCodeFenceOccurrences(fenceChar, content)
265
266	// the outer fence block always has to have
267	// at least one character more than any content inside
268	repeat++
269
270	// you have to have at least three fence characters
271	// to be recognized as a code block
272	if repeat < 3 {
273		repeat = 3
274	}
275
276	return strings.Repeat(string(fenceChar), repeat)
277}
278
// findMax returns the largest value in a, or 0 when a is empty.
func findMax(a []int) (max int) {
	if len(a) == 0 {
		return 0
	}

	max = a[0]
	for _, candidate := range a[1:] {
		if candidate > max {
			max = candidate
		}
	}
	return max
}
291
292func getCodeWithoutTags(startNode *html.Node) []byte {
293	var buf bytes.Buffer
294
295	var f func(*html.Node)
296	f = func(n *html.Node) {
297		if n.Type == html.ElementNode && (n.Data == "style" || n.Data == "script" || n.Data == "textarea") {
298			return
299		}
300		if n.Type == html.ElementNode && (n.Data == "br" || n.Data == "div") {
301			buf.WriteString("\n")
302		}
303
304		if n.Type == html.TextNode {
305			buf.WriteString(n.Data)
306			return
307		}
308
309		for c := n.FirstChild; c != nil; c = c.NextSibling {
310			f(c)
311		}
312	}
313
314	f(startNode)
315
316	return buf.Bytes()
317}
318
319// getCodeContent gets the content of pre/code and unescapes the encoded characters.
320// Returns "" if there is an error.
321func getCodeContent(selec *goquery.Selection) string {
322	if len(selec.Nodes) == 0 {
323		return ""
324	}
325
326	code := getCodeWithoutTags(selec.Nodes[0])
327
328	return string(code)
329}
330
// delimiterForEveryLine wraps every line of text that has content in the
// delimiter, instead of only placing it at the very start and end.
//
// Bold/italic delimiters are otherwise not recognized when the text
// contains new line characters. Lines with content are trimmed before
// being wrapped; blank lines are kept exactly as they are.
func delimiterForEveryLine(text string, delimiter string) string {
	lines := strings.Split(text, "\n")

	for idx := range lines {
		if content := strings.TrimSpace(lines[idx]); content != "" {
			lines[idx] = delimiter + content + delimiter
		}
	}

	return strings.Join(lines, "\n")
}
349
350// isWrapperListItem returns wether the list item has own
351// content or is just a wrapper for another list.
352// e.g. "<li><ul>..."
353func isWrapperListItem(s *goquery.Selection) bool {
354	directText := s.Contents().Not("ul").Not("ol").Text()
355
356	noOwnText := strings.TrimSpace(directText) == ""
357	childIsList := s.ChildrenFiltered("ul").Length() > 0 || s.ChildrenFiltered("ol").Length() > 0
358
359	return noOwnText && childIsList
360}
361
362// getListStart returns the integer from which the counting
363// for for the list items should start from.
364// -> https://developer.mozilla.org/en-US/docs/Web/HTML/Element/ol#start
365func getListStart(parent *goquery.Selection) int {
366	val := parent.AttrOr("start", "")
367	if val == "" {
368		return 1
369	}
370
371	num, err := strconv.Atoi(val)
372	if err != nil {
373		return 1
374	}
375
376	if num < 0 {
377		return 1
378	}
379	return num
380}
381
382// getListPrefix returns the appropriate prefix for the list item.
383// For example "- ", "* ", "1. ", "01. ", ...
384func getListPrefix(opt *Options, s *goquery.Selection) string {
385	if isWrapperListItem(s) {
386		return ""
387	}
388
389	parent := s.Parent()
390	if parent.Is("ul") {
391		return opt.BulletListMarker + " "
392	} else if parent.Is("ol") {
393		start := getListStart(parent)
394		currentIndex := start + s.Index()
395
396		lastIndex := parent.Children().Last().Index() + 1
397		maxLength := len(strconv.Itoa(lastIndex))
398
399		// pad the numbers so that all prefix numbers in the list take up the same space
400		// `%02d.` -> "01. "
401		format := `%0` + strconv.Itoa(maxLength) + `d. `
402		return fmt.Sprintf(format, currentIndex)
403	}
404	// If the HTML is malformed and the list element isn't in a ul or ol, return no prefix
405	return ""
406}
407
408// countListParents counts how much space is reserved for the prefixes at all the parent lists.
409// This is useful to calculate the correct level of indentation for nested lists.
410func countListParents(opt *Options, selec *goquery.Selection) (int, int) {
411	var values []int
412	for n := selec.Parent(); n != nil; n = n.Parent() {
413		if n.Is("li") {
414			continue
415		}
416		if !n.Is("ul") && !n.Is("ol") {
417			break
418		}
419
420		prefix := n.Children().First().AttrOr(attrListPrefix, "")
421
422		values = append(values, len(prefix))
423	}
424
425	// how many spaces are reserved for the prefixes of my siblings
426	var prefixCount int
427
428	// how many spaces are reserved in total for all of the other
429	// list parents up the tree
430	var previousPrefixCounts int
431
432	for i, val := range values {
433		if i == 0 {
434			prefixCount = val
435			continue
436		}
437
438		previousPrefixCounts += val
439	}
440
441	return prefixCount, previousPrefixCounts
442}
443
444// IndentMultiLineListItem makes sure that multiline list items
445// are properly indented.
446func IndentMultiLineListItem(opt *Options, text string, spaces int) string {
447	parts := strings.Split(text, "\n")
448	for i := range parts {
449		// dont touch the first line since its indented through the prefix
450		if i == 0 {
451			continue
452		}
453
454		if isListItem(opt, parts[i]) {
455			return strings.Join(parts, "\n")
456		}
457
458		indent := strings.Repeat(" ", spaces)
459		parts[i] = indent + parts[i]
460	}
461
462	return strings.Join(parts, "\n")
463}
464
// isListItem checks whether the line is a markdown list item.
//
// The line is scanned rune by rune. Leading whitespace is skipped, then a
// marker is expected: either the first rune of opt.BulletListMarker or one
// or more digits followed by '.'. The marker has to be followed by
// whitespace and then by some text that does not start with the bullet
// marker again. Only then true is returned; in every other case
// (including a line that ends before any text is found) the result is
// false.
//
// NOTE: indexing the first rune panics if opt.BulletListMarker is empty.
func isListItem(opt *Options, line string) bool {
	b := []rune(line)

	// only the first rune of the configured marker is compared
	bulletMarker := []rune(opt.BulletListMarker)[0]

	// hasNumber: a digit was seen before the marker
	// hasMarker: the bullet marker or the '.' after a number was seen
	// hasSpace:  whitespace was seen after the marker
	var hasNumber bool
	var hasMarker bool
	var hasSpace bool

	for i := 0; i < len(b); i++ {
		// A marker followed by a space qualifies as a list item
		if hasMarker && hasSpace {
			if b[i] == bulletMarker {
				// But if another BulletListMarker is found, it
				// might be a HorizontalRule
				return false
			}

			if !unicode.IsSpace(b[i]) {
				// Now we have some text
				return true
			}
		}

		if hasMarker {
			if unicode.IsSpace(b[i]) {
				hasSpace = true
				continue
			}
			// A marker like "1." that is not immediately followed by a space
			// is probably a false positive
			return false
		}

		if b[i] == bulletMarker {
			hasMarker = true
			continue
		}

		// a '.' only counts as a marker after at least one digit
		if hasNumber && b[i] == '.' {
			hasMarker = true
			continue
		}
		if unicode.IsDigit(b[i]) {
			hasNumber = true
			continue
		}

		// whitespace before the marker (indentation) is ignored
		if unicode.IsSpace(b[i]) {
			continue
		}

		// If we encounter any other character
		// before finding an indicator, it's
		// not a list item
		return false
	}
	// the line ended without any text after a marker
	return false
}
525
526// IndexWithText is similar to goquery's Index function but
527// returns the index of the current element while
528// NOT counting the empty elements beforehand.
529func IndexWithText(s *goquery.Selection) int {
530	return s.PrevAll().FilterFunction(func(i int, s *goquery.Selection) bool {
531		return strings.TrimSpace(s.Text()) != ""
532	}).Length()
533}