1package md
2
3import (
4 "bytes"
5 "log"
6 "net/url"
7 "regexp"
8 "strings"
9
10 "github.com/PuerkitoBio/goquery"
11 "golang.org/x/net/html"
12)
13
14var (
15 ruleDefault = func(content string, selec *goquery.Selection, opt *Options) *string {
16 return &content
17 }
18 ruleKeep = func(content string, selec *goquery.Selection, opt *Options) *string {
19 element := selec.Get(0)
20
21 var buf bytes.Buffer
22 err := html.Render(&buf, element)
23 if err != nil {
24 log.Println("[JohannesKaufmann/html-to-markdown] ruleKeep: error while rendering the element to html:", err)
25 return String("")
26 }
27
28 return String(buf.String())
29 }
30)
31
// inlineElements holds the node names that IsInlineElement treats as html
// inline elements.
// -> https://developer.mozilla.org/de/docs/Web/HTML/Inline_elemente
var inlineElements = []string{
	// presentational markup
	"b", "big", "i", "small", "tt",

	// phrase elements
	"abbr", "acronym", "cite", "code", "dfn", "em", "kbd", "strong", "samp", "var",

	// links, embedded content and other inline containers
	"a", "bdo", "br", "img", "map", "object", "q", "script", "span", "sub", "sup",

	// form controls
	"button", "input", "label", "select", "textarea",
}
38
39// IsInlineElement can be used to check wether a node name (goquery.Nodename) is
40// an html inline element and not a block element. Used in the rule for the
41// p tag to check wether the text is inside a block element.
42func IsInlineElement(e string) bool {
43 for _, element := range inlineElements {
44 if element == e {
45 return true
46 }
47 }
48 return false
49}
50
// String is a helper that returns a pointer to a copy of text, which is handy
// for functions that have to return a *string.
func String(text string) *string {
	ptr := new(string)
	*ptr = text
	return ptr
}
55
// Options to customize the output. You can change things like
// the character that is used for strong text.
type Options struct {
	// HeadingStyle is either "setext" or "atx".
	// default: "atx"
	HeadingStyle string

	// HorizontalRule can be any thematic break.
	// default: "* * *"
	HorizontalRule string

	// BulletListMarker is "-", "+", or "*".
	// default: "-"
	BulletListMarker string

	// CodeBlockStyle is either "indented" or "fenced".
	// default: "indented"
	CodeBlockStyle string

	// Fence is either ``` or ~~~
	// default: ```
	Fence string

	// EmDelimiter is either _ or *
	// default: _
	EmDelimiter string

	// StrongDelimiter is either ** or __
	// default: **
	StrongDelimiter string

	// LinkStyle is either inlined or referenced.
	// default: inlined
	LinkStyle string

	// LinkReferenceStyle is full, collapsed, or shortcut.
	// default: full
	LinkReferenceStyle string

	// EscapeMode is either basic or disabled.
	// default: basic
	EscapeMode string

	// domain is unexported, so it cannot be set from outside the package.
	// NOTE(review): presumably the domain that is handed to GetAbsoluteURL;
	// confirm against the code that fills it.
	domain string

	// GetAbsoluteURL parses the `rawURL` and adds the `domain` to convert relative (/page.html)
	// urls to absolute urls (http://domain.com/page.html).
	//
	// The default is `DefaultGetAbsoluteURL`, unless you override it. Overriding can also
	// be useful if you want to proxy the images.
	GetAbsoluteURL func(selec *goquery.Selection, rawURL string, domain string) string

	// GetCodeBlockLanguage identifies the language for syntax highlighting
	// of a code block. The default is `DefaultGetCodeBlockLanguage`, which
	// only gets the attribute x from the selection.
	//
	// You can override it if you want more results, for example by using
	// lexers.Analyse(content) from github.com/alecthomas/chroma
	//
	// The field is not available yet; its declaration is kept commented out
	// until it is implemented.
	// TODO: implement
	// GetCodeBlockLanguage func(s *goquery.Selection, content string) string
}
117
118// DefaultGetAbsoluteURL is the default function and can be overridden through `GetAbsoluteURL` in the options.
119func DefaultGetAbsoluteURL(selec *goquery.Selection, rawURL string, domain string) string {
120 if domain == "" {
121 return rawURL
122 }
123
124 u, err := url.Parse(rawURL)
125 if err != nil {
126 // we can't do anything with this url because it is invalid
127 return rawURL
128 }
129
130 if u.Scheme == "data" {
131 // this is a data uri (for example an inline base64 image)
132 return rawURL
133 }
134
135 if u.Scheme == "" {
136 u.Scheme = "http"
137 }
138 if u.Host == "" {
139 u.Host = domain
140 }
141
142 return u.String()
143}
144
// AdvancedResult is what a rule can return when plain markdown is not enough.
// It is used for example for links: with LinkStyle:referenced the link href is
// placed at the bottom of the generated markdown (Footer).
type AdvancedResult struct {
	// Header is text collected separately from Markdown.
	// NOTE(review): presumably placed above the generated markdown, mirroring
	// Footer; confirm against the code that assembles the final output.
	Header string
	// Markdown is the converted markdown itself.
	Markdown string
	// Footer is text for the bottom of the generated markdown, for example
	// the hrefs of referenced links.
	Footer string
}
152
// Rule to convert certain html tags to markdown.
//
//	md.Rule{
//		Filter: []string{"del", "s", "strike"},
//		Replacement: func(content string, selec *goquery.Selection, opt *md.Options) *string {
//			// You need to return a pointer to a string (md.String is just a helper function).
//			// If you return nil the next function for that html element
//			// will be picked. For example you could only convert an element
//			// if it has a certain class name and fallback if not.
//			return md.String("~" + content + "~")
//		},
//	}
type Rule struct {
	// Filter lists the html tag names the rule applies to, as in the example
	// above.
	Filter []string
	// Replacement receives the converted content of the element, the element
	// itself and the options. It returns the markdown for the element, or nil
	// to let the next function for that html element be picked.
	Replacement func(content string, selec *goquery.Selection, options *Options) *string
	// AdvancedReplacement takes the same arguments as Replacement but returns
	// an AdvancedResult together with a skip flag.
	// NOTE(review): presumably skip=true plays the role of Replacement's nil
	// return and only one of the two functions is set per rule; confirm
	// against the code that registers rules.
	AdvancedReplacement func(content string, selec *goquery.Selection, options *Options) (res AdvancedResult, skip bool)
}
169
// Regular expressions for whitespace handling, compiled once at package level.
var (
	// leadingNewlinesR matches a run of newlines at the very start of the text.
	leadingNewlinesR = regexp.MustCompile(`^\n+`)
	// trailingNewlinesR matches a run of newlines at the very end of the text.
	trailingNewlinesR = regexp.MustCompile(`\n+$`)

	// newlinesR matches any run of one or more newlines.
	newlinesR = regexp.MustCompile(`\n+`)
	// tabR matches any run of one or more tabs.
	tabR = regexp.MustCompile(`\t+`)
	// indentR matches every single newline.
	indentR = regexp.MustCompile(`(?m)\n`)
)
176
177func (conv *Converter) selecToMD(domain string, selec *goquery.Selection, opt *Options) AdvancedResult {
178 var result AdvancedResult
179
180 var builder strings.Builder
181 selec.Contents().Each(func(i int, s *goquery.Selection) {
182 name := goquery.NodeName(s)
183 rules := conv.getRuleFuncs(name)
184
185 for i := len(rules) - 1; i >= 0; i-- {
186 rule := rules[i]
187
188 content := conv.selecToMD(domain, s, opt)
189 if content.Header != "" {
190 result.Header += content.Header
191 }
192 if content.Footer != "" {
193 result.Footer += content.Footer
194 }
195
196 res, skip := rule(content.Markdown, s, opt)
197 if res.Header != "" {
198 result.Header += res.Header + "\n"
199 }
200 if res.Footer != "" {
201 result.Footer += res.Footer + "\n"
202 }
203
204 if !skip {
205 builder.WriteString(res.Markdown)
206 return
207 }
208 }
209 })
210 result.Markdown = builder.String()
211 return result
212}