// Package md converts html to markdown.
//
//	converter := md.NewConverter("", true, nil)
//
//	html := `<strong>Important</strong>`
//
//	markdown, err := converter.ConvertString(html)
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Println("md ->", markdown)
//
// Or, if you are already using goquery, pass the selection directly
// (Convert returns only the markdown string):
//
//	markdown := converter.Convert(selec)
14package md
15
16import (
17 "bytes"
18 "errors"
19 "fmt"
20 "io"
21 "log"
22 "net/http"
23 "net/url"
24 "regexp"
25 "strconv"
26 "strings"
27 "sync"
28 "time"
29
30 "github.com/PuerkitoBio/goquery"
31)
32
33type simpleRuleFunc func(content string, selec *goquery.Selection, options *Options) *string
34type ruleFunc func(content string, selec *goquery.Selection, options *Options) (res AdvancedResult, skip bool)
35
36// BeforeHook runs before the converter and can be used to transform the original html
37type BeforeHook func(selec *goquery.Selection)
38
39// Afterhook runs after the converter and can be used to transform the resulting markdown
40type Afterhook func(markdown string) string
41
42// Converter is initialized by NewConverter.
43type Converter struct {
44 mutex sync.RWMutex
45 rules map[string][]ruleFunc
46 keep map[string]struct{}
47 remove map[string]struct{}
48
49 before []BeforeHook
50 after []Afterhook
51
52 domain string
53 options Options
54}
55
// validate reports whether val equals one of the possible values. It returns
// nil on a match and otherwise an error that lists the accepted values and
// the value that was given.
func validate(val string, possible ...string) error {
	for i := 0; i < len(possible); i++ {
		if possible[i] == val {
			return nil
		}
	}
	return fmt.Errorf("field must be one of %v but got %s", possible, val)
}
64func validateOptions(opt Options) error {
65 if err := validate(opt.HeadingStyle, "setext", "atx"); err != nil {
66 return err
67 }
68 if strings.Count(opt.HorizontalRule, "*") < 3 &&
69 strings.Count(opt.HorizontalRule, "_") < 3 &&
70 strings.Count(opt.HorizontalRule, "-") < 3 {
71 return errors.New("HorizontalRule must be at least 3 characters of '*', '_' or '-' but got " + opt.HorizontalRule)
72 }
73
74 if err := validate(opt.BulletListMarker, "-", "+", "*"); err != nil {
75 return err
76 }
77 if err := validate(opt.CodeBlockStyle, "indented", "fenced"); err != nil {
78 return err
79 }
80 if err := validate(opt.Fence, "```", "~~~"); err != nil {
81 return err
82 }
83 if err := validate(opt.EmDelimiter, "_", "*"); err != nil {
84 return err
85 }
86 if err := validate(opt.StrongDelimiter, "**", "__"); err != nil {
87 return err
88 }
89 if err := validate(opt.LinkStyle, "inlined", "referenced"); err != nil {
90 return err
91 }
92 if err := validate(opt.LinkReferenceStyle, "full", "collapsed", "shortcut"); err != nil {
93 return err
94 }
95
96 return nil
97}
98
// attrListPrefix is the name of the attribute in which the default before
// hook stores the computed list prefix of every `li` element.
var attrListPrefix = "data-converter-list-prefix"
102
103// NewConverter initializes a new converter and holds all the rules.
104// - `domain` is used for links and images to convert relative urls ("/image.png") to absolute urls.
105// - CommonMark is the default set of rules. Set enableCommonmark to false if you want
106// to customize everything using AddRules and DONT want to fallback to default rules.
107func NewConverter(domain string, enableCommonmark bool, options *Options) *Converter {
108 conv := &Converter{
109 domain: domain,
110 rules: make(map[string][]ruleFunc),
111 keep: make(map[string]struct{}),
112 remove: make(map[string]struct{}),
113 }
114
115 conv.before = append(conv.before, func(selec *goquery.Selection) {
116 selec.Find("a[href]").Each(func(i int, s *goquery.Selection) {
117 // TODO: don't hardcode "data-index" and rename it to avoid accidental conflicts
118 s.SetAttr("data-index", strconv.Itoa(i+1))
119 })
120 })
121 conv.before = append(conv.before, func(selec *goquery.Selection) {
122 selec.Find("li").Each(func(i int, s *goquery.Selection) {
123 prefix := getListPrefix(options, s)
124
125 s.SetAttr(attrListPrefix, prefix)
126 })
127 })
128 conv.after = append(conv.after, func(markdown string) string {
129 markdown = strings.TrimSpace(markdown)
130 markdown = multipleNewLinesRegex.ReplaceAllString(markdown, "\n\n")
131
132 // remove unnecessary trailing spaces to have clean markdown
133 markdown = TrimTrailingSpaces(markdown)
134
135 return markdown
136 })
137
138 if enableCommonmark {
139 conv.AddRules(commonmark...)
140 conv.remove["script"] = struct{}{}
141 conv.remove["style"] = struct{}{}
142 conv.remove["textarea"] = struct{}{}
143 }
144
145 // TODO: put domain in options?
146 if options == nil {
147 options = &Options{}
148 }
149 if options.HeadingStyle == "" {
150 options.HeadingStyle = "atx"
151 }
152 if options.HorizontalRule == "" {
153 options.HorizontalRule = "* * *"
154 }
155 if options.BulletListMarker == "" {
156 options.BulletListMarker = "-"
157 }
158 if options.CodeBlockStyle == "" {
159 options.CodeBlockStyle = "indented"
160 }
161 if options.Fence == "" {
162 options.Fence = "```"
163 }
164 if options.EmDelimiter == "" {
165 options.EmDelimiter = "_"
166 }
167 if options.StrongDelimiter == "" {
168 options.StrongDelimiter = "**"
169 }
170 if options.LinkStyle == "" {
171 options.LinkStyle = "inlined"
172 }
173 if options.LinkReferenceStyle == "" {
174 options.LinkReferenceStyle = "full"
175 }
176 if options.EscapeMode == "" {
177 options.EscapeMode = "basic"
178 }
179
180 // for now, store it in the options
181 options.domain = domain
182
183 if options.GetAbsoluteURL == nil {
184 options.GetAbsoluteURL = DefaultGetAbsoluteURL
185 }
186
187 conv.options = *options
188 err := validateOptions(conv.options)
189 if err != nil {
190 log.Println("markdown options is not valid:", err)
191 }
192
193 return conv
194}
195func (conv *Converter) getRuleFuncs(tag string) []ruleFunc {
196 conv.mutex.RLock()
197 defer conv.mutex.RUnlock()
198
199 r, ok := conv.rules[tag]
200 if !ok || len(r) == 0 {
201 if _, keep := conv.keep[tag]; keep {
202 return []ruleFunc{wrap(ruleKeep)}
203 }
204 if _, remove := conv.remove[tag]; remove {
205 return nil // TODO:
206 }
207
208 return []ruleFunc{wrap(ruleDefault)}
209 }
210
211 return r
212}
213
214func wrap(simple simpleRuleFunc) ruleFunc {
215 return func(content string, selec *goquery.Selection, opt *Options) (AdvancedResult, bool) {
216 res := simple(content, selec, opt)
217 if res == nil {
218 return AdvancedResult{}, true
219 }
220 return AdvancedResult{Markdown: *res}, false
221 }
222}
223
224// Before registers a hook that is run before the conversion. It
225// can be used to transform the original goquery html document.
226//
227// For example, the default before hook adds an index to every link,
228// so that the `a` tag rule (for "reference" "full") can have an incremental number.
229func (conv *Converter) Before(hooks ...BeforeHook) *Converter {
230 conv.mutex.Lock()
231 defer conv.mutex.Unlock()
232
233 for _, hook := range hooks {
234 conv.before = append(conv.before, hook)
235 }
236
237 return conv
238}
239
240// After registers a hook that is run after the conversion. It
241// can be used to transform the markdown document that is about to be returned.
242//
243// For example, the default after hook trims the returned markdown.
244func (conv *Converter) After(hooks ...Afterhook) *Converter {
245 conv.mutex.Lock()
246 defer conv.mutex.Unlock()
247
248 for _, hook := range hooks {
249 conv.after = append(conv.after, hook)
250 }
251
252 return conv
253}
254
255// ClearBefore clears the current before hooks (including the default before hooks).
256func (conv *Converter) ClearBefore() *Converter {
257 conv.mutex.Lock()
258 defer conv.mutex.Unlock()
259
260 conv.before = nil
261
262 return conv
263}
264
265// ClearAfter clears the current after hooks (including the default after hooks).
266func (conv *Converter) ClearAfter() *Converter {
267 conv.mutex.Lock()
268 defer conv.mutex.Unlock()
269
270 conv.after = nil
271
272 return conv
273}
274
275// AddRules adds the rules that are passed in to the converter.
276//
277// By default it overrides the rule for that html tag. You can
278// fall back to the default rule by returning nil.
279func (conv *Converter) AddRules(rules ...Rule) *Converter {
280 conv.mutex.Lock()
281 defer conv.mutex.Unlock()
282
283 for _, rule := range rules {
284 if len(rule.Filter) == 0 {
285 log.Println("you need to specify at least one filter for your rule")
286 }
287 for _, filter := range rule.Filter {
288 r, _ := conv.rules[filter]
289
290 if rule.AdvancedReplacement != nil {
291 r = append(r, rule.AdvancedReplacement)
292 } else {
293 r = append(r, wrap(rule.Replacement))
294 }
295 conv.rules[filter] = r
296 }
297 }
298
299 return conv
300}
301
302// Keep certain html tags in the generated output.
303func (conv *Converter) Keep(tags ...string) *Converter {
304 conv.mutex.Lock()
305 defer conv.mutex.Unlock()
306
307 for _, tag := range tags {
308 conv.keep[tag] = struct{}{}
309 }
310 return conv
311}
312
313// Remove certain html tags from the source.
314func (conv *Converter) Remove(tags ...string) *Converter {
315 conv.mutex.Lock()
316 defer conv.mutex.Unlock()
317 for _, tag := range tags {
318 conv.remove[tag] = struct{}{}
319 }
320 return conv
321}
322
323// Plugin can be used to extends functionality beyond what
324// is offered by commonmark.
325type Plugin func(conv *Converter) []Rule
326
327// Use can be used to add additional functionality to the converter. It is
328// used when its not sufficient to use only rules for example in Plugins.
329func (conv *Converter) Use(plugins ...Plugin) *Converter {
330 for _, plugin := range plugins {
331 rules := plugin(conv)
332 conv.AddRules(rules...) // TODO: for better performance only use one lock for all plugins
333 }
334 return conv
335}
336
var (
	// Timeout is the timeout that the package's http client is created with.
	//
	// The value is copied into netClient once, when the package is
	// initialized; assigning to Timeout afterwards does not change netClient.
	Timeout = 10 * time.Second

	// netClient is the shared http client of this package.
	netClient = &http.Client{Timeout: Timeout}
)
342
// DomainFromURL returns `u.Host` from the parsed url.
//
// Surrounding whitespace is trimmed first. If the url yields no host as
// given, it is parsed a second time with "http://" prepended. An empty
// string is returned when neither attempt produces a host.
func DomainFromURL(rawURL string) string {
	trimmed := strings.TrimSpace(rawURL)

	// First the url as given, then once more with a scheme added.
	for _, candidate := range [...]string{trimmed, "http://" + trimmed} {
		if u, err := url.Parse(candidate); err == nil && u.Host != "" {
			return u.Host
		}
	}

	return ""
}
360
var (
	// multipleNewLinesRegex matches a run of two or more newline characters
	// `\n`. The default after hook replaces each match with exactly two.
	multipleNewLinesRegex = regexp.MustCompile(`[\n]{2,}`)
)
363
364// Convert returns the content from a goquery selection.
365// If you have a goquery document just pass in doc.Selection.
366func (conv *Converter) Convert(selec *goquery.Selection) string {
367 conv.mutex.RLock()
368 domain := conv.domain
369 options := conv.options
370 l := len(conv.rules)
371 if l == 0 {
372 log.Println("you have added no rules. either enable commonmark or add you own.")
373 }
374 before := conv.before
375 after := conv.after
376 conv.mutex.RUnlock()
377
378 // before hook
379 for _, hook := range before {
380 hook(selec)
381 }
382
383 res := conv.selecToMD(domain, selec, &options)
384 markdown := res.Markdown
385
386 if res.Header != "" {
387 markdown = res.Header + "\n\n" + markdown
388 }
389 if res.Footer != "" {
390 markdown += "\n\n" + res.Footer
391 }
392
393 // after hook
394 for _, hook := range after {
395 markdown = hook(markdown)
396 }
397
398 return markdown
399}
400
401// ConvertReader returns the content from a reader and returns a buffer.
402func (conv *Converter) ConvertReader(reader io.Reader) (bytes.Buffer, error) {
403 var buffer bytes.Buffer
404 doc, err := goquery.NewDocumentFromReader(reader)
405 if err != nil {
406 return buffer, err
407 }
408 buffer.WriteString(
409 conv.Convert(doc.Selection),
410 )
411
412 return buffer, nil
413}
414
415// ConvertResponse returns the content from a html response.
416func (conv *Converter) ConvertResponse(res *http.Response) (string, error) {
417 doc, err := goquery.NewDocumentFromResponse(res)
418 if err != nil {
419 return "", err
420 }
421 return conv.Convert(doc.Selection), nil
422}
423
424// ConvertString returns the content from a html string. If you
425// already have a goquery selection use `Convert`.
426func (conv *Converter) ConvertString(html string) (string, error) {
427 doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
428 if err != nil {
429 return "", err
430 }
431 return conv.Convert(doc.Selection), nil
432}
433
434// ConvertBytes returns the content from a html byte array.
435func (conv *Converter) ConvertBytes(bytes []byte) ([]byte, error) {
436 res, err := conv.ConvertString(string(bytes))
437 if err != nil {
438 return nil, err
439 }
440 return []byte(res), nil
441}
442
443// ConvertURL returns the content from the page with that url.
444func (conv *Converter) ConvertURL(url string) (string, error) {
445 // not using goquery.NewDocument directly because of the timeout
446 resp, err := netClient.Get(url)
447 if err != nil {
448 return "", err
449 }
450
451 if resp.StatusCode < 200 || resp.StatusCode > 299 {
452 return "", fmt.Errorf("expected a status code in the 2xx range but got %d", resp.StatusCode)
453 }
454
455 doc, err := goquery.NewDocumentFromResponse(resp)
456 if err != nil {
457 return "", err
458 }
459 domain := DomainFromURL(url)
460 if conv.domain != domain {
461 log.Printf("expected '%s' as the domain but got '%s' \n", conv.domain, domain)
462 }
463 return conv.Convert(doc.Selection), nil
464}