from.go

// Package md converts html to markdown.
//
//  converter := md.NewConverter("", true, nil)
//
//  html := `<strong>Important</strong>`
//
//  markdown, err := converter.ConvertString(html)
//  if err != nil {
//    log.Fatal(err)
//  }
//  fmt.Println("md ->", markdown)
//
// Or if you are already using goquery:
//
//  markdown := converter.Convert(selec)
 14package md
 15
 16import (
 17	"bytes"
 18	"errors"
 19	"fmt"
 20	"io"
 21	"log"
 22	"net/http"
 23	"net/url"
 24	"regexp"
 25	"strconv"
 26	"strings"
 27	"sync"
 28	"time"
 29
 30	"github.com/PuerkitoBio/goquery"
 31)
 32
// simpleRuleFunc is the basic form of a rule: it receives the already
// converted content of an element, the element itself and the options, and
// returns a pointer to the resulting markdown. A nil result is translated by
// wrap into "skip this rule".
type simpleRuleFunc func(content string, selec *goquery.Selection, options *Options) *string

// ruleFunc is the form in which rules are stored on the Converter. It returns
// an AdvancedResult together with a skip flag.
type ruleFunc func(content string, selec *goquery.Selection, options *Options) (res AdvancedResult, skip bool)
 35
// BeforeHook runs before the conversion and can be used to transform the
// original html. It receives the selection that was passed to Convert.
type BeforeHook func(selec *goquery.Selection)
 38
// Afterhook runs after the conversion and can be used to transform the
// resulting markdown. The returned string replaces the markdown and is
// handed to the next after hook (or returned from Convert).
type Afterhook func(markdown string) string
 41
// Converter is initialized by NewConverter.
//
// It stores the registered rules, the tags to keep or remove, the before and
// after hooks, and the options used for a conversion.
type Converter struct {
	// mutex is taken by the methods that read or modify the fields below.
	mutex  sync.RWMutex
	// rules maps a filter (html tag name) to the rules registered for it.
	rules  map[string][]ruleFunc
	// keep is the set of tags registered through Keep.
	keep   map[string]struct{}
	// remove is the set of tags registered through Remove.
	remove map[string]struct{}

	// before and after hold the hooks run by Convert, in registration order.
	before []BeforeHook
	after  []Afterhook

	// domain is the domain passed to NewConverter.
	domain  string
	// options holds the options passed to NewConverter, with defaults filled in.
	options Options
}
 55
 56func validate(val string, possible ...string) error {
 57	for _, e := range possible {
 58		if e == val {
 59			return nil
 60		}
 61	}
 62	return fmt.Errorf("field must be one of %v but got %s", possible, val)
 63}
 64func validateOptions(opt Options) error {
 65	if err := validate(opt.HeadingStyle, "setext", "atx"); err != nil {
 66		return err
 67	}
 68	if strings.Count(opt.HorizontalRule, "*") < 3 &&
 69		strings.Count(opt.HorizontalRule, "_") < 3 &&
 70		strings.Count(opt.HorizontalRule, "-") < 3 {
 71		return errors.New("HorizontalRule must be at least 3 characters of '*', '_' or '-' but got " + opt.HorizontalRule)
 72	}
 73
 74	if err := validate(opt.BulletListMarker, "-", "+", "*"); err != nil {
 75		return err
 76	}
 77	if err := validate(opt.CodeBlockStyle, "indented", "fenced"); err != nil {
 78		return err
 79	}
 80	if err := validate(opt.Fence, "```", "~~~"); err != nil {
 81		return err
 82	}
 83	if err := validate(opt.EmDelimiter, "_", "*"); err != nil {
 84		return err
 85	}
 86	if err := validate(opt.StrongDelimiter, "**", "__"); err != nil {
 87		return err
 88	}
 89	if err := validate(opt.LinkStyle, "inlined", "referenced"); err != nil {
 90		return err
 91	}
 92	if err := validate(opt.LinkReferenceStyle, "full", "collapsed", "shortcut"); err != nil {
 93		return err
 94	}
 95
 96	return nil
 97}
 98
var (
	// attrListPrefix is the name of the attribute that the default before
	// hook sets on every `li` element to store its computed list prefix.
	attrListPrefix = "data-converter-list-prefix"
)
102
103// NewConverter initializes a new converter and holds all the rules.
104// - `domain` is used for links and images to convert relative urls ("/image.png") to absolute urls.
105// - CommonMark is the default set of rules. Set enableCommonmark to false if you want
106//   to customize everything using AddRules and DONT want to fallback to default rules.
107func NewConverter(domain string, enableCommonmark bool, options *Options) *Converter {
108	conv := &Converter{
109		domain: domain,
110		rules:  make(map[string][]ruleFunc),
111		keep:   make(map[string]struct{}),
112		remove: make(map[string]struct{}),
113	}
114
115	conv.before = append(conv.before, func(selec *goquery.Selection) {
116		selec.Find("a[href]").Each(func(i int, s *goquery.Selection) {
117			// TODO: don't hardcode "data-index" and rename it to avoid accidental conflicts
118			s.SetAttr("data-index", strconv.Itoa(i+1))
119		})
120	})
121	conv.before = append(conv.before, func(selec *goquery.Selection) {
122		selec.Find("li").Each(func(i int, s *goquery.Selection) {
123			prefix := getListPrefix(options, s)
124
125			s.SetAttr(attrListPrefix, prefix)
126		})
127	})
128	conv.after = append(conv.after, func(markdown string) string {
129		markdown = strings.TrimSpace(markdown)
130		markdown = multipleNewLinesRegex.ReplaceAllString(markdown, "\n\n")
131
132		// remove unnecessary trailing spaces to have clean markdown
133		markdown = TrimTrailingSpaces(markdown)
134
135		return markdown
136	})
137
138	if enableCommonmark {
139		conv.AddRules(commonmark...)
140		conv.remove["script"] = struct{}{}
141		conv.remove["style"] = struct{}{}
142		conv.remove["textarea"] = struct{}{}
143	}
144
145	// TODO: put domain in options?
146	if options == nil {
147		options = &Options{}
148	}
149	if options.HeadingStyle == "" {
150		options.HeadingStyle = "atx"
151	}
152	if options.HorizontalRule == "" {
153		options.HorizontalRule = "* * *"
154	}
155	if options.BulletListMarker == "" {
156		options.BulletListMarker = "-"
157	}
158	if options.CodeBlockStyle == "" {
159		options.CodeBlockStyle = "indented"
160	}
161	if options.Fence == "" {
162		options.Fence = "```"
163	}
164	if options.EmDelimiter == "" {
165		options.EmDelimiter = "_"
166	}
167	if options.StrongDelimiter == "" {
168		options.StrongDelimiter = "**"
169	}
170	if options.LinkStyle == "" {
171		options.LinkStyle = "inlined"
172	}
173	if options.LinkReferenceStyle == "" {
174		options.LinkReferenceStyle = "full"
175	}
176	if options.EscapeMode == "" {
177		options.EscapeMode = "basic"
178	}
179
180	// for now, store it in the options
181	options.domain = domain
182
183	if options.GetAbsoluteURL == nil {
184		options.GetAbsoluteURL = DefaultGetAbsoluteURL
185	}
186
187	conv.options = *options
188	err := validateOptions(conv.options)
189	if err != nil {
190		log.Println("markdown options is not valid:", err)
191	}
192
193	return conv
194}
195func (conv *Converter) getRuleFuncs(tag string) []ruleFunc {
196	conv.mutex.RLock()
197	defer conv.mutex.RUnlock()
198
199	r, ok := conv.rules[tag]
200	if !ok || len(r) == 0 {
201		if _, keep := conv.keep[tag]; keep {
202			return []ruleFunc{wrap(ruleKeep)}
203		}
204		if _, remove := conv.remove[tag]; remove {
205			return nil // TODO:
206		}
207
208		return []ruleFunc{wrap(ruleDefault)}
209	}
210
211	return r
212}
213
214func wrap(simple simpleRuleFunc) ruleFunc {
215	return func(content string, selec *goquery.Selection, opt *Options) (AdvancedResult, bool) {
216		res := simple(content, selec, opt)
217		if res == nil {
218			return AdvancedResult{}, true
219		}
220		return AdvancedResult{Markdown: *res}, false
221	}
222}
223
224// Before registers a hook that is run before the conversion. It
225// can be used to transform the original goquery html document.
226//
227// For example, the default before hook adds an index to every link,
228// so that the `a` tag rule (for "reference" "full") can have an incremental number.
229func (conv *Converter) Before(hooks ...BeforeHook) *Converter {
230	conv.mutex.Lock()
231	defer conv.mutex.Unlock()
232
233	for _, hook := range hooks {
234		conv.before = append(conv.before, hook)
235	}
236
237	return conv
238}
239
240// After registers a hook that is run after the conversion. It
241// can be used to transform the markdown document that is about to be returned.
242//
243// For example, the default after hook trims the returned markdown.
244func (conv *Converter) After(hooks ...Afterhook) *Converter {
245	conv.mutex.Lock()
246	defer conv.mutex.Unlock()
247
248	for _, hook := range hooks {
249		conv.after = append(conv.after, hook)
250	}
251
252	return conv
253}
254
255// ClearBefore clears the current before hooks (including the default before hooks).
256func (conv *Converter) ClearBefore() *Converter {
257	conv.mutex.Lock()
258	defer conv.mutex.Unlock()
259
260	conv.before = nil
261
262	return conv
263}
264
265// ClearAfter clears the current after hooks (including the default after hooks).
266func (conv *Converter) ClearAfter() *Converter {
267	conv.mutex.Lock()
268	defer conv.mutex.Unlock()
269
270	conv.after = nil
271
272	return conv
273}
274
275// AddRules adds the rules that are passed in to the converter.
276//
277// By default it overrides the rule for that html tag. You can
278// fall back to the default rule by returning nil.
279func (conv *Converter) AddRules(rules ...Rule) *Converter {
280	conv.mutex.Lock()
281	defer conv.mutex.Unlock()
282
283	for _, rule := range rules {
284		if len(rule.Filter) == 0 {
285			log.Println("you need to specify at least one filter for your rule")
286		}
287		for _, filter := range rule.Filter {
288			r, _ := conv.rules[filter]
289
290			if rule.AdvancedReplacement != nil {
291				r = append(r, rule.AdvancedReplacement)
292			} else {
293				r = append(r, wrap(rule.Replacement))
294			}
295			conv.rules[filter] = r
296		}
297	}
298
299	return conv
300}
301
302// Keep certain html tags in the generated output.
303func (conv *Converter) Keep(tags ...string) *Converter {
304	conv.mutex.Lock()
305	defer conv.mutex.Unlock()
306
307	for _, tag := range tags {
308		conv.keep[tag] = struct{}{}
309	}
310	return conv
311}
312
313// Remove certain html tags from the source.
314func (conv *Converter) Remove(tags ...string) *Converter {
315	conv.mutex.Lock()
316	defer conv.mutex.Unlock()
317	for _, tag := range tags {
318		conv.remove[tag] = struct{}{}
319	}
320	return conv
321}
322
// Plugin can be used to extend functionality beyond what
// is offered by commonmark. It receives the converter it is used on and
// returns the rules that Use then registers through AddRules.
type Plugin func(conv *Converter) []Rule
326
327// Use can be used to add additional functionality to the converter. It is
328// used when its not sufficient to use only rules for example in Plugins.
329func (conv *Converter) Use(plugins ...Plugin) *Converter {
330	for _, plugin := range plugins {
331		rules := plugin(conv)
332		conv.AddRules(rules...) // TODO: for better performance only use one lock for all plugins
333	}
334	return conv
335}
336
// Timeout is the timeout given to the package-level http client that
// ConvertURL uses. It defaults to 10 seconds.
//
// NOTE(review): netClient copies this value once when the package is
// initialized, so assigning a new Timeout later does not appear to affect
// the client — confirm before relying on it.
var Timeout = time.Second * 10
// netClient is the shared http client used by ConvertURL.
var netClient = &http.Client{
	Timeout: Timeout,
}
342
// DomainFromURL returns the host (`u.Host`) of the given url.
//
// Surrounding whitespace is ignored. If the input yields no host as-is, it is
// parsed a second time with "http://" prepended, so that inputs without a
// scheme (like "example.com/page") work too. An empty string is returned
// when no host can be determined.
func DomainFromURL(rawURL string) string {
	trimmed := strings.TrimSpace(rawURL)

	// First the input as given, then again with a scheme added.
	candidates := [...]string{trimmed, "http://" + trimmed}
	for _, candidate := range candidates {
		parsed, _ := url.Parse(candidate)
		if parsed == nil || parsed.Host == "" {
			continue
		}
		return parsed.Host
	}

	return ""
}
360
// multipleNewLinesRegex matches a run of two or more newline characters `\n`.
// The default after hook replaces every such run with exactly two newline
// characters, which reduces many newlines to at most 2.
var multipleNewLinesRegex = regexp.MustCompile(`[\n]{2,}`)
363
364// Convert returns the content from a goquery selection.
365// If you have a goquery document just pass in doc.Selection.
366func (conv *Converter) Convert(selec *goquery.Selection) string {
367	conv.mutex.RLock()
368	domain := conv.domain
369	options := conv.options
370	l := len(conv.rules)
371	if l == 0 {
372		log.Println("you have added no rules. either enable commonmark or add you own.")
373	}
374	before := conv.before
375	after := conv.after
376	conv.mutex.RUnlock()
377
378	// before hook
379	for _, hook := range before {
380		hook(selec)
381	}
382
383	res := conv.selecToMD(domain, selec, &options)
384	markdown := res.Markdown
385
386	if res.Header != "" {
387		markdown = res.Header + "\n\n" + markdown
388	}
389	if res.Footer != "" {
390		markdown += "\n\n" + res.Footer
391	}
392
393	// after hook
394	for _, hook := range after {
395		markdown = hook(markdown)
396	}
397
398	return markdown
399}
400
401// ConvertReader returns the content from a reader and returns a buffer.
402func (conv *Converter) ConvertReader(reader io.Reader) (bytes.Buffer, error) {
403	var buffer bytes.Buffer
404	doc, err := goquery.NewDocumentFromReader(reader)
405	if err != nil {
406		return buffer, err
407	}
408	buffer.WriteString(
409		conv.Convert(doc.Selection),
410	)
411
412	return buffer, nil
413}
414
415// ConvertResponse returns the content from a html response.
416func (conv *Converter) ConvertResponse(res *http.Response) (string, error) {
417	doc, err := goquery.NewDocumentFromResponse(res)
418	if err != nil {
419		return "", err
420	}
421	return conv.Convert(doc.Selection), nil
422}
423
424// ConvertString returns the content from a html string. If you
425// already have a goquery selection use `Convert`.
426func (conv *Converter) ConvertString(html string) (string, error) {
427	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
428	if err != nil {
429		return "", err
430	}
431	return conv.Convert(doc.Selection), nil
432}
433
434// ConvertBytes returns the content from a html byte array.
435func (conv *Converter) ConvertBytes(bytes []byte) ([]byte, error) {
436	res, err := conv.ConvertString(string(bytes))
437	if err != nil {
438		return nil, err
439	}
440	return []byte(res), nil
441}
442
443// ConvertURL returns the content from the page with that url.
444func (conv *Converter) ConvertURL(url string) (string, error) {
445	// not using goquery.NewDocument directly because of the timeout
446	resp, err := netClient.Get(url)
447	if err != nil {
448		return "", err
449	}
450
451	if resp.StatusCode < 200 || resp.StatusCode > 299 {
452		return "", fmt.Errorf("expected a status code in the 2xx range but got %d", resp.StatusCode)
453	}
454
455	doc, err := goquery.NewDocumentFromResponse(resp)
456	if err != nil {
457		return "", err
458	}
459	domain := DomainFromURL(url)
460	if conv.domain != domain {
461		log.Printf("expected '%s' as the domain but got '%s' \n", conv.domain, domain)
462	}
463	return conv.Convert(doc.Selection), nil
464}