markdown.go

  1package md
  2
  3import (
  4	"bytes"
  5	"log"
  6	"net/url"
  7	"regexp"
  8	"strings"
  9
 10	"github.com/PuerkitoBio/goquery"
 11	"golang.org/x/net/html"
 12)
 13
 14var (
 15	ruleDefault = func(content string, selec *goquery.Selection, opt *Options) *string {
 16		return &content
 17	}
 18	ruleKeep = func(content string, selec *goquery.Selection, opt *Options) *string {
 19		element := selec.Get(0)
 20
 21		var buf bytes.Buffer
 22		err := html.Render(&buf, element)
 23		if err != nil {
 24			log.Println("[JohannesKaufmann/html-to-markdown] ruleKeep: error while rendering the element to html:", err)
 25			return String("")
 26		}
 27
 28		return String(buf.String())
 29	}
 30)
 31
 32var inlineElements = []string{ // -> https://developer.mozilla.org/de/docs/Web/HTML/Inline_elemente
 33	"b", "big", "i", "small", "tt",
 34	"abbr", "acronym", "cite", "code", "dfn", "em", "kbd", "strong", "samp", "var",
 35	"a", "bdo", "br", "img", "map", "object", "q", "script", "span", "sub", "sup",
 36	"button", "input", "label", "select", "textarea",
 37}
 38
 39// IsInlineElement can be used to check wether a node name (goquery.Nodename) is
 40// an html inline element and not a block element. Used in the rule for the
 41// p tag to check wether the text is inside a block element.
 42func IsInlineElement(e string) bool {
 43	for _, element := range inlineElements {
 44		if element == e {
 45			return true
 46		}
 47	}
 48	return false
 49}
 50
 51// String is a helper function to return a pointer.
 52func String(text string) *string {
 53	return &text
 54}
 55
// Options customizes the generated markdown. You can change things like
// the character that is used for strong text.
type Options struct {
	// HeadingStyle selects the heading syntax: "setext" or "atx".
	// default: "atx"
	HeadingStyle string

	// HorizontalRule is the text used for a thematic break. Any thematic
	// break can be used.
	// default: "* * *"
	HorizontalRule string

	// BulletListMarker is the marker for unordered list items:
	// "-", "+", or "*".
	// default: "-"
	BulletListMarker string

	// CodeBlockStyle selects how code blocks are written:
	// "indented" or "fenced".
	// default: "indented"
	CodeBlockStyle string

	// Fence is the fence used for code blocks: ``` or ~~~
	// default: ```
	Fence string

	// EmDelimiter is the delimiter around emphasized text: _ or *
	// default: _
	EmDelimiter string

	// StrongDelimiter is the delimiter around strong text: ** or __
	// default: **
	StrongDelimiter string

	// LinkStyle selects how links are written: inlined or referenced.
	// default: inlined
	LinkStyle string

	// LinkReferenceStyle selects the form of link references:
	// full, collapsed, or shortcut.
	// default: full
	LinkReferenceStyle string

	// EscapeMode selects the escaping behavior: basic or disabled.
	// default: basic
	EscapeMode string

	// domain is unexported, so it cannot be set from outside the package.
	// NOTE(review): presumably this is the domain that is handed to
	// GetAbsoluteURL — confirm where it is assigned.
	domain string

	// GetAbsoluteURL parses the `rawURL` and adds the `domain` to convert relative (/page.html)
	// urls to absolute urls (http://domain.com/page.html).
	//
	// The default is `DefaultGetAbsoluteURL`, unless you override it. That can also
	// be useful if you want to proxy the images.
	GetAbsoluteURL func(selec *goquery.Selection, rawURL string, domain string) string

	// GetCodeBlockLanguage identifies the language for syntax highlighting
	// of a code block. The default is `DefaultGetCodeBlockLanguage`, which
	// only gets the attribute x from the selection.
	//
	// You can override it if you want more results, for example by using
	// lexers.Analyse(content) from github.com/alecthomas/chroma
	//
	// TODO: implement. The field below is commented out, so this option
	// does not exist yet.
	// GetCodeBlockLanguage func(s *goquery.Selection, content string) string
}
117
118// DefaultGetAbsoluteURL is the default function and can be overridden through `GetAbsoluteURL` in the options.
119func DefaultGetAbsoluteURL(selec *goquery.Selection, rawURL string, domain string) string {
120	if domain == "" {
121		return rawURL
122	}
123
124	u, err := url.Parse(rawURL)
125	if err != nil {
126		// we can't do anything with this url because it is invalid
127		return rawURL
128	}
129
130	if u.Scheme == "data" {
131		// this is a data uri (for example an inline base64 image)
132		return rawURL
133	}
134
135	if u.Scheme == "" {
136		u.Scheme = "http"
137	}
138	if u.Host == "" {
139		u.Host = domain
140	}
141
142	return u.String()
143}
144
// AdvancedResult is used for example for links. If you use LinkStyle:referenced
// the link href is placed at the bottom of the generated markdown (Footer).
type AdvancedResult struct {
	// Header is collected separately from the Markdown of the element.
	// NOTE(review): presumably placed at the top of the generated
	// markdown, mirroring Footer — confirm in the caller.
	Header string
	// Markdown is the markdown for the element itself.
	Markdown string
	// Footer is collected separately and placed at the bottom of the
	// generated markdown.
	Footer string
}
152
// Rule to convert certain html tags to markdown.
//  md.Rule{
//    Filter: []string{"del", "s", "strike"},
//    Replacement: func(content string, selec *goquery.Selection, opt *md.Options) *string {
//      // You need to return a pointer to a string (md.String is just a helper function).
//      // If you return nil the next function for that html element
//      // will be picked. For example you could only convert an element
//      // if it has a certain class name and fallback if not.
//      return md.String("~" + content + "~")
//    },
//  }
type Rule struct {
	// Filter lists the html tag names the rule applies to.
	Filter []string
	// Replacement receives the converted content of the element, the
	// element itself and the options. It returns the markdown for the
	// element, or nil to let the next function for that element be picked.
	Replacement func(content string, selec *goquery.Selection, options *Options) *string
	// AdvancedReplacement is the variant that returns an AdvancedResult
	// (Header, Markdown and Footer) together with a skip flag.
	// NOTE(review): skip presumably plays the role of the nil return of
	// Replacement — confirm how the converter combines the two fields.
	AdvancedReplacement func(content string, selec *goquery.Selection, options *Options) (res AdvancedResult, skip bool)
}
169
// Regular expressions for whitespace handling, compiled once at package level.
var (
	// leadingNewlinesR matches the run of newlines at the very start of the text.
	leadingNewlinesR = regexp.MustCompile(`^\n+`)
	// trailingNewlinesR matches the run of newlines at the very end of the text.
	trailingNewlinesR = regexp.MustCompile(`\n+$`)

	// newlinesR matches every run of one or more newlines.
	newlinesR = regexp.MustCompile(`\n+`)
	// tabR matches every run of one or more tabs.
	tabR = regexp.MustCompile(`\t+`)
	// indentR matches every single newline.
	indentR = regexp.MustCompile(`(?m)\n`)
)
176
177func (conv *Converter) selecToMD(domain string, selec *goquery.Selection, opt *Options) AdvancedResult {
178	var result AdvancedResult
179
180	var builder strings.Builder
181	selec.Contents().Each(func(i int, s *goquery.Selection) {
182		name := goquery.NodeName(s)
183		rules := conv.getRuleFuncs(name)
184
185		for i := len(rules) - 1; i >= 0; i-- {
186			rule := rules[i]
187
188			content := conv.selecToMD(domain, s, opt)
189			if content.Header != "" {
190				result.Header += content.Header
191			}
192			if content.Footer != "" {
193				result.Footer += content.Footer
194			}
195
196			res, skip := rule(content.Markdown, s, opt)
197			if res.Header != "" {
198				result.Header += res.Header + "\n"
199			}
200			if res.Footer != "" {
201				result.Footer += res.Footer + "\n"
202			}
203
204			if !skip {
205				builder.WriteString(res.Markdown)
206				return
207			}
208		}
209	})
210	result.Markdown = builder.String()
211	return result
212}