lib_sanitizer.go

 1package htmlsanitizer
 2
 3import (
 4	"encoding/base64"
 5	"net/url"
 6	"regexp"
 7
 8	"github.com/microcosm-cc/bluemonday"
 9)
10
// LibSanitizer sanitizes HTML by delegating to a bluemonday policy.
// Values are expected to come from NewLibSanitizer, which fills in the policy.
type LibSanitizer struct {
	// policy is the bluemonday policy applied by SanitizeBytes.
	policy *bluemonday.Policy
}
14
15func NewLibSanitizer() LibSanitizer {
16	return LibSanitizer{policy: newPolicy()}
17}
18
19func (s LibSanitizer) SanitizeBytes(html []byte) []byte {
20	return s.policy.SanitizeBytes(html)
21}
22
23func newPolicy() *bluemonday.Policy {
24	p := bluemonday.NewPolicy()
25	linkURLPattern := regexp.MustCompile(`(?i)^(https?://|mailto:|tel:)`)
26	imageURLPattern := regexp.MustCompile(`(?i)^(https?://|cid:|data:image/)`)
27	dataImagePrefixPattern := regexp.MustCompile(`(?i)^image/(gif|jpe?g|png|webp);base64,`)
28	p.AllowElements(
29		"a", "b", "blockquote", "br", "code", "div", "em", "h1", "h2",
30		"i", "img", "li", "ol", "p", "pre", "span", "strong", "table",
31		"tbody", "td", "th", "thead", "tr", "u", "ul",
32	)
33	p.AllowAttrs("href").Matching(linkURLPattern).OnElements("a")
34	p.AllowAttrs("src").Matching(imageURLPattern).OnElements("img")
35	p.AllowAttrs("alt").OnElements("img")
36	p.AllowAttrs("cite").OnElements("blockquote")
37	p.RequireParseableURLs(true)
38	p.AllowURLSchemes("http", "https", "mailto", "tel")
39	p.AllowURLSchemeWithCustomPolicy("cid", func(u *url.URL) bool {
40		return u.Opaque != "" && u.RawQuery == "" && u.Fragment == ""
41	})
42	p.AllowURLSchemeWithCustomPolicy("data", func(u *url.URL) bool {
43		if u.RawQuery != "" || u.Fragment != "" {
44			return false
45		}
46		prefix := dataImagePrefixPattern.FindString(u.Opaque)
47		if prefix == "" {
48			return false
49		}
50		payload := u.Opaque[len(prefix):]
51		if _, err := base64.StdEncoding.DecodeString(payload); err == nil {
52			return true
53		}
54		_, err := base64.RawStdEncoding.DecodeString(payload)
55		return err == nil
56	})
57	return p
58}