1package htmlsanitizer
2
3import (
4 "encoding/base64"
5 "net/url"
6 "regexp"
7
8 "github.com/microcosm-cc/bluemonday"
9)
10
11type LibSanitizer struct {
12 policy *bluemonday.Policy
13}
14
15func NewLibSanitizer() LibSanitizer {
16 return LibSanitizer{policy: newPolicy()}
17}
18
19func (s LibSanitizer) SanitizeBytes(html []byte) []byte {
20 return s.policy.SanitizeBytes(html)
21}
22
23func newPolicy() *bluemonday.Policy {
24 p := bluemonday.NewPolicy()
25 linkURLPattern := regexp.MustCompile(`(?i)^(https?://|mailto:|tel:)`)
26 imageURLPattern := regexp.MustCompile(`(?i)^(https?://|cid:|data:image/)`)
27 dataImagePrefixPattern := regexp.MustCompile(`(?i)^image/(gif|jpe?g|png|webp);base64,`)
28 p.AllowElements(
29 "a", "b", "blockquote", "br", "code", "div", "em", "h1", "h2",
30 "i", "img", "li", "ol", "p", "pre", "span", "strong", "table",
31 "tbody", "td", "th", "thead", "tr", "u", "ul",
32 )
33 p.AllowAttrs("href").Matching(linkURLPattern).OnElements("a")
34 p.AllowAttrs("src").Matching(imageURLPattern).OnElements("img")
35 p.AllowAttrs("alt").OnElements("img")
36 p.AllowAttrs("cite").OnElements("blockquote")
37 p.RequireParseableURLs(true)
38 p.AllowURLSchemes("http", "https", "mailto", "tel")
39 p.AllowURLSchemeWithCustomPolicy("cid", func(u *url.URL) bool {
40 return u.Opaque != "" && u.RawQuery == "" && u.Fragment == ""
41 })
42 p.AllowURLSchemeWithCustomPolicy("data", func(u *url.URL) bool {
43 if u.RawQuery != "" || u.Fragment != "" {
44 return false
45 }
46 prefix := dataImagePrefixPattern.FindString(u.Opaque)
47 if prefix == "" {
48 return false
49 }
50 payload := u.Opaque[len(prefix):]
51 if _, err := base64.StdEncoding.DecodeString(payload); err == nil {
52 return true
53 }
54 _, err := base64.RawStdEncoding.DecodeString(payload)
55 return err == nil
56 })
57 return p
58}