1// Copyright (c) 2014, David Kitchen <david@buro9.com>
2//
3// All rights reserved.
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are met:
7//
8// * Redistributions of source code must retain the above copyright notice, this
9// list of conditions and the following disclaimer.
10//
11// * Redistributions in binary form must reproduce the above copyright notice,
12// this list of conditions and the following disclaimer in the documentation
13// and/or other materials provided with the distribution.
14//
15// * Neither the name of the organisation (Microcosm) nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30package bluemonday
31
32import (
33 "bytes"
34 "fmt"
35 "io"
36 "net/url"
37 "regexp"
38 "strconv"
39 "strings"
40
41 "golang.org/x/net/html"
42
43 "github.com/aymerick/douceur/parser"
44)
45
var (
	// dataAttribute matches any attribute name carrying the custom data
	// attribute prefix, e.g. "data-foo".
	dataAttribute = regexp.MustCompile("^data-.+")
	// dataAttributeXMLPrefix matches the reserved "xml" prefix, which is
	// not permitted directly after "data-" in a custom data attribute name.
	dataAttributeXMLPrefix = regexp.MustCompile("^xml.+")
	// dataAttributeInvalidChars matches characters (upper case ASCII and
	// semi-colons) that are disallowed in custom data attribute names.
	dataAttributeInvalidChars = regexp.MustCompile("[A-Z;]+")
	// cssUnicodeChar matches a CSS unicode escape sequence such as `\0041`,
	// optionally followed by the terminating space.
	cssUnicodeChar = regexp.MustCompile(`\\[0-9a-f]{1,6} ?`)
	// dataURIbase64Prefix matches the prefix of a base64-encoded data URI,
	// e.g. "data:image/png;base64,".
	dataURIbase64Prefix = regexp.MustCompile(`^data:[^,]*;base64,`)
)
53
54// Sanitize takes a string that contains a HTML fragment or document and applies
55// the given policy allowlist.
56//
57// It returns a HTML string that has been sanitized by the policy or an empty
58// string if an error has occurred (most likely as a consequence of extremely
59// malformed input)
60func (p *Policy) Sanitize(s string) string {
61 if strings.TrimSpace(s) == "" {
62 return s
63 }
64
65 return p.sanitizeWithBuff(strings.NewReader(s)).String()
66}
67
68// SanitizeBytes takes a []byte that contains a HTML fragment or document and applies
69// the given policy allowlist.
70//
71// It returns a []byte containing the HTML that has been sanitized by the policy
72// or an empty []byte if an error has occurred (most likely as a consequence of
73// extremely malformed input)
74func (p *Policy) SanitizeBytes(b []byte) []byte {
75 if len(bytes.TrimSpace(b)) == 0 {
76 return b
77 }
78
79 return p.sanitizeWithBuff(bytes.NewReader(b)).Bytes()
80}
81
// SanitizeReader takes an io.Reader that contains a HTML fragment or document
// and applies the given policy allowlist.
//
// It returns a bytes.Buffer containing the HTML that has been sanitized by the
// policy. Errors during sanitization will merely return an empty result.
func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
	// Thin wrapper: all work happens in sanitizeWithBuff.
	return p.sanitizeWithBuff(r)
}
90
// SanitizeReaderToWriter takes an io.Reader that contains a HTML fragment or document
// and applies the given policy allowlist and writes to the provided writer returning
// an error if there is one.
//
// Unlike the other Sanitize* entry points this streams output to w and
// surfaces any tokenizer or write error to the caller.
func (p *Policy) SanitizeReaderToWriter(r io.Reader, w io.Writer) error {
	return p.sanitize(r, w)
}
97
// Query represents a single part of the query string, a query param
type Query struct {
	Key      string // decoded parameter name
	Value    string // decoded parameter value; "" when HasValue is false
	HasValue bool   // whether an "=" was present, distinguishing "?a" from "?a="
}
104
105func parseQuery(query string) (values []Query, err error) {
106 // This is essentially a copy of parseQuery from
107 // https://golang.org/src/net/url/url.go but adjusted to build our values
108 // based on our type, which we need to preserve the ordering of the query
109 // string
110 for query != "" {
111 key := query
112 if i := strings.IndexAny(key, "&;"); i >= 0 {
113 key, query = key[:i], key[i+1:]
114 } else {
115 query = ""
116 }
117 if key == "" {
118 continue
119 }
120 value := ""
121 hasValue := false
122 if i := strings.Index(key, "="); i >= 0 {
123 key, value = key[:i], key[i+1:]
124 hasValue = true
125 }
126 key, err1 := url.QueryUnescape(key)
127 if err1 != nil {
128 if err == nil {
129 err = err1
130 }
131 continue
132 }
133 value, err1 = url.QueryUnescape(value)
134 if err1 != nil {
135 if err == nil {
136 err = err1
137 }
138 continue
139 }
140 values = append(values, Query{
141 Key: key,
142 Value: value,
143 HasValue: hasValue,
144 })
145 }
146 return values, err
147}
148
149func encodeQueries(queries []Query) string {
150 var buff bytes.Buffer
151 for i, query := range queries {
152 buff.WriteString(url.QueryEscape(query.Key))
153 if query.HasValue {
154 buff.WriteString("=")
155 buff.WriteString(url.QueryEscape(query.Value))
156 }
157 if i < len(queries)-1 {
158 buff.WriteString("&")
159 }
160 }
161 return buff.String()
162}
163
164func sanitizedURL(val string) (string, error) {
165 u, err := url.Parse(val)
166 if err != nil {
167 return "", err
168 }
169
170 // we use parseQuery but not u.Query to keep the order not change because
171 // url.Values is a map which has a random order.
172 queryValues, err := parseQuery(u.RawQuery)
173 if err != nil {
174 return "", err
175 }
176 // sanitize the url query params
177 for i, query := range queryValues {
178 queryValues[i].Key = html.EscapeString(query.Key)
179 }
180 u.RawQuery = encodeQueries(queryValues)
181 // u.String() will also sanitize host/scheme/user/pass
182 return u.String(), nil
183}
184
185// Performs the actual sanitization process.
186func (p *Policy) sanitizeWithBuff(r io.Reader) *bytes.Buffer {
187 var buff bytes.Buffer
188 if err := p.sanitize(r, &buff); err != nil {
189 return &bytes.Buffer{}
190 }
191 return &buff
192}
193
194type asStringWriter struct {
195 io.Writer
196}
197
198func (a *asStringWriter) WriteString(s string) (int, error) {
199 return a.Write([]byte(s))
200}
201
// sanitize tokenizes the HTML read from r and writes the policy-filtered
// result to w. It is the single implementation behind every public
// Sanitize* entry point. Returns nil on clean EOF, otherwise the first
// tokenizer or write error.
func (p *Policy) sanitize(r io.Reader, w io.Writer) error {
	// It is possible that the developer has created the policy via:
	// p := bluemonday.Policy{}
	// rather than:
	// p := bluemonday.NewPolicy()
	// If this is the case, and if they haven't yet triggered an action that
	// would initialize the maps, then we need to do that.
	p.init()

	// Prefer the writer's own WriteString; otherwise wrap it so the many
	// WriteString calls below avoid per-call conversions at the call site.
	buff, ok := w.(stringWriterWriter)
	if !ok {
		buff = &asStringWriter{w}
	}

	var (
		// skipElementContent is true while inside an element whose entire
		// content is being dropped (see setOfElementsToSkipContent).
		skipElementContent bool
		// skippingElementsCount tracks nesting depth of skipped elements so
		// skipElementContent is only cleared when the outermost one closes.
		skippingElementsCount int64
		// skipClosingTag is true when a disallowed start tag was dropped and
		// its matching end tag must be dropped too.
		skipClosingTag bool
		// closingTagToSkipStack holds names of end tags still to be dropped.
		closingTagToSkipStack []string
		// mostRecentlyStartedToken is the last opened element name; used to
		// decide how text inside script/style elements is treated.
		mostRecentlyStartedToken string
	)

	tokenizer := html.NewTokenizer(r)
	for {
		if tokenizer.Next() == html.ErrorToken {
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input means end of processing
				return nil
			}

			// Raw tokenizer error
			return err
		}

		token := tokenizer.Token()
		switch token.Type {
		case html.DoctypeToken:

			// DocType is not handled as there is no safe parsing mechanism
			// provided by golang.org/x/net/html for the content, and this can
			// be misused to insert HTML tags that are not then sanitized
			//
			// One might wish to recursively sanitize here using the same policy
			// but I will need to do some further testing before considering
			// this.

		case html.CommentToken:

			// Comments are ignored by default
			if p.allowComments {
				// But if allowed then write the comment out as-is
				buff.WriteString(token.String())
			}

		case html.StartTagToken:

			mostRecentlyStartedToken = normaliseElementName(token.Data)

			// script/style start tags are only emitted when the policy has
			// explicitly opted into unsafe output; `continue` here moves to
			// the next token, dropping this one.
			switch normaliseElementName(token.Data) {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				// No exact element policy; try the regex-based policies.
				aa, matched := p.matchRegex(token.Data)
				if !matched {
					// Element is not allowed at all; optionally start
					// skipping its entire content.
					if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
						skipElementContent = true
						skippingElementsCount++
					}
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
				aps = aa
			}
			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 {
				if !p.allowNoAttrs(token.Data) {
					// All attributes were stripped and the element is not
					// allowed bare: drop the tag and remember to drop its
					// closing tag as well.
					skipClosingTag = true
					closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
			}

			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.EndTagToken:

			if mostRecentlyStartedToken == normaliseElementName(token.Data) {
				mostRecentlyStartedToken = ""
			}

			switch normaliseElementName(token.Data) {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
				// Closing tag of an element whose start tag was dropped
				// above; drop it too and pop the stack.
				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
				if len(closingTagToSkipStack) == 0 {
					skipClosingTag = false
				}
				if p.addSpaces {
					if _, err := buff.WriteString(" "); err != nil {
						return err
					}
				}
				break
			}
			if _, ok := p.elsAndAttrs[token.Data]; !ok {
				match := false
				for regex := range p.elsMatchingAndAttrs {
					if regex.MatchString(token.Data) {
						// NOTE(review): this clears skipElementContent on any
						// regex-allowed end tag without consulting
						// skippingElementsCount — confirm intended behaviour
						// for nested skipped elements.
						skipElementContent = false
						match = true
						break
					}
				}
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok && !match {
					skippingElementsCount--
					if skippingElementsCount == 0 {
						skipElementContent = false
					}
				}
				if !match {
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
			}

			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.SelfClosingTagToken:

			switch normaliseElementName(token.Data) {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				aa, matched := p.matchRegex(token.Data)
				if !matched {
					if p.addSpaces && !matched {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
				aps = aa
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			// Self-closing tags have no closing tag to track; simply drop
			// the tag when it ends up bare and bare is not allowed.
			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
				if p.addSpaces {
					if _, err := buff.WriteString(" "); err != nil {
						return err
					}
				}
				break
			}
			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.TextToken:

			if !skipElementContent {
				switch mostRecentlyStartedToken {
				case `script`:
					// not encouraged, but if a policy allows JavaScript we
					// should not HTML escape it as that would break the output
					//
					// requires p.AllowUnsafe()
					if p.allowUnsafe {
						if _, err := buff.WriteString(token.Data); err != nil {
							return err
						}
					}
				case "style":
					// not encouraged, but if a policy allows CSS styles we
					// should not HTML escape it as that would break the output
					//
					// requires p.AllowUnsafe()
					if p.allowUnsafe {
						if _, err := buff.WriteString(token.Data); err != nil {
							return err
						}
					}
				default:
					// HTML escape the text
					if _, err := buff.WriteString(token.String()); err != nil {
						return err
					}
				}
			}

		default:
			// A token that didn't exist in the html package when we wrote this
			return fmt.Errorf("unknown token: %v", token)
		}
	}
}
455
// sanitizeAttrs takes a set of element attribute policies and the global
// attribute policies and applies them to the []html.Attribute returning a set
// of html.Attributes that match the policies.
//
// In addition to filtering by policy it: sanitizes style attributes, validates
// and rewrites URL-bearing attributes, enforces rel="nofollow"/"noreferrer"/
// "noopener" and target="_blank" rules, and applies crossorigin/sandbox
// requirements.
func (p *Policy) sanitizeAttrs(
	elementName string,
	attrs []html.Attribute,
	aps map[string][]attrPolicy,
) []html.Attribute {

	if len(attrs) == 0 {
		return attrs
	}

	// Work out whether any style policy (element-specific, global, or
	// regex-matched) could apply, so "style" attributes are sanitized
	// rather than dropped outright.
	hasStylePolicies := false
	sps, elementHasStylePolicies := p.elsAndStyles[elementName]
	if len(p.globalStyles) > 0 || (elementHasStylePolicies && len(sps) > 0) {
		hasStylePolicies = true
	}
	// no specific element policy found, look for a pattern match
	if !hasStylePolicies {
		for k, v := range p.elsMatchingAndStyles {
			if k.MatchString(elementName) {
				if len(v) > 0 {
					hasStylePolicies = true
					break
				}
			}
		}
	}

	// Builds a new attribute slice based on the whether the attribute has been
	// allowed explicitly or globally.
	cleanAttrs := []html.Attribute{}
attrsLoop:
	for _, htmlAttr := range attrs {
		if p.allowDataAttributes {
			// If we see a data attribute, let it through.
			if isDataAttribute(htmlAttr.Key) {
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}
		// Is this a "style" attribute, and if so, do we need to sanitize it?
		if htmlAttr.Key == "style" && hasStylePolicies {
			htmlAttr = p.sanitizeStyles(htmlAttr, elementName)
			if htmlAttr.Val == "" {
				// We've sanitized away any and all styles; don't bother to
				// output the style attribute (even if it's allowed)
				continue
			} else {
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}

		// Is there an element specific attribute policy that applies?
		if apl, ok := aps[htmlAttr.Key]; ok {
			for _, ap := range apl {
				if ap.regexp != nil {
					if ap.regexp.MatchString(htmlAttr.Val) {
						cleanAttrs = append(cleanAttrs, htmlAttr)
						continue attrsLoop
					}
				} else {
					// A policy without a regexp allows any value.
					cleanAttrs = append(cleanAttrs, htmlAttr)
					continue attrsLoop
				}
			}
		}

		// Is there a global attribute policy that applies?
		if apl, ok := p.globalAttrs[htmlAttr.Key]; ok {
			for _, ap := range apl {
				if ap.regexp != nil {
					if ap.regexp.MatchString(htmlAttr.Val) {
						cleanAttrs = append(cleanAttrs, htmlAttr)
						continue attrsLoop
					}
				} else {
					cleanAttrs = append(cleanAttrs, htmlAttr)
					continue attrsLoop
				}
			}
		}
	}

	if len(cleanAttrs) == 0 {
		// If nothing was allowed, let's get out of here
		return cleanAttrs
	}
	// cleanAttrs now contains the attributes that are permitted

	if linkable(elementName) {
		if p.requireParseableURLs {
			// Ensure URLs are parseable:
			// - a.href
			// - area.href
			// - link.href
			// - blockquote.cite
			// - q.cite
			// - img.src
			// - script.src
			tmpAttrs := []html.Attribute{}
			for _, htmlAttr := range cleanAttrs {
				switch elementName {
				case "a", "area", "base", "link":
					if htmlAttr.Key == "href" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						// An invalid URL drops the attribute entirely;
						// `break` exits the switch, not the loop.
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "blockquote", "del", "ins", "q":
					if htmlAttr.Key == "cite" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "audio", "embed", "iframe", "img", "script", "source", "track", "video":
					if htmlAttr.Key == "src" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							if p.srcRewriter != nil {
								parsedURL, err := url.Parse(u)
								if err != nil {
									// NOTE(review): printing to stdout from
									// library code is questionable; consider
									// propagating the error or dropping the
									// attribute instead.
									fmt.Println(err)
								}
								p.srcRewriter(parsedURL)
								u = parsedURL.String()
							}
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				default:
					tmpAttrs = append(tmpAttrs, htmlAttr)
				}
			}
			cleanAttrs = tmpAttrs
		}

		if (p.requireNoFollow ||
			p.requireNoFollowFullyQualifiedLinks ||
			p.requireNoReferrer ||
			p.requireNoReferrerFullyQualifiedLinks ||
			p.addTargetBlankToFullyQualifiedLinks) &&
			len(cleanAttrs) > 0 {

			// Add rel="nofollow" if a "href" exists
			switch elementName {
			case "a", "area", "base", "link":
				var hrefFound bool
				var externalLink bool
				for _, htmlAttr := range cleanAttrs {
					if htmlAttr.Key == "href" {
						hrefFound = true

						u, err := url.Parse(htmlAttr.Val)
						if err != nil {
							continue
						}
						// A non-empty host marks the link as fully
						// qualified (external).
						if u.Host != "" {
							externalLink = true
						}

						continue
					}
				}

				if hrefFound {
					var (
						noFollowFound    bool
						noReferrerFound  bool
						targetBlankFound bool
					)

					addNoFollow := (p.requireNoFollow ||
						externalLink && p.requireNoFollowFullyQualifiedLinks)

					addNoReferrer := (p.requireNoReferrer ||
						externalLink && p.requireNoReferrerFullyQualifiedLinks)

					addTargetBlank := (externalLink &&
						p.addTargetBlankToFullyQualifiedLinks)

					// First pass: amend existing rel/target attributes.
					tmpAttrs := []html.Attribute{}
					for _, htmlAttr := range cleanAttrs {

						var appended bool
						if htmlAttr.Key == "rel" && (addNoFollow || addNoReferrer) {

							if addNoFollow && !strings.Contains(htmlAttr.Val, "nofollow") {
								htmlAttr.Val += " nofollow"
							}
							if addNoReferrer && !strings.Contains(htmlAttr.Val, "noreferrer") {
								htmlAttr.Val += " noreferrer"
							}
							noFollowFound = addNoFollow
							noReferrerFound = addNoReferrer
							tmpAttrs = append(tmpAttrs, htmlAttr)
							appended = true
						}

						if elementName == "a" && htmlAttr.Key == "target" {
							if htmlAttr.Val == "_blank" {
								targetBlankFound = true
							}
							if addTargetBlank && !targetBlankFound {
								htmlAttr.Val = "_blank"
								targetBlankFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if !appended {
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
					}
					if noFollowFound || noReferrerFound || targetBlankFound {
						cleanAttrs = tmpAttrs
					}

					// Second pass: add rel/target attributes that were
					// required but not present at all.
					if (addNoFollow && !noFollowFound) || (addNoReferrer && !noReferrerFound) {
						rel := html.Attribute{}
						rel.Key = "rel"
						if addNoFollow {
							rel.Val = "nofollow"
						}
						if addNoReferrer {
							if rel.Val != "" {
								rel.Val += " "
							}
							rel.Val += "noreferrer"
						}
						cleanAttrs = append(cleanAttrs, rel)
					}

					if elementName == "a" && addTargetBlank && !targetBlankFound {
						rel := html.Attribute{}
						rel.Key = "target"
						rel.Val = "_blank"
						targetBlankFound = true
						cleanAttrs = append(cleanAttrs, rel)
					}

					if targetBlankFound {
						// target="_blank" has a security risk that allows the
						// opened window/tab to issue JavaScript calls against
						// window.opener, which in effect allow the destination
						// of the link to control the source:
						// https://dev.to/ben/the-targetblank-vulnerability-by-example
						//
						// To mitigate this risk, we need to add a specific rel
						// attribute if it is not already present.
						// rel="noopener"
						//
						// Unfortunately this is processing the rel twice (we
						// already looked at it earlier ^^) as we cannot be sure
						// of the ordering of the href and rel, and whether we
						// have fully satisfied that we need to do this. This
						// double processing only happens *if* target="_blank"
						// is true.
						var noOpenerAdded bool
						tmpAttrs := []html.Attribute{}
						for _, htmlAttr := range cleanAttrs {
							var appended bool
							if htmlAttr.Key == "rel" {
								if strings.Contains(htmlAttr.Val, "noopener") {
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								} else {
									htmlAttr.Val += " noopener"
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								}

								appended = true
							}
							if !appended {
								tmpAttrs = append(tmpAttrs, htmlAttr)
							}
						}
						if noOpenerAdded {
							cleanAttrs = tmpAttrs
						} else {
							// rel attr was not found, or else noopener would
							// have been added already
							rel := html.Attribute{}
							rel.Key = "rel"
							rel.Val = "noopener"
							cleanAttrs = append(cleanAttrs, rel)
						}

					}
				}
			default:
			}
		}
	}

	// Force crossorigin="anonymous" on resource-loading elements when the
	// policy requires it, overwriting any existing value.
	if p.requireCrossOriginAnonymous && len(cleanAttrs) > 0 {
		switch elementName {
		case "audio", "img", "link", "script", "video":
			var crossOriginFound bool
			for i, htmlAttr := range cleanAttrs {
				if htmlAttr.Key == "crossorigin" {
					crossOriginFound = true
					cleanAttrs[i].Val = "anonymous"
				}
			}

			if !crossOriginFound {
				crossOrigin := html.Attribute{}
				crossOrigin.Key = "crossorigin"
				crossOrigin.Val = "anonymous"
				cleanAttrs = append(cleanAttrs, crossOrigin)
			}
		}
	}

	// Enforce a sandbox attribute on iframes, keeping only the sandbox
	// values the policy allows (deduplicated, original order preserved).
	if p.requireSandboxOnIFrame != nil && elementName == "iframe" {
		var sandboxFound bool
		for i, htmlAttr := range cleanAttrs {
			if htmlAttr.Key == "sandbox" {
				sandboxFound = true
				var cleanVals []string
				cleanValsSet := make(map[string]bool)
				for _, val := range strings.Fields(htmlAttr.Val) {
					if p.requireSandboxOnIFrame[val] {
						if !cleanValsSet[val] {
							cleanVals = append(cleanVals, val)
							cleanValsSet[val] = true
						}
					}
				}
				cleanAttrs[i].Val = strings.Join(cleanVals, " ")
			}
		}

		if !sandboxFound {
			sandbox := html.Attribute{}
			sandbox.Key = "sandbox"
			sandbox.Val = ""
			cleanAttrs = append(cleanAttrs, sandbox)
		}
	}

	return cleanAttrs
}
812
813func (p *Policy) sanitizeStyles(attr html.Attribute, elementName string) html.Attribute {
814 sps := p.elsAndStyles[elementName]
815 if len(sps) == 0 {
816 sps = map[string][]stylePolicy{}
817 // check for any matching elements, if we don't already have a policy found
818 // if multiple matches are found they will be overwritten, it's best
819 // to not have overlapping matchers
820 for regex, policies := range p.elsMatchingAndStyles {
821 if regex.MatchString(elementName) {
822 for k, v := range policies {
823 sps[k] = append(sps[k], v...)
824 }
825 }
826 }
827 }
828
829 //Add semi-colon to end to fix parsing issue
830 attr.Val = strings.TrimRight(attr.Val, " ")
831 if len(attr.Val) > 0 && attr.Val[len(attr.Val)-1] != ';' {
832 attr.Val = attr.Val + ";"
833 }
834 decs, err := parser.ParseDeclarations(attr.Val)
835 if err != nil {
836 attr.Val = ""
837 return attr
838 }
839 clean := []string{}
840 prefixes := []string{"-webkit-", "-moz-", "-ms-", "-o-", "mso-", "-xv-", "-atsc-", "-wap-", "-khtml-", "prince-", "-ah-", "-hp-", "-ro-", "-rim-", "-tc-"}
841
842decLoop:
843 for _, dec := range decs {
844 tempProperty := strings.ToLower(dec.Property)
845 tempValue := removeUnicode(strings.ToLower(dec.Value))
846 for _, i := range prefixes {
847 tempProperty = strings.TrimPrefix(tempProperty, i)
848 }
849 if spl, ok := sps[tempProperty]; ok {
850 for _, sp := range spl {
851 if sp.handler != nil {
852 if sp.handler(tempValue) {
853 clean = append(clean, dec.Property+": "+dec.Value)
854 continue decLoop
855 }
856 } else if len(sp.enum) > 0 {
857 if stringInSlice(tempValue, sp.enum) {
858 clean = append(clean, dec.Property+": "+dec.Value)
859 continue decLoop
860 }
861 } else if sp.regexp != nil {
862 if sp.regexp.MatchString(tempValue) {
863 clean = append(clean, dec.Property+": "+dec.Value)
864 continue decLoop
865 }
866 }
867 }
868 }
869 if spl, ok := p.globalStyles[tempProperty]; ok {
870 for _, sp := range spl {
871 if sp.handler != nil {
872 if sp.handler(tempValue) {
873 clean = append(clean, dec.Property+": "+dec.Value)
874 continue decLoop
875 }
876 } else if len(sp.enum) > 0 {
877 if stringInSlice(tempValue, sp.enum) {
878 clean = append(clean, dec.Property+": "+dec.Value)
879 continue decLoop
880 }
881 } else if sp.regexp != nil {
882 if sp.regexp.MatchString(tempValue) {
883 clean = append(clean, dec.Property+": "+dec.Value)
884 continue decLoop
885 }
886 }
887 }
888 }
889 }
890 if len(clean) > 0 {
891 attr.Val = strings.Join(clean, "; ")
892 } else {
893 attr.Val = ""
894 }
895 return attr
896}
897
898func (p *Policy) allowNoAttrs(elementName string) bool {
899 _, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
900 if !ok {
901 for _, r := range p.setOfElementsMatchingAllowedWithoutAttrs {
902 if r.MatchString(elementName) {
903 ok = true
904 break
905 }
906 }
907 }
908 return ok
909}
910
// validURL checks rawurl against the policy's URL rules, returning the
// (possibly normalised) URL and whether it is acceptable. When
// requireParseableURLs is false every URL is accepted verbatim.
func (p *Policy) validURL(rawurl string) (string, bool) {
	if p.requireParseableURLs {
		// URLs are valid if when space is trimmed the URL is valid
		rawurl = strings.TrimSpace(rawurl)

		// URLs cannot contain whitespace, unless it is a data-uri
		if strings.Contains(rawurl, " ") ||
			strings.Contains(rawurl, "\t") ||
			strings.Contains(rawurl, "\n") {
			if !strings.HasPrefix(rawurl, `data:`) {
				return "", false
			}

			// Remove \r and \n from base64 encoded data to pass url.Parse.
			matched := dataURIbase64Prefix.FindString(rawurl)
			if matched != "" {
				rawurl = matched + strings.Replace(
					strings.Replace(
						rawurl[len(matched):],
						"\r",
						"",
						-1,
					),
					"\n",
					"",
					-1,
				)
			}
		}

		// URLs are valid if they parse
		u, err := url.Parse(rawurl)
		if err != nil {
			return "", false
		}

		if u.Scheme != "" {
			// Exact scheme allow-list first; fall back to the regex-based
			// scheme allow-list.
			urlPolicies, ok := p.allowURLSchemes[u.Scheme]
			if !ok {
				for _, r := range p.allowURLSchemeRegexps {
					if r.MatchString(u.Scheme) {
						return u.String(), true
					}
				}

				return "", false
			}

			// A scheme registered with no policies means "allow every URL
			// of this scheme".
			if len(urlPolicies) == 0 {
				return u.String(), true
			}

			// Otherwise at least one per-URL policy callback must accept it.
			for _, urlPolicy := range urlPolicies {
				if urlPolicy(u) {
					return u.String(), true
				}
			}

			return "", false
		}

		// Scheme-less URLs are relative; only allowed when opted in.
		if p.allowRelativeURLs {
			if u.String() != "" {
				return u.String(), true
			}
		}

		return "", false
	}

	return rawurl, true
}
983
// linkable reports whether the named element may carry a URL-bearing
// attribute that the policy must validate: href (a, area, base, link),
// cite (blockquote, del, ins, q) or src (audio, embed, iframe, img, input,
// script, track, video).
func linkable(elementName string) bool {
	switch elementName {
	case "a", "area", "base", "link", // elements that allow .href
		"blockquote", "del", "ins", "q", // elements that allow .cite
		"audio", "embed", "iframe", "img", "input", "script", "track", "video": // elements that allow .src
		return true
	}
	return false
}
999
// stringInSlice returns true if needle exists in haystack; the comparison is
// case-insensitive (strings.EqualFold, i.e. Unicode case folding).
func stringInSlice(needle string, haystack []string) bool {
	for i := range haystack {
		if strings.EqualFold(haystack[i], needle) {
			return true
		}
	}
	return false
}
1009
1010func isDataAttribute(val string) bool {
1011 if !dataAttribute.MatchString(val) {
1012 return false
1013 }
1014 rest := strings.Split(val, "data-")
1015 if len(rest) == 1 {
1016 return false
1017 }
1018 // data-xml* is invalid.
1019 if dataAttributeXMLPrefix.MatchString(rest[1]) {
1020 return false
1021 }
1022 // no uppercase or semi-colons allowed.
1023 if dataAttributeInvalidChars.MatchString(rest[1]) {
1024 return false
1025 }
1026 return true
1027}
1028
// removeUnicode substitutes CSS unicode escape sequences (e.g. `\0041`,
// optionally terminated by a space) in value with their literal characters,
// so style values can be matched against policies without escape-based
// evasion. If any escape fails to decode the entire value is rejected by
// returning "".
func removeUnicode(value string) string {
	substitutedValue := value
	currentLoc := cssUnicodeChar.FindStringIndex(substitutedValue)
	for currentLoc != nil {

		// The hex digits sit between the backslash and the optional
		// trailing space matched by cssUnicodeChar.
		character := substitutedValue[currentLoc[0]+1 : currentLoc[1]]
		character = strings.TrimSpace(character)
		if len(character) < 4 {
			// Left-pad to the exactly-4 hex digits that \uXXXX requires.
			character = strings.Repeat("0", 4-len(character)) + character
		} else {
			// More than 4 digits only fits \uXXXX if the extras are
			// leading zeros; otherwise the escape is discarded.
			for len(character) > 4 {
				if character[0] != '0' {
					character = ""
					break
				} else {
					character = character[1:]
				}
			}
		}
		character = "\\u" + character
		// Decode via Unquote of a quoted \uXXXX escape.
		translatedChar, err := strconv.Unquote(`"` + character + `"`)
		translatedChar = strings.TrimSpace(translatedChar)
		if err != nil {
			return ""
		}
		// Splice the decoded character in, then look for the next escape.
		substitutedValue = substitutedValue[0:currentLoc[0]] + translatedChar + substitutedValue[currentLoc[1]:]
		currentLoc = cssUnicodeChar.FindStringIndex(substitutedValue)
	}
	return substitutedValue
}
1059
1060func (p *Policy) matchRegex(elementName string) (map[string][]attrPolicy, bool) {
1061 aps := make(map[string][]attrPolicy, 0)
1062 matched := false
1063 for regex, attrs := range p.elsMatchingAndAttrs {
1064 if regex.MatchString(elementName) {
1065 matched = true
1066 for k, v := range attrs {
1067 aps[k] = append(aps[k], v...)
1068 }
1069 }
1070 }
1071 return aps, matched
1072}
1073
// normaliseElementName takes a HTML element like <script> which is user input
// and returns a lower case version of it that is immune to UTF-8 to ASCII
// conversion tricks (like the use of upper case cyrillic i scrİpt which a
// strings.ToLower would convert to script). Instead this func will preserve
// all non-ASCII as their escaped equivalent, i.e. \u0130 which reveals the
// characters when lower cased
func normaliseElementName(str string) string {
	// QuoteToASCII escapes every non-ASCII rune and wraps the result in one
	// leading and one trailing double quote.
	quoted := strings.ToLower(strconv.QuoteToASCII(str))
	// Slice off exactly the surrounding quote marks.
	return quoted[1 : len(quoted)-1]
}
1092
// stringWriterWriter is the preferred sink for sanitized output: an io.Writer
// that can also accept strings directly, avoiding a []byte conversion per
// write. Writers lacking WriteString are wrapped in asStringWriter.
type stringWriterWriter interface {
	io.Writer
	io.StringWriter
}