sanitize.go

// Copyright (c) 2014, David Kitchen <david@buro9.com>
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice, this
//   list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// * Neither the name of the organisation (Microcosm) nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

package bluemonday

import (
	"bytes"
	"fmt"
	"io"
	"net/url"
	"regexp"
	"strconv"
	"strings"

	"golang.org/x/net/html"

	"github.com/aymerick/douceur/parser"
)

var (
	dataAttribute             = regexp.MustCompile("^data-.+")
	dataAttributeXMLPrefix    = regexp.MustCompile("^xml.+")
	dataAttributeInvalidChars = regexp.MustCompile("[A-Z;]+")
	cssUnicodeChar            = regexp.MustCompile(`\\[0-9a-f]{1,6} ?`)
	dataURIbase64Prefix       = regexp.MustCompile(`^data:[^,]*;base64,`)
)

// Sanitize takes a string that contains an HTML fragment or document and
// applies the given policy allowlist.
//
// It returns an HTML string that has been sanitized by the policy, or an
// empty string if an error occurred (most likely as a consequence of
// extremely malformed input).
func (p *Policy) Sanitize(s string) string {
	if strings.TrimSpace(s) == "" {
		return s
	}

	return p.sanitizeWithBuff(strings.NewReader(s)).String()
}
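// A minimal usage sketch (illustrative only; UGCPolicy is defined elsewhere
// in this package, and the exact output depends on the policy in use):
//
//	p := bluemonday.UGCPolicy()
//	out := p.Sanitize(`<a onblur="alert(secret)" href="http://www.google.com">Google</a>`)
//	// out: `<a href="http://www.google.com" rel="nofollow">Google</a>`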

// SanitizeBytes takes a []byte that contains an HTML fragment or document and
// applies the given policy allowlist.
//
// It returns a []byte containing the HTML that has been sanitized by the
// policy, or an empty []byte if an error occurred (most likely as a
// consequence of extremely malformed input).
func (p *Policy) SanitizeBytes(b []byte) []byte {
	if len(bytes.TrimSpace(b)) == 0 {
		return b
	}

	return p.sanitizeWithBuff(bytes.NewReader(b)).Bytes()
}

// SanitizeReader takes an io.Reader that contains an HTML fragment or document
// and applies the given policy allowlist.
//
// It returns a bytes.Buffer containing the HTML that has been sanitized by the
// policy. Errors during sanitization merely yield an empty result.
func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
	return p.sanitizeWithBuff(r)
}

// SanitizeReaderToWriter takes an io.Reader that contains an HTML fragment or
// document, applies the given policy allowlist, and writes the sanitized
// output to the provided io.Writer, returning an error if one occurs.
func (p *Policy) SanitizeReaderToWriter(r io.Reader, w io.Writer) error {
	return p.sanitize(r, w)
}
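// A minimal streaming sketch (illustrative; the file names, os/log usage and
// policy choice are assumptions, not part of this file):
//
//	in, _ := os.Open("input.html")
//	defer in.Close()
//	out, _ := os.Create("output.html")
//	defer out.Close()
//	if err := bluemonday.UGCPolicy().SanitizeReaderToWriter(in, out); err != nil {
//		log.Fatal(err)
//	}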

// Query represents a single query parameter: one key/value pair from a URL
// query string.
type Query struct {
	Key      string
	Value    string
	HasValue bool
}

func parseQuery(query string) (values []Query, err error) {
	// This is essentially a copy of parseQuery from
	// https://golang.org/src/net/url/url.go but adjusted to build a slice of
	// our Query type, which we need in order to preserve the ordering of the
	// query string
	for query != "" {
		key := query
		if i := strings.IndexAny(key, "&;"); i >= 0 {
			key, query = key[:i], key[i+1:]
		} else {
			query = ""
		}
		if key == "" {
			continue
		}
		value := ""
		hasValue := false
		if i := strings.Index(key, "="); i >= 0 {
			key, value = key[:i], key[i+1:]
			hasValue = true
		}
		key, err1 := url.QueryUnescape(key)
		if err1 != nil {
			if err == nil {
				err = err1
			}
			continue
		}
		value, err1 = url.QueryUnescape(value)
		if err1 != nil {
			if err == nil {
				err = err1
			}
			continue
		}
		values = append(values, Query{
			Key:      key,
			Value:    value,
			HasValue: hasValue,
		})
	}
	return values, err
}
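// A small sketch of why this exists (illustrative values): unlike
// url.ParseQuery, which returns a url.Values map with randomized iteration
// order, parseQuery preserves the order in which the parameters appeared:
//
//	qs, _ := parseQuery("b=2&a=1&flag")
//	// qs[0] == Query{Key: "b", Value: "2", HasValue: true}
//	// qs[1] == Query{Key: "a", Value: "1", HasValue: true}
//	// qs[2] == Query{Key: "flag", HasValue: false}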

func encodeQueries(queries []Query) string {
	var buff bytes.Buffer
	for i, query := range queries {
		buff.WriteString(url.QueryEscape(query.Key))
		if query.HasValue {
			buff.WriteString("=")
			buff.WriteString(url.QueryEscape(query.Value))
		}
		if i < len(queries)-1 {
			buff.WriteString("&")
		}
	}
	return buff.String()
}

func sanitizedURL(val string) (string, error) {
	u, err := url.Parse(val)
	if err != nil {
		return "", err
	}

	// We use parseQuery rather than u.Query() to preserve the order of the
	// parameters: url.Values is a map, and map iteration order is randomized.
	queryValues, err := parseQuery(u.RawQuery)
	if err != nil {
		return "", err
	}
	// sanitize the URL query params
	for i, query := range queryValues {
		queryValues[i].Key = html.EscapeString(query.Key)
	}
	u.RawQuery = encodeQueries(queryValues)
	// u.String() will also sanitize host/scheme/user/pass
	return u.String(), nil
}
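// A minimal sketch of the round trip (illustrative values): the parameter
// order survives re-encoding, which u.Query() would not guarantee:
//
//	s, _ := sanitizedURL("/p?b=2&a=1")
//	// s == "/p?b=2&a=1" — "b" still precedes "a"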

// sanitizeWithBuff performs the actual sanitization process, returning an
// empty buffer on error.
func (p *Policy) sanitizeWithBuff(r io.Reader) *bytes.Buffer {
	var buff bytes.Buffer
	if err := p.sanitize(r, &buff); err != nil {
		return &bytes.Buffer{}
	}
	return &buff
}

// asStringWriter adapts a plain io.Writer so that it also satisfies
// io.StringWriter.
type asStringWriter struct {
	io.Writer
}

func (a *asStringWriter) WriteString(s string) (int, error) {
	return a.Write([]byte(s))
}

func (p *Policy) sanitize(r io.Reader, w io.Writer) error {
	// It is possible that the developer has created the policy via:
	//   p := bluemonday.Policy{}
	// rather than:
	//   p := bluemonday.NewPolicy()
	// If this is the case, and if they haven't yet triggered an action that
	// would initialize the maps, then we need to do that.
	p.init()

	buff, ok := w.(stringWriterWriter)
	if !ok {
		buff = &asStringWriter{w}
	}

	var (
		skipElementContent       bool
		skippingElementsCount    int64
		skipClosingTag           bool
		closingTagToSkipStack    []string
		mostRecentlyStartedToken string
	)

	tokenizer := html.NewTokenizer(r)
	for {
		if tokenizer.Next() == html.ErrorToken {
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input means end of processing
				return nil
			}

			// Raw tokenizer error
			return err
		}

		token := tokenizer.Token()
		switch token.Type {
		case html.DoctypeToken:

			// DocType is not handled as there is no safe parsing mechanism
			// provided by golang.org/x/net/html for the content, and this can
			// be misused to insert HTML tags that are not then sanitized
			//
			// One might wish to recursively sanitize here using the same policy
			// but I will need to do some further testing before considering
			// this.

		case html.CommentToken:

			// Comments are ignored by default
			if p.allowComments {
				// But if allowed then write the comment out as-is
				buff.WriteString(token.String())
			}

		case html.StartTagToken:

			mostRecentlyStartedToken = normaliseElementName(token.Data)

			switch mostRecentlyStartedToken {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				aa, matched := p.matchRegex(token.Data)
				if !matched {
					if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
						skipElementContent = true
						skippingElementsCount++
					}
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
				aps = aa
			}
			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 {
				if !p.allowNoAttrs(token.Data) {
					skipClosingTag = true
					closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
			}

			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.EndTagToken:

			if mostRecentlyStartedToken == normaliseElementName(token.Data) {
				mostRecentlyStartedToken = ""
			}

			switch normaliseElementName(token.Data) {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
				if len(closingTagToSkipStack) == 0 {
					skipClosingTag = false
				}
				if p.addSpaces {
					if _, err := buff.WriteString(" "); err != nil {
						return err
					}
				}
				break
			}
			if _, ok := p.elsAndAttrs[token.Data]; !ok {
				match := false
				for regex := range p.elsMatchingAndAttrs {
					if regex.MatchString(token.Data) {
						skipElementContent = false
						match = true
						break
					}
				}
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok && !match {
					skippingElementsCount--
					if skippingElementsCount == 0 {
						skipElementContent = false
					}
				}
				if !match {
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
			}

			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.SelfClosingTagToken:

			switch normaliseElementName(token.Data) {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				aa, matched := p.matchRegex(token.Data)
				if !matched {
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
				aps = aa
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
				if p.addSpaces {
					if _, err := buff.WriteString(" "); err != nil {
						return err
					}
				}
				break
			}
			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.TextToken:

			if !skipElementContent {
				switch mostRecentlyStartedToken {
				case `script`:
					// not encouraged, but if a policy allows JavaScript we
					// should not HTML escape it as that would break the output
					//
					// requires p.AllowUnsafe()
					if p.allowUnsafe {
						if _, err := buff.WriteString(token.Data); err != nil {
							return err
						}
					}
				case `style`:
					// not encouraged, but if a policy allows CSS styles we
					// should not HTML escape it as that would break the output
					//
					// requires p.AllowUnsafe()
					if p.allowUnsafe {
						if _, err := buff.WriteString(token.Data); err != nil {
							return err
						}
					}
				default:
					// HTML escape the text
					if _, err := buff.WriteString(token.String()); err != nil {
						return err
					}
				}
			}

		default:
			// A token type that didn't exist in the html package when this
			// was written
			return fmt.Errorf("unknown token: %v", token)
		}
	}
}

// sanitizeAttrs takes a set of element attribute policies and the global
// attribute policies and applies them to the []html.Attribute returning a set
// of html.Attributes that match the policies
func (p *Policy) sanitizeAttrs(
	elementName string,
	attrs []html.Attribute,
	aps map[string][]attrPolicy,
) []html.Attribute {

	if len(attrs) == 0 {
		return attrs
	}

	hasStylePolicies := false
	sps, elementHasStylePolicies := p.elsAndStyles[elementName]
	if len(p.globalStyles) > 0 || (elementHasStylePolicies && len(sps) > 0) {
		hasStylePolicies = true
	}
	// no specific element policy found, look for a pattern match
	if !hasStylePolicies {
		for k, v := range p.elsMatchingAndStyles {
			if k.MatchString(elementName) {
				if len(v) > 0 {
					hasStylePolicies = true
					break
				}
			}
		}
	}

	// Build a new attribute slice based on whether the attribute has been
	// allowed explicitly or globally.
	cleanAttrs := []html.Attribute{}
attrsLoop:
	for _, htmlAttr := range attrs {
		if p.allowDataAttributes {
			// If we see a data attribute, let it through.
			if isDataAttribute(htmlAttr.Key) {
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}
		// Is this a "style" attribute, and if so, do we need to sanitize it?
		if htmlAttr.Key == "style" && hasStylePolicies {
			htmlAttr = p.sanitizeStyles(htmlAttr, elementName)
			if htmlAttr.Val == "" {
				// We've sanitized away any and all styles; don't bother to
				// output the style attribute (even if it's allowed)
				continue
			}
			cleanAttrs = append(cleanAttrs, htmlAttr)
			continue
		}

		// Is there an element specific attribute policy that applies?
		if apl, ok := aps[htmlAttr.Key]; ok {
			for _, ap := range apl {
				if ap.regexp != nil {
					if ap.regexp.MatchString(htmlAttr.Val) {
						cleanAttrs = append(cleanAttrs, htmlAttr)
						continue attrsLoop
					}
				} else {
					cleanAttrs = append(cleanAttrs, htmlAttr)
					continue attrsLoop
				}
			}
		}

		// Is there a global attribute policy that applies?
		if apl, ok := p.globalAttrs[htmlAttr.Key]; ok {
			for _, ap := range apl {
				if ap.regexp != nil {
					if ap.regexp.MatchString(htmlAttr.Val) {
						cleanAttrs = append(cleanAttrs, htmlAttr)
						continue attrsLoop
					}
				} else {
					cleanAttrs = append(cleanAttrs, htmlAttr)
					continue attrsLoop
				}
			}
		}
	}

	if len(cleanAttrs) == 0 {
		// If nothing was allowed, let's get out of here
		return cleanAttrs
	}
	// cleanAttrs now contains the attributes that are permitted

	if linkable(elementName) {
		if p.requireParseableURLs {
			// Ensure URLs are parseable:
			// - a.href
			// - area.href
			// - link.href
			// - blockquote.cite
			// - q.cite
			// - img.src
			// - script.src
			tmpAttrs := []html.Attribute{}
			for _, htmlAttr := range cleanAttrs {
				switch elementName {
				case "a", "area", "base", "link":
					if htmlAttr.Key == "href" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "blockquote", "del", "ins", "q":
					if htmlAttr.Key == "cite" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "audio", "embed", "iframe", "img", "script", "source", "track", "video":
					if htmlAttr.Key == "src" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							if p.srcRewriter != nil {
								// u has already passed validURL, so a parse
								// error should not occur; if it somehow does,
								// keep the unrewritten value rather than
								// dereference a nil *url.URL
								if parsedURL, err := url.Parse(u); err == nil {
									p.srcRewriter(parsedURL)
									u = parsedURL.String()
								}
							}
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				default:
					tmpAttrs = append(tmpAttrs, htmlAttr)
				}
			}
			cleanAttrs = tmpAttrs
		}

		if (p.requireNoFollow ||
			p.requireNoFollowFullyQualifiedLinks ||
			p.requireNoReferrer ||
			p.requireNoReferrerFullyQualifiedLinks ||
			p.addTargetBlankToFullyQualifiedLinks) &&
			len(cleanAttrs) > 0 {

			// Add rel="nofollow" if a "href" exists
			switch elementName {
			case "a", "area", "base", "link":
				var hrefFound bool
				var externalLink bool
				for _, htmlAttr := range cleanAttrs {
					if htmlAttr.Key == "href" {
						hrefFound = true

						u, err := url.Parse(htmlAttr.Val)
						if err != nil {
							continue
						}
						if u.Host != "" {
							externalLink = true
						}

						continue
					}
				}

				if hrefFound {
					var (
						noFollowFound    bool
						noReferrerFound  bool
						targetBlankFound bool
					)

					addNoFollow := (p.requireNoFollow ||
						externalLink && p.requireNoFollowFullyQualifiedLinks)

					addNoReferrer := (p.requireNoReferrer ||
						externalLink && p.requireNoReferrerFullyQualifiedLinks)

					addTargetBlank := (externalLink &&
						p.addTargetBlankToFullyQualifiedLinks)

					tmpAttrs := []html.Attribute{}
					for _, htmlAttr := range cleanAttrs {

						var appended bool
						if htmlAttr.Key == "rel" && (addNoFollow || addNoReferrer) {

							if addNoFollow && !strings.Contains(htmlAttr.Val, "nofollow") {
								htmlAttr.Val += " nofollow"
							}
							if addNoReferrer && !strings.Contains(htmlAttr.Val, "noreferrer") {
								htmlAttr.Val += " noreferrer"
							}
							noFollowFound = addNoFollow
							noReferrerFound = addNoReferrer
							tmpAttrs = append(tmpAttrs, htmlAttr)
							appended = true
						}

						if elementName == "a" && htmlAttr.Key == "target" {
							if htmlAttr.Val == "_blank" {
								targetBlankFound = true
							}
							if addTargetBlank && !targetBlankFound {
								htmlAttr.Val = "_blank"
								targetBlankFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if !appended {
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
					}
					if noFollowFound || noReferrerFound || targetBlankFound {
						cleanAttrs = tmpAttrs
					}

					if (addNoFollow && !noFollowFound) || (addNoReferrer && !noReferrerFound) {
						rel := html.Attribute{}
						rel.Key = "rel"
						if addNoFollow {
							rel.Val = "nofollow"
						}
						if addNoReferrer {
							if rel.Val != "" {
								rel.Val += " "
							}
							rel.Val += "noreferrer"
						}
						cleanAttrs = append(cleanAttrs, rel)
					}

					if elementName == "a" && addTargetBlank && !targetBlankFound {
						tgt := html.Attribute{}
						tgt.Key = "target"
						tgt.Val = "_blank"
						targetBlankFound = true
						cleanAttrs = append(cleanAttrs, tgt)
					}

					if targetBlankFound {
						// target="_blank" has a security risk that allows the
						// opened window/tab to issue JavaScript calls against
						// window.opener, which in effect allows the destination
						// of the link to control the source:
						// https://dev.to/ben/the-targetblank-vulnerability-by-example
						//
						// To mitigate this risk, we need to add a specific rel
						// attribute if it is not already present.
						// rel="noopener"
						//
						// Unfortunately this processes the rel twice (we
						// already looked at it earlier ^^) as we cannot be sure
						// of the ordering of the href and rel, and whether we
						// have fully satisfied that we need to do this. This
						// double processing only happens *if* target="_blank"
						// is true.
						var noOpenerAdded bool
						tmpAttrs := []html.Attribute{}
						for _, htmlAttr := range cleanAttrs {
							var appended bool
							if htmlAttr.Key == "rel" {
								if !strings.Contains(htmlAttr.Val, "noopener") {
									htmlAttr.Val += " noopener"
								}
								noOpenerAdded = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
							if !appended {
								tmpAttrs = append(tmpAttrs, htmlAttr)
							}
						}
						if noOpenerAdded {
							cleanAttrs = tmpAttrs
						} else {
							// rel attr was not found, or else noopener would
							// have been added already
							rel := html.Attribute{}
							rel.Key = "rel"
							rel.Val = "noopener"
							cleanAttrs = append(cleanAttrs, rel)
						}

					}
				}
			default:
			}
		}
	}

	if p.requireCrossOriginAnonymous && len(cleanAttrs) > 0 {
		switch elementName {
		case "audio", "img", "link", "script", "video":
			var crossOriginFound bool
			for i, htmlAttr := range cleanAttrs {
				if htmlAttr.Key == "crossorigin" {
					crossOriginFound = true
					cleanAttrs[i].Val = "anonymous"
				}
			}

			if !crossOriginFound {
				crossOrigin := html.Attribute{}
				crossOrigin.Key = "crossorigin"
				crossOrigin.Val = "anonymous"
				cleanAttrs = append(cleanAttrs, crossOrigin)
			}
		}
	}

	if p.requireSandboxOnIFrame != nil && elementName == "iframe" {
		var sandboxFound bool
		for i, htmlAttr := range cleanAttrs {
			if htmlAttr.Key == "sandbox" {
				sandboxFound = true
				var cleanVals []string
				cleanValsSet := make(map[string]bool)
				for _, val := range strings.Fields(htmlAttr.Val) {
					if p.requireSandboxOnIFrame[val] {
						if !cleanValsSet[val] {
							cleanVals = append(cleanVals, val)
							cleanValsSet[val] = true
						}
					}
				}
				cleanAttrs[i].Val = strings.Join(cleanVals, " ")
			}
		}

		if !sandboxFound {
			sandbox := html.Attribute{}
			sandbox.Key = "sandbox"
			sandbox.Val = ""
			cleanAttrs = append(cleanAttrs, sandbox)
		}
	}

	return cleanAttrs
}
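// A minimal sketch of how attribute policies reach this function
// (illustrative; AllowAttrs, Matching and OnElements are the public policy
// builders defined elsewhere in this package):
//
//	p := bluemonday.NewPolicy()
//	p.AllowAttrs("href").Matching(regexp.MustCompile(`(?i)^https?://`)).OnElements("a")
//	// For <a href="...">, sanitizeAttrs receives aps["href"] containing the
//	// regexp policy above; non-matching hrefs are dropped from cleanAttrs.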

func (p *Policy) sanitizeStyles(attr html.Attribute, elementName string) html.Attribute {
	sps := p.elsAndStyles[elementName]
	if len(sps) == 0 {
		sps = map[string][]stylePolicy{}
		// Check for any matching elements, as we don't already have a policy.
		// If multiple matches are found they will be overwritten, so it's
		// best not to have overlapping matchers.
		for regex, policies := range p.elsMatchingAndStyles {
			if regex.MatchString(elementName) {
				for k, v := range policies {
					sps[k] = append(sps[k], v...)
				}
			}
		}
	}

	// Add a semicolon to the end to fix a parsing issue
	attr.Val = strings.TrimRight(attr.Val, " ")
	if len(attr.Val) > 0 && attr.Val[len(attr.Val)-1] != ';' {
		attr.Val = attr.Val + ";"
	}
	decs, err := parser.ParseDeclarations(attr.Val)
	if err != nil {
		attr.Val = ""
		return attr
	}
	clean := []string{}
	prefixes := []string{"-webkit-", "-moz-", "-ms-", "-o-", "mso-", "-xv-", "-atsc-", "-wap-", "-khtml-", "prince-", "-ah-", "-hp-", "-ro-", "-rim-", "-tc-"}

decLoop:
	for _, dec := range decs {
		tempProperty := strings.ToLower(dec.Property)
		tempValue := removeUnicode(strings.ToLower(dec.Value))
		for _, i := range prefixes {
			tempProperty = strings.TrimPrefix(tempProperty, i)
		}
		if spl, ok := sps[tempProperty]; ok {
			for _, sp := range spl {
				if sp.handler != nil {
					if sp.handler(tempValue) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				} else if len(sp.enum) > 0 {
					if stringInSlice(tempValue, sp.enum) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				} else if sp.regexp != nil {
					if sp.regexp.MatchString(tempValue) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				}
			}
		}
		if spl, ok := p.globalStyles[tempProperty]; ok {
			for _, sp := range spl {
				if sp.handler != nil {
					if sp.handler(tempValue) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				} else if len(sp.enum) > 0 {
					if stringInSlice(tempValue, sp.enum) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				} else if sp.regexp != nil {
					if sp.regexp.MatchString(tempValue) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				}
			}
		}
	}
	if len(clean) > 0 {
		attr.Val = strings.Join(clean, "; ")
	} else {
		attr.Val = ""
	}
	return attr
}
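// A minimal sketch of a style policy feeding this function (illustrative;
// AllowElements, AllowStyles, MatchingEnum and OnElements are assumed to be
// the public builders defined elsewhere in this package):
//
//	p := bluemonday.NewPolicy()
//	p.AllowElements("span")
//	p.AllowStyles("text-align").MatchingEnum("left", "center", "right").OnElements("span")
//	// `<span style="text-align: center; color: red">` becomes
//	// `<span style="text-align: center">` — the undeclared property is dropped.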

func (p *Policy) allowNoAttrs(elementName string) bool {
	_, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
	if !ok {
		for _, r := range p.setOfElementsMatchingAllowedWithoutAttrs {
			if r.MatchString(elementName) {
				ok = true
				break
			}
		}
	}
	return ok
}

func (p *Policy) validURL(rawurl string) (string, bool) {
	if p.requireParseableURLs {
		// A URL is valid if it is still valid once surrounding whitespace is
		// trimmed
		rawurl = strings.TrimSpace(rawurl)

		// URLs cannot contain whitespace, unless it is a data URI
		if strings.Contains(rawurl, " ") ||
			strings.Contains(rawurl, "\t") ||
			strings.Contains(rawurl, "\n") {
			if !strings.HasPrefix(rawurl, `data:`) {
				return "", false
			}

			// Remove \r and \n from base64 encoded data to pass url.Parse.
			matched := dataURIbase64Prefix.FindString(rawurl)
			if matched != "" {
				rawurl = matched + strings.Replace(
					strings.Replace(
						rawurl[len(matched):],
						"\r",
						"",
						-1,
					),
					"\n",
					"",
					-1,
				)
			}
		}

		// URLs are valid if they parse
		u, err := url.Parse(rawurl)
		if err != nil {
			return "", false
		}

		if u.Scheme != "" {
			urlPolicies, ok := p.allowURLSchemes[u.Scheme]
			if !ok {
				for _, r := range p.allowURLSchemeRegexps {
					if r.MatchString(u.Scheme) {
						return u.String(), true
					}
				}

				return "", false
			}

			if len(urlPolicies) == 0 {
				return u.String(), true
			}

			for _, urlPolicy := range urlPolicies {
				if urlPolicy(u) {
					return u.String(), true
				}
			}

			return "", false
		}

		if p.allowRelativeURLs {
			if u.String() != "" {
				return u.String(), true
			}
		}

		return "", false
	}

	return rawurl, true
}
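// A minimal sketch of the policy knobs this function consults (illustrative;
// RequireParseableURLs, AllowURLSchemes and AllowRelativeURLs are the public
// setters defined elsewhere in this package):
//
//	p := bluemonday.NewPolicy()
//	p.RequireParseableURLs(true)
//	p.AllowURLSchemes("https") // "https://example.org" passes
//	p.AllowRelativeURLs(true)  // "/relative/path" passes
//	// "javascript:alert(1)" fails: its scheme is not in p.allowURLSchemes.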

// linkable reports whether the element can carry a URL-bearing attribute
// (href, cite, or src).
func linkable(elementName string) bool {
	switch elementName {
	case "a", "area", "base", "link":
		// elements that allow .href
		return true
	case "blockquote", "del", "ins", "q":
		// elements that allow .cite
		return true
	case "audio", "embed", "iframe", "img", "input", "script", "track", "video":
		// elements that allow .src
		return true
	default:
		return false
	}
}

// stringInSlice returns true if needle exists in haystack
func stringInSlice(needle string, haystack []string) bool {
	for _, straw := range haystack {
		if strings.EqualFold(straw, needle) {
			return true
		}
	}
	return false
}

// isDataAttribute reports whether val names a valid data-* attribute.
func isDataAttribute(val string) bool {
	if !dataAttribute.MatchString(val) {
		return false
	}
	rest := strings.Split(val, "data-")
	if len(rest) == 1 {
		return false
	}
	// data-xml* is invalid.
	if dataAttributeXMLPrefix.MatchString(rest[1]) {
		return false
	}
	// no uppercase or semi-colons allowed.
	if dataAttributeInvalidChars.MatchString(rest[1]) {
		return false
	}
	return true
}
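// Illustrative outcomes (derived from the checks above):
//
//	isDataAttribute("data-foo")    // true
//	isDataAttribute("data-xmlfoo") // false: data-xml* is reserved
//	isDataAttribute("data-Foo")    // false: uppercase is not allowed
//	isDataAttribute("href")        // false: not a data-* attribute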

// removeUnicode expands CSS unicode escapes (e.g. `\72 ` for "r") into their
// literal characters so that obfuscated values can be checked against the
// style policies.
func removeUnicode(value string) string {
	substitutedValue := value
	currentLoc := cssUnicodeChar.FindStringIndex(substitutedValue)
	for currentLoc != nil {

		character := substitutedValue[currentLoc[0]+1 : currentLoc[1]]
		character = strings.TrimSpace(character)
		if len(character) < 4 {
			character = strings.Repeat("0", 4-len(character)) + character
		} else {
			for len(character) > 4 {
				if character[0] != '0' {
					character = ""
					break
				} else {
					character = character[1:]
				}
			}
		}
		character = "\\u" + character
		translatedChar, err := strconv.Unquote(`"` + character + `"`)
		translatedChar = strings.TrimSpace(translatedChar)
		if err != nil {
			return ""
		}
		substitutedValue = substitutedValue[0:currentLoc[0]] + translatedChar + substitutedValue[currentLoc[1]:]
		currentLoc = cssUnicodeChar.FindStringIndex(substitutedValue)
	}
	return substitutedValue
}
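// An illustrative expansion: in CSS, `\72 ed` is another way to write "red",
// so an escaped property value still matches an enum or regexp policy:
//
//	removeUnicode(`\72 ed`) // "red"
//	removeUnicode(`red`)    // "red" (no escapes, returned unchanged)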

// matchRegex returns the merged attribute policies of every element regexp
// that matches elementName, and whether any matched.
func (p *Policy) matchRegex(elementName string) (map[string][]attrPolicy, bool) {
	aps := make(map[string][]attrPolicy)
	matched := false
	for regex, attrs := range p.elsMatchingAndAttrs {
		if regex.MatchString(elementName) {
			matched = true
			for k, v := range attrs {
				aps[k] = append(aps[k], v...)
			}
		}
	}
	return aps, matched
}

// normaliseElementName takes an HTML element name like <script>, which is user
// input, and returns a lower-case version of it that is immune to UTF-8 to
// ASCII conversion tricks (like the use of the Turkish dotted capital İ in
// scrİpt, which strings.ToLower would convert to script). Instead this func
// preserves all non-ASCII as its escaped equivalent, i.e. \u0130, which
// reveals such characters rather than lower-casing them
func normaliseElementName(str string) string {
	// QuoteToASCII helpfully escapes the non-ASCII characters, but it also
	// puts quote marks at the start and end, so those are trimmed off
	return strings.TrimSuffix(
		strings.TrimPrefix(
			strings.ToLower(
				strconv.QuoteToASCII(str),
			),
			`"`),
		`"`,
	)
}
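// An illustrative comparison (values derived from the behaviour above):
//
//	strings.ToLower("scrİpt")      // "script" — the spoof survives
//	normaliseElementName("scrİpt") // `scr\u0130pt` — never equal to `script`
//	normaliseElementName("SCRIPT") // "script"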

// stringWriterWriter is the composite interface the sanitizer writes to; an
// io.Writer that cannot WriteString is wrapped in asStringWriter to satisfy
// it.
type stringWriterWriter interface {
	io.Writer
	io.StringWriter
}