1// Copyright (c) 2014, David Kitchen <david@buro9.com>
2//
3// All rights reserved.
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are met:
7//
8// * Redistributions of source code must retain the above copyright notice, this
9// list of conditions and the following disclaimer.
10//
11// * Redistributions in binary form must reproduce the above copyright notice,
12// this list of conditions and the following disclaimer in the documentation
13// and/or other materials provided with the distribution.
14//
15// * Neither the name of the organisation (Microcosm) nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30package bluemonday
31
32import (
33 "encoding/base64"
34 "net/url"
35 "regexp"
36)
37
// A selection of regular expressions that can be used as .Matching() rules on
// HTML attributes.
//
// Every pattern in this block is anchored with ^ and $ so that the whole
// attribute value has to conform, not just a substring of it. The one
// exception is dataURIImagePrefix, which is deliberately a prefix match.
var (
	// CellAlign handles the `align` attribute on table cells (case-insensitive).
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td#attr-align
	CellAlign = regexp.MustCompile(`(?i)^(center|justify|left|right|char)$`)

	// CellVerticalAlign handles the `valign` attribute (case-insensitive).
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td#attr-valign
	CellVerticalAlign = regexp.MustCompile(`(?i)^(baseline|bottom|middle|top)$`)

	// Direction handles the `dir` attribute (case-insensitive).
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/bdo#attr-dir
	Direction = regexp.MustCompile(`(?i)^(rtl|ltr)$`)

	// ImageAlign handles the `align` attribute on the `img` tag
	// (case-insensitive).
	// http://www.w3.org/MarkUp/Test/Img/imgtest.html
	ImageAlign = regexp.MustCompile(
		`(?i)^(left|right|top|texttop|middle|absmiddle|baseline|bottom|absbottom)$`,
	)

	// Integer describes whole non-negative integers (0 included) used in
	// places like td.colspan. A sign, decimal point or empty value is rejected.
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td#attr-colspan
	Integer = regexp.MustCompile(`^[0-9]+$`)

	// ISO8601 according to the W3 group is only a subset of the ISO8601
	// standard: http://www.w3.org/TR/NOTE-datetime
	//
	// Used in places like time.datetime
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/time#attr-datetime
	//
	// Matches patterns:
	//
	//	Year:
	//	    YYYY (eg 1997)
	//	Year and month:
	//	    YYYY-MM (eg 1997-07)
	//	Complete date:
	//	    YYYY-MM-DD (eg 1997-07-16)
	//	Complete date plus hours and minutes:
	//	    YYYY-MM-DDThh:mmTZD (eg 1997-07-16T19:20+01:00)
	//	Complete date plus hours, minutes and seconds:
	//	    YYYY-MM-DDThh:mm:ssTZD (eg 1997-07-16T19:20:30+01:00)
	//	Complete date plus hours, minutes, seconds and a decimal fraction of a
	//	second:
	//	    YYYY-MM-DDThh:mm:ss.sTZD (eg 1997-07-16T19:20:30.45+01:00)
	//
	// The pattern also accepts a single space in place of the "T" separator,
	// a literal "Z" as the zone designator, and a fraction of one to six
	// digits. The fraction separator is a literal "." (escaped in the
	// pattern); an unescaped dot would let any character stand in for it.
	ISO8601 = regexp.MustCompile(
		`^[0-9]{4}(-[0-9]{2}(-[0-9]{2}([ T][0-9]{2}(:[0-9]{2}){1,2}(\.[0-9]{1,6})` +
			`?Z?([\+-][0-9]{2}:[0-9]{2})?)?)?)?$`,
	)

	// ListType encapsulates the common value as well as the latest spec
	// values for lists (matched case-insensitively).
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/ol#attr-type
	ListType = regexp.MustCompile(`(?i)^(circle|disc|square|a|A|i|I|1)$`)

	// SpaceSeparatedTokens is used in places like `a.rel` and the common attribute
	// `class` which both contain space delimited lists of data tokens
	// http://www.w3.org/TR/html-markup/datatypes.html#common.data.tokens-def
	// Regexp: \p{L} matches unicode letters, \p{N} matches unicode numbers
	SpaceSeparatedTokens = regexp.MustCompile(`^([\s\p{L}\p{N}_-]+)$`)

	// Number is a double value used on HTML5 meter and progress elements.
	// An optional sign, optional integer part, optional decimal point and an
	// optional exponent are accepted; at least one trailing digit is required.
	// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-button-element.html#the-meter-element
	Number = regexp.MustCompile(`^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?$`)

	// NumberOrPercent is used predominantly as units of measurement in width
	// and height attributes: one or more digits optionally followed by "%".
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#attr-height
	NumberOrPercent = regexp.MustCompile(`^[0-9]+[%]?$`)

	// Paragraph of text in an attribute such as *.'title', img.alt, etc
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes#attr-title
	// Note that we are not allowing chars that could close tags like '>'.
	// The empty string is accepted.
	Paragraph = regexp.MustCompile(`^[\p{L}\p{N}\s\-_',\[\]!\./\\\(\)]*$`)

	// dataURIImagePrefix is used by AllowDataURIImages to define the acceptable
	// prefix of data URIs that contain common web image formats. It matches
	// the media type and the ";base64," marker at the start of the opaque
	// part of the URI.
	//
	// This is not exported as it's not useful by itself, and only has value
	// within the AllowDataURIImages func
	dataURIImagePrefix = regexp.MustCompile(
		`^image/(gif|jpeg|png|svg\+xml|webp);base64,`,
	)
)
123
124// AllowStandardURLs is a convenience function that will enable rel="nofollow"
125// on "a", "area" and "link" (if you have allowed those elements) and will
126// ensure that the URL values are parseable and either relative or belong to the
127// "mailto", "http", or "https" schemes
128func (p *Policy) AllowStandardURLs() {
129 // URLs must be parseable by net/url.Parse()
130 p.RequireParseableURLs(true)
131
132 // !url.IsAbs() is permitted
133 p.AllowRelativeURLs(true)
134
135 // Most common URL schemes only
136 p.AllowURLSchemes("mailto", "http", "https")
137
138 // For linking elements we will add rel="nofollow" if it does not already exist
139 // This applies to "a" "area" "link"
140 p.RequireNoFollowOnLinks(true)
141}
142
143// AllowStandardAttributes will enable "id", "title" and the language specific
144// attributes "dir" and "lang" on all elements that are allowed
145func (p *Policy) AllowStandardAttributes() {
146 // "dir" "lang" are permitted as both language attributes affect charsets
147 // and direction of text.
148 p.AllowAttrs("dir").Matching(Direction).Globally()
149 p.AllowAttrs(
150 "lang",
151 ).Matching(regexp.MustCompile(`[a-zA-Z]{2,20}`)).Globally()
152
153 // "id" is permitted. This is pretty much as some HTML elements require this
154 // to work well ("dfn" is an example of a "id" being value)
155 // This does create a risk that JavaScript and CSS within your web page
156 // might identify the wrong elements. Ensure that you select things
157 // accurately
158 p.AllowAttrs("id").Matching(
159 regexp.MustCompile(`[a-zA-Z0-9\:\-_\.]+`),
160 ).Globally()
161
162 // "title" is permitted as it improves accessibility.
163 p.AllowAttrs("title").Matching(Paragraph).Globally()
164}
165
166// AllowStyling presently enables the class attribute globally.
167//
168// Note: When bluemonday ships a CSS parser and we can safely sanitise that,
169// this will also allow sanitized styling of elements via the style attribute.
170func (p *Policy) AllowStyling() {
171
172 // "class" is permitted globally
173 p.AllowAttrs("class").Matching(SpaceSeparatedTokens).Globally()
174}
175
176// AllowImages enables the img element and some popular attributes. It will also
177// ensure that URL values are parseable. This helper does not enable data URI
178// images, for that you should also use the AllowDataURIImages() helper.
179func (p *Policy) AllowImages() {
180
181 // "img" is permitted
182 p.AllowAttrs("align").Matching(ImageAlign).OnElements("img")
183 p.AllowAttrs("alt").Matching(Paragraph).OnElements("img")
184 p.AllowAttrs("height", "width").Matching(NumberOrPercent).OnElements("img")
185
186 // Standard URLs enabled
187 p.AllowStandardURLs()
188 p.AllowAttrs("src").OnElements("img")
189}
190
191// AllowDataURIImages permits the use of inline images defined in RFC2397
192// http://tools.ietf.org/html/rfc2397
193// http://en.wikipedia.org/wiki/Data_URI_scheme
194//
195// Images must have a mimetype matching:
196//
197// image/gif
198// image/jpeg
199// image/png
200// image/webp
201//
202// NOTE: There is a potential security risk to allowing data URIs and you should
203// only permit them on content you already trust.
204// http://palizine.plynt.com/issues/2010Oct/bypass-xss-filters/
205// https://capec.mitre.org/data/definitions/244.html
206func (p *Policy) AllowDataURIImages() {
207
208 // URLs must be parseable by net/url.Parse()
209 p.RequireParseableURLs(true)
210
211 // Supply a function to validate images contained within data URI
212 p.AllowURLSchemeWithCustomPolicy(
213 "data",
214 func(url *url.URL) (allowUrl bool) {
215 if url.RawQuery != "" || url.Fragment != "" {
216 return false
217 }
218
219 matched := dataURIImagePrefix.FindString(url.Opaque)
220 if matched == "" {
221 return false
222 }
223
224 _, err := base64.StdEncoding.DecodeString(url.Opaque[len(matched):])
225 return err == nil
226 },
227 )
228}
229
230// AllowLists will enabled ordered and unordered lists, as well as definition
231// lists
232func (p *Policy) AllowLists() {
233 // "ol" "ul" are permitted
234 p.AllowAttrs("type").Matching(ListType).OnElements("ol", "ul")
235
236 // "li" is permitted
237 p.AllowAttrs("type").Matching(ListType).OnElements("li")
238 p.AllowAttrs("value").Matching(Integer).OnElements("li")
239
240 // "dl" "dt" "dd" are permitted
241 p.AllowElements("dl", "dt", "dd")
242}
243
244// AllowTables will enable a rich set of elements and attributes to describe
245// HTML tables
246func (p *Policy) AllowTables() {
247
248 // "table" is permitted
249 p.AllowAttrs("height", "width").Matching(NumberOrPercent).OnElements("table")
250 p.AllowAttrs("summary").Matching(Paragraph).OnElements("table")
251
252 // "caption" is permitted
253 p.AllowElements("caption")
254
255 // "col" "colgroup" are permitted
256 p.AllowAttrs("align").Matching(CellAlign).OnElements("col", "colgroup")
257 p.AllowAttrs("height", "width").Matching(
258 NumberOrPercent,
259 ).OnElements("col", "colgroup")
260 p.AllowAttrs("span").Matching(Integer).OnElements("colgroup", "col")
261 p.AllowAttrs("valign").Matching(
262 CellVerticalAlign,
263 ).OnElements("col", "colgroup")
264
265 // "thead" "tr" are permitted
266 p.AllowAttrs("align").Matching(CellAlign).OnElements("thead", "tr")
267 p.AllowAttrs("valign").Matching(CellVerticalAlign).OnElements("thead", "tr")
268
269 // "td" "th" are permitted
270 p.AllowAttrs("abbr").Matching(Paragraph).OnElements("td", "th")
271 p.AllowAttrs("align").Matching(CellAlign).OnElements("td", "th")
272 p.AllowAttrs("colspan", "rowspan").Matching(Integer).OnElements("td", "th")
273 p.AllowAttrs("headers").Matching(
274 SpaceSeparatedTokens,
275 ).OnElements("td", "th")
276 p.AllowAttrs("height", "width").Matching(
277 NumberOrPercent,
278 ).OnElements("td", "th")
279 p.AllowAttrs(
280 "scope",
281 ).Matching(
282 regexp.MustCompile(`(?i)(?:row|col)(?:group)?`),
283 ).OnElements("td", "th")
284 p.AllowAttrs("valign").Matching(CellVerticalAlign).OnElements("td", "th")
285 p.AllowAttrs("nowrap").Matching(
286 regexp.MustCompile(`(?i)|nowrap`),
287 ).OnElements("td", "th")
288
289 // "tbody" "tfoot"
290 p.AllowAttrs("align").Matching(CellAlign).OnElements("tbody", "tfoot")
291 p.AllowAttrs("valign").Matching(
292 CellVerticalAlign,
293 ).OnElements("tbody", "tfoot")
294}
295
296func (p *Policy) AllowIFrames(vals ...SandboxValue) {
297 p.AllowAttrs("sandbox").OnElements("iframe")
298
299 p.RequireSandboxOnIFrame(vals...)
300}