sanitizer_test.go

  1package web
  2
  3import (
  4	"strings"
  5	"testing"
  6
  7	"github.com/matryer/is"
  8	"golang.org/x/net/html"
  9)
 10
 11func TestSanitizerXSSProtection(t *testing.T) {
 12	is := is.New(t)
 13
 14	ctx := &ReadmeContext{
 15		RepoName:   "test-repo",
 16		CommitHash: "abc123",
 17		ReadmePath: "README.md",
 18	}
 19
 20	// Test javascript: URL in link
 21	md := []byte(`[click me](javascript:alert('xss'))`)
 22	html, err := renderMarkdown(md, ctx)
 23	is.NoErr(err)
 24	is.True(!strings.Contains(string(html), "javascript:"))
 25
 26	// Test javascript: URL in image
 27	md = []byte(`![img](javascript:alert('xss'))`)
 28	html, err = renderMarkdown(md, ctx)
 29	is.NoErr(err)
 30	is.True(!strings.Contains(string(html), "javascript:"))
 31
 32	// Test data: URI in image
 33	md = []byte(`![img](data:image/png;base64,iVBORw0KG)`)
 34	html, err = renderMarkdown(md, ctx)
 35	is.NoErr(err)
 36	is.True(!strings.Contains(string(html), "data:image"))
 37
 38	// Test data: URI in link
 39	md = []byte(`[link](data:text/html,<script>alert('xss')</script>)`)
 40	html, err = renderMarkdown(md, ctx)
 41	is.NoErr(err)
 42	is.True(!strings.Contains(string(html), "data:text"))
 43
 44	// Test onerror handler
 45	md = []byte(`<img src="x" onerror="alert('xss')">`)
 46	html, err = renderMarkdown(md, ctx)
 47	is.NoErr(err)
 48	is.True(!strings.Contains(string(html), "onerror"))
 49
 50	// Test onclick handler
 51	md = []byte(`<a href="#" onclick="alert('xss')">click</a>`)
 52	html, err = renderMarkdown(md, ctx)
 53	is.NoErr(err)
 54	is.True(!strings.Contains(string(html), "onclick"))
 55
 56	// Test style attribute
 57	md = []byte(`<p style="background:url(javascript:alert('xss'))">test</p>`)
 58	html, err = renderMarkdown(md, ctx)
 59	is.NoErr(err)
 60	is.True(!strings.Contains(string(html), "style="))
 61	is.True(!strings.Contains(string(html), "javascript"))
 62
 63	// Test iframe injection
 64	md = []byte(`<iframe src="https://evil.com"></iframe>`)
 65	html, err = renderMarkdown(md, ctx)
 66	is.NoErr(err)
 67	is.True(!strings.Contains(string(html), "<iframe"))
 68
 69	// Test script tag
 70	md = []byte(`<script>alert('xss')</script>`)
 71	html, err = renderMarkdown(md, ctx)
 72	is.NoErr(err)
 73	is.True(!strings.Contains(string(html), "<script"))
 74
 75	// Test object/embed tags
 76	md = []byte(`<object data="https://evil.com"></object>`)
 77	html, err = renderMarkdown(md, ctx)
 78	is.NoErr(err)
 79	is.True(!strings.Contains(string(html), "<object"))
 80
 81	md = []byte(`<embed src="https://evil.com">`)
 82	html, err = renderMarkdown(md, ctx)
 83	is.NoErr(err)
 84	is.True(!strings.Contains(string(html), "<embed"))
 85}
 86
 87func TestSanitizerAllowedTags(t *testing.T) {
 88	is := is.New(t)
 89
 90	ctx := &ReadmeContext{
 91		RepoName:   "test-repo",
 92		CommitHash: "abc123",
 93		ReadmePath: "README.md",
 94	}
 95
 96	// Test allowed basic formatting
 97	md := []byte(`**bold** *italic* ~~strikethrough~~`)
 98	html, err := renderMarkdown(md, ctx)
 99	is.NoErr(err)
100	is.True(strings.Contains(string(html), "<strong>"))
101	is.True(strings.Contains(string(html), "<em>"))
102	is.True(strings.Contains(string(html), "<del>"))
103
104	// Test allowed headings
105	md = []byte(`# H1
106## H2
107### H3`)
108	html, err = renderMarkdown(md, ctx)
109	is.NoErr(err)
110	is.True(strings.Contains(string(html), "<h1"))
111	is.True(strings.Contains(string(html), "<h2"))
112	is.True(strings.Contains(string(html), "<h3"))
113
114	// Test allowed lists
115	md = []byte(`- item 1
116- item 2
117
1181. numbered
1192. list`)
120	html, err = renderMarkdown(md, ctx)
121	is.NoErr(err)
122	is.True(strings.Contains(string(html), "<ul"))
123	is.True(strings.Contains(string(html), "<ol"))
124	is.True(strings.Contains(string(html), "<li"))
125
126	// Test allowed code blocks
127	md = []byte("```go\nfunc main() {}\n```")
128	html, err = renderMarkdown(md, ctx)
129	is.NoErr(err)
130	is.True(strings.Contains(string(html), "<pre"))
131	is.True(strings.Contains(string(html), "<code"))
132
133	// Test allowed tables
134	md = []byte(`| Col1 | Col2 |
135|------|------|
136| A    | B    |`)
137	html, err = renderMarkdown(md, ctx)
138	is.NoErr(err)
139	is.True(strings.Contains(string(html), "<table"))
140	is.True(strings.Contains(string(html), "<thead"))
141	is.True(strings.Contains(string(html), "<tbody"))
142	is.True(strings.Contains(string(html), "<tr"))
143	is.True(strings.Contains(string(html), "<th"))
144	is.True(strings.Contains(string(html), "<td"))
145
146	// Test allowed blockquote
147	md = []byte(`> quote`)
148	html, err = renderMarkdown(md, ctx)
149	is.NoErr(err)
150	is.True(strings.Contains(string(html), "<blockquote"))
151
152	// Test allowed details/summary (collapsible sections)
153	md = []byte(`<details>
154<summary>Click to expand</summary>
155
156Hidden content here
157
158</details>`)
159	html, err = renderMarkdown(md, ctx)
160	is.NoErr(err)
161	is.True(strings.Contains(string(html), "<details"))
162	is.True(strings.Contains(string(html), "<summary"))
163}
164
165func TestSanitizerHTTPSOnly(t *testing.T) {
166	is := is.New(t)
167
168	ctx := &ReadmeContext{
169		RepoName:   "test-repo",
170		CommitHash: "abc123",
171		ReadmePath: "README.md",
172	}
173
174	// Test http:// link is stripped
175	md := []byte(`[insecure](http://example.com)`)
176	html, err := renderMarkdown(md, ctx)
177	is.NoErr(err)
178	htmlStr := string(html)
179	// Either no anchor tag or the anchor has no href attribute
180	is.True(!hasElementWithTag(htmlStr, "a") || elementAttrAbsent(htmlStr, "a", "href"))
181
182	// Test https:// link is allowed
183	md = []byte(`[secure](https://example.com)`)
184	html, err = renderMarkdown(md, ctx)
185	is.NoErr(err)
186	htmlStr = string(html)
187	is.True(elementHasAttr(htmlStr, "a", "href"))
188	is.True(elementAttrContains(htmlStr, "a", "href", "https://example.com"))
189
190	// Test http:// image is stripped
191	md = []byte(`![img](http://example.com/image.png)`)
192	html, err = renderMarkdown(md, ctx)
193	is.NoErr(err)
194	htmlStr = string(html)
195	is.True(!hasElementWithTag(htmlStr, "img") || elementAttrAbsent(htmlStr, "img", "src"))
196
197	// Test https:// image is allowed
198	md = []byte(`![img](https://example.com/image.png)`)
199	html, err = renderMarkdown(md, ctx)
200	is.NoErr(err)
201	htmlStr = string(html)
202	is.True(elementHasAttr(htmlStr, "img", "src"))
203	is.True(elementAttrContains(htmlStr, "img", "src", "https://example.com/image.png"))
204}
205
206func TestSanitizerNoFollowNoReferrer(t *testing.T) {
207	is := is.New(t)
208
209	ctx := &ReadmeContext{
210		RepoName:   "test-repo",
211		CommitHash: "abc123",
212		ReadmePath: "README.md",
213	}
214
215	// Test that external links get rel="nofollow noreferrer"
216	md := []byte(`[external](https://example.com)`)
217	html, err := renderMarkdown(md, ctx)
218	is.NoErr(err)
219	htmlStr := string(html)
220	is.True(elementHasAttr(htmlStr, "a", "rel"))
221	relAttr := getElementAttr(htmlStr, "a", "rel")
222	is.True(strings.Contains(relAttr, "nofollow"))
223	is.True(strings.Contains(relAttr, "noreferrer"))
224
225	// Test that relative/internal links also get rel="nofollow noreferrer"
226	md = []byte(`[internal](docs/README.md)`)
227	html, err = renderMarkdown(md, ctx)
228	is.NoErr(err)
229	htmlStr = string(html)
230	is.True(elementHasAttr(htmlStr, "a", "rel"))
231	relAttr = getElementAttr(htmlStr, "a", "rel")
232	is.True(strings.Contains(relAttr, "nofollow"))
233	is.True(strings.Contains(relAttr, "noreferrer"))
234}
235
236func TestSanitizerAdditionalSchemes(t *testing.T) {
237	is := is.New(t)
238
239	ctx := &ReadmeContext{
240		RepoName:   "test-repo",
241		CommitHash: "abc123",
242		ReadmePath: "README.md",
243	}
244
245	// Test protocol-relative URLs are stripped
246	md := []byte(`[protocol-relative](//evil.com/script.js)`)
247	html, err := renderMarkdown(md, ctx)
248	is.NoErr(err)
249	htmlStr := string(html)
250	// Rewriter blocks protocol-relative URLs with empty href, then sanitizer strips empty href links
251	is.True(!hasElementWithTag(htmlStr, "a"))
252
253	// Test mailto: scheme is stripped
254	md = []byte(`[email](mailto:user@example.com)`)
255	html, err = renderMarkdown(md, ctx)
256	is.NoErr(err)
257	htmlStr = string(html)
258	// mailto: is not in allowed schemes, so sanitizer strips the entire link
259	is.True(!hasElementWithTag(htmlStr, "a"))
260
261	// Test ftp: scheme is stripped
262	md = []byte(`[ftp](ftp://ftp.example.com/file.txt)`)
263	html, err = renderMarkdown(md, ctx)
264	is.NoErr(err)
265	htmlStr = string(html)
266	// ftp: is not in allowed schemes, so sanitizer strips the entire link
267	is.True(!hasElementWithTag(htmlStr, "a"))
268}
269
270func TestSanitizerDisallowedAttributes(t *testing.T) {
271	is := is.New(t)
272
273	ctx := &ReadmeContext{
274		RepoName:   "test-repo",
275		CommitHash: "abc123",
276		ReadmePath: "README.md",
277	}
278
279	// Test target attribute is stripped from links
280	md := []byte(`<a href="https://example.com" target="_blank">link</a>`)
281	html, err := renderMarkdown(md, ctx)
282	is.NoErr(err)
283	htmlStr := string(html)
284	is.True(elementAttrAbsent(htmlStr, "a", "target"))
285
286	// Test class attribute is stripped
287	md = []byte(`<a href="https://example.com" class="dangerous">link</a>`)
288	html, err = renderMarkdown(md, ctx)
289	is.NoErr(err)
290	htmlStr = string(html)
291	is.True(elementAttrAbsent(htmlStr, "a", "class"))
292
293	// Test style attribute is stripped (already tested in XSS, but explicit here)
294	md = []byte(`<a href="https://example.com" style="color: red;">link</a>`)
295	html, err = renderMarkdown(md, ctx)
296	is.NoErr(err)
297	htmlStr = string(html)
298	is.True(elementAttrAbsent(htmlStr, "a", "style"))
299
300	// Test onclick is stripped (already tested in XSS, but explicit here)
301	md = []byte(`<a href="#" onclick="alert('xss')">click</a>`)
302	html, err = renderMarkdown(md, ctx)
303	is.NoErr(err)
304	htmlStr = string(html)
305	is.True(elementAttrAbsent(htmlStr, "a", "onclick"))
306
307	// Test onerror is stripped (already tested in XSS, but explicit here)
308	md = []byte(`<img src="x" onerror="alert('xss')">`)
309	html, err = renderMarkdown(md, ctx)
310	is.NoErr(err)
311	htmlStr = string(html)
312	is.True(elementAttrAbsent(htmlStr, "img", "onerror"))
313}
314
315func TestSanitizerImageAttributes(t *testing.T) {
316	is := is.New(t)
317
318	ctx := &ReadmeContext{
319		RepoName:   "test-repo",
320		CommitHash: "abc123",
321		ReadmePath: "README.md",
322	}
323
324	// Test that width and height attributes are preserved on images
325	md := []byte(`<img src="https://example.com/image.png" width="100" height="50" alt="test">`)
326	html, err := renderMarkdown(md, ctx)
327	is.NoErr(err)
328	htmlStr := string(html)
329	is.True(elementAttrEquals(htmlStr, "img", "width", "100"))
330	is.True(elementAttrEquals(htmlStr, "img", "height", "50"))
331	is.True(elementAttrEquals(htmlStr, "img", "alt", "test"))
332}
333
334func TestSanitizerHeadingIDs(t *testing.T) {
335	is := is.New(t)
336
337	ctx := &ReadmeContext{
338		RepoName:   "test-repo",
339		CommitHash: "abc123",
340		ReadmePath: "README.md",
341	}
342
343	// Test that heading IDs are preserved (generated by AutoHeadingID)
344	md := []byte(`# Installation
345
346Jump to [installation](#installation).`)
347	html, err := renderMarkdown(md, ctx)
348	is.NoErr(err)
349	htmlStr := string(html)
350	// Check that the heading has an id attribute set to "installation"
351	is.True(elementAttrEquals(htmlStr, "h1", "id", "installation"))
352	// Check that the anchor link is preserved
353	is.True(elementAttrContains(htmlStr, "a", "href", "#installation"))
354}
355
356// HTML Testing Helpers
357// These utilities help test HTML output by parsing the DOM instead of string matching.
358
359// findElement searches the HTML tree for an element matching the given tag name.
360// Returns the first matching element node, or nil if not found.
361func findElement(n *html.Node, tag string) *html.Node {
362	if n.Type == html.ElementNode && n.Data == tag {
363		return n
364	}
365	for c := n.FirstChild; c != nil; c = c.NextSibling {
366		if found := findElement(c, tag); found != nil {
367			return found
368		}
369	}
370	return nil
371}
372
373// findElementWithAttr searches for an element with a specific attribute value.
374// Returns the first matching element, or nil if not found.
375func findElementWithAttr(n *html.Node, tag, attrKey, attrValue string) *html.Node {
376	if n.Type == html.ElementNode && n.Data == tag {
377		if getAttr(n, attrKey) == attrValue {
378			return n
379		}
380	}
381	for c := n.FirstChild; c != nil; c = c.NextSibling {
382		if found := findElementWithAttr(c, tag, attrKey, attrValue); found != nil {
383			return found
384		}
385	}
386	return nil
387}
388
389// getAttr returns the value of an attribute, or empty string if not present.
390func getAttr(n *html.Node, key string) string {
391	for _, attr := range n.Attr {
392		if attr.Key == key {
393			return attr.Val
394		}
395	}
396	return ""
397}
398
399// hasAttr returns true if the element has the specified attribute (regardless of value).
400func hasAttr(n *html.Node, key string) bool {
401	for _, attr := range n.Attr {
402		if attr.Key == key {
403			return true
404		}
405	}
406	return false
407}
408
409// attrContains checks if an attribute value contains a substring.
410func attrContains(n *html.Node, key, substr string) bool {
411	val := getAttr(n, key)
412	return strings.Contains(val, substr)
413}
414
415// parseHTML parses an HTML string and returns the root node.
416func parseHTML(htmlStr string) (*html.Node, error) {
417	return html.Parse(strings.NewReader(htmlStr))
418}
419
420// hasElementWithTag returns true if the HTML contains an element with the given tag.
421func hasElementWithTag(htmlStr, tag string) bool {
422	doc, err := parseHTML(htmlStr)
423	if err != nil {
424		return false
425	}
426	return findElement(doc, tag) != nil
427}
428
429// getElementAttr finds an element by tag and returns the value of the specified attribute.
430// Returns empty string if element or attribute not found.
431func getElementAttr(htmlStr, tag, attrKey string) string {
432	doc, err := parseHTML(htmlStr)
433	if err != nil {
434		return ""
435	}
436	elem := findElement(doc, tag)
437	if elem == nil {
438		return ""
439	}
440	return getAttr(elem, attrKey)
441}
442
443// elementHasAttr checks if an element has a specific attribute key (regardless of value).
444func elementHasAttr(htmlStr, tag, attrKey string) bool {
445	doc, err := parseHTML(htmlStr)
446	if err != nil {
447		return false
448	}
449	elem := findElement(doc, tag)
450	if elem == nil {
451		return false
452	}
453	return hasAttr(elem, attrKey)
454}
455
456// elementAttrEquals checks if an element's attribute equals a specific value.
457func elementAttrEquals(htmlStr, tag, attrKey, expected string) bool {
458	return getElementAttr(htmlStr, tag, attrKey) == expected
459}
460
461// elementAttrContains checks if an element's attribute contains a substring.
462func elementAttrContains(htmlStr, tag, attrKey, substr string) bool {
463	doc, err := parseHTML(htmlStr)
464	if err != nil {
465		return false
466	}
467	elem := findElement(doc, tag)
468	if elem == nil {
469		return false
470	}
471	return attrContains(elem, attrKey, substr)
472}
473
474// elementAttrAbsent checks that an element does NOT have a specific attribute.
475func elementAttrAbsent(htmlStr, tag, attrKey string) bool {
476	return !elementHasAttr(htmlStr, tag, attrKey)
477}
478
479// elementAttrEmpty checks if an element's attribute is present but empty.
480func elementAttrEmpty(htmlStr, tag, attrKey string) bool {
481	doc, err := parseHTML(htmlStr)
482	if err != nil {
483		return false
484	}
485	elem := findElement(doc, tag)
486	if elem == nil {
487		return false
488	}
489	if !hasAttr(elem, attrKey) {
490		return false
491	}
492	return getAttr(elem, attrKey) == ""
493}