sanitizer_test.go

  1package web
  2
  3import (
  4	"strings"
  5	"testing"
  6
  7	"github.com/matryer/is"
  8	"golang.org/x/net/html"
  9)
 10
 11func TestSanitizerXSSProtection(t *testing.T) {
 12	is := is.New(t)
 13
 14	ctx := &ReadmeContext{
 15		RepoName:   "test-repo",
 16		CommitHash: "abc123",
 17		ReadmePath: "README.md",
 18	}
 19
 20	// Test javascript: URL in link
 21	md := []byte(`[click me](javascript:alert('xss'))`)
 22	html, err := renderMarkdown(md, ctx)
 23	is.NoErr(err)
 24	is.True(!strings.Contains(string(html), "javascript:"))
 25
 26	// Test javascript: URL in image
 27	md = []byte(`![img](javascript:alert('xss'))`)
 28	html, err = renderMarkdown(md, ctx)
 29	is.NoErr(err)
 30	is.True(!strings.Contains(string(html), "javascript:"))
 31
 32	// Test data: URI in image
 33	md = []byte(`![img]()`)
 34	html, err = renderMarkdown(md, ctx)
 35	is.NoErr(err)
 36	is.True(!strings.Contains(string(html), "data:image"))
 37
 38	// Test data: URI in link
 39	md = []byte(`[link](data:text/html,<script>alert('xss')</script>)`)
 40	html, err = renderMarkdown(md, ctx)
 41	is.NoErr(err)
 42	is.True(!strings.Contains(string(html), "data:text"))
 43
 44	// Test onerror handler
 45	md = []byte(`<img src="x" onerror="alert('xss')">`)
 46	html, err = renderMarkdown(md, ctx)
 47	is.NoErr(err)
 48	is.True(!strings.Contains(string(html), "onerror"))
 49
 50	// Test onclick handler
 51	md = []byte(`<a href="#" onclick="alert('xss')">click</a>`)
 52	html, err = renderMarkdown(md, ctx)
 53	is.NoErr(err)
 54	is.True(!strings.Contains(string(html), "onclick"))
 55
 56	// Test style attribute
 57	md = []byte(`<p style="background:url(javascript:alert('xss'))">test</p>`)
 58	html, err = renderMarkdown(md, ctx)
 59	is.NoErr(err)
 60	is.True(!strings.Contains(string(html), "style="))
 61	is.True(!strings.Contains(string(html), "javascript"))
 62
 63	// Test iframe injection
 64	md = []byte(`<iframe src="https://evil.com"></iframe>`)
 65	html, err = renderMarkdown(md, ctx)
 66	is.NoErr(err)
 67	is.True(!strings.Contains(string(html), "<iframe"))
 68
 69	// Test script tag
 70	md = []byte(`<script>alert('xss')</script>`)
 71	html, err = renderMarkdown(md, ctx)
 72	is.NoErr(err)
 73	is.True(!strings.Contains(string(html), "<script"))
 74
 75	// Test object/embed tags
 76	md = []byte(`<object data="https://evil.com"></object>`)
 77	html, err = renderMarkdown(md, ctx)
 78	is.NoErr(err)
 79	is.True(!strings.Contains(string(html), "<object"))
 80
 81	md = []byte(`<embed src="https://evil.com">`)
 82	html, err = renderMarkdown(md, ctx)
 83	is.NoErr(err)
 84	is.True(!strings.Contains(string(html), "<embed"))
 85}
 86
 87func TestSanitizerAllowedTags(t *testing.T) {
 88	is := is.New(t)
 89
 90	ctx := &ReadmeContext{
 91		RepoName:   "test-repo",
 92		CommitHash: "abc123",
 93		ReadmePath: "README.md",
 94	}
 95
 96	// Test allowed basic formatting
 97	md := []byte(`**bold** *italic* ~~strikethrough~~`)
 98	html, err := renderMarkdown(md, ctx)
 99	is.NoErr(err)
100	is.True(strings.Contains(string(html), "<strong>"))
101	is.True(strings.Contains(string(html), "<em>"))
102	is.True(strings.Contains(string(html), "<del>"))
103
104	// Test allowed headings
105	md = []byte(`# H1
106## H2
107### H3`)
108	html, err = renderMarkdown(md, ctx)
109	is.NoErr(err)
110	is.True(strings.Contains(string(html), "<h1"))
111	is.True(strings.Contains(string(html), "<h2"))
112	is.True(strings.Contains(string(html), "<h3"))
113
114	// Test allowed lists
115	md = []byte(`- item 1
116- item 2
117
1181. numbered
1192. list`)
120	html, err = renderMarkdown(md, ctx)
121	is.NoErr(err)
122	is.True(strings.Contains(string(html), "<ul"))
123	is.True(strings.Contains(string(html), "<ol"))
124	is.True(strings.Contains(string(html), "<li"))
125
126	// Test allowed code blocks
127	md = []byte("```go\nfunc main() {}\n```")
128	html, err = renderMarkdown(md, ctx)
129	is.NoErr(err)
130	is.True(strings.Contains(string(html), "<pre"))
131	is.True(strings.Contains(string(html), "<code"))
132
133	// Test allowed tables
134	md = []byte(`| Col1 | Col2 |
135|------|------|
136| A    | B    |`)
137	html, err = renderMarkdown(md, ctx)
138	is.NoErr(err)
139	is.True(strings.Contains(string(html), "<table"))
140	is.True(strings.Contains(string(html), "<thead"))
141	is.True(strings.Contains(string(html), "<tbody"))
142	is.True(strings.Contains(string(html), "<tr"))
143	is.True(strings.Contains(string(html), "<th"))
144	is.True(strings.Contains(string(html), "<td"))
145
146	// Test allowed blockquote
147	md = []byte(`> quote`)
148	html, err = renderMarkdown(md, ctx)
149	is.NoErr(err)
150	is.True(strings.Contains(string(html), "<blockquote"))
151}
152
153func TestSanitizerHTTPSOnly(t *testing.T) {
154	is := is.New(t)
155
156	ctx := &ReadmeContext{
157		RepoName:   "test-repo",
158		CommitHash: "abc123",
159		ReadmePath: "README.md",
160	}
161
162	// Test http:// link is stripped
163	md := []byte(`[insecure](http://example.com)`)
164	html, err := renderMarkdown(md, ctx)
165	is.NoErr(err)
166	htmlStr := string(html)
167	// Either no anchor tag or the anchor has no href attribute
168	is.True(!hasElementWithTag(htmlStr, "a") || elementAttrAbsent(htmlStr, "a", "href"))
169
170	// Test https:// link is allowed
171	md = []byte(`[secure](https://example.com)`)
172	html, err = renderMarkdown(md, ctx)
173	is.NoErr(err)
174	htmlStr = string(html)
175	is.True(elementHasAttr(htmlStr, "a", "href"))
176	is.True(elementAttrContains(htmlStr, "a", "href", "https://example.com"))
177
178	// Test http:// image is stripped
179	md = []byte(`![img](http://example.com/image.png)`)
180	html, err = renderMarkdown(md, ctx)
181	is.NoErr(err)
182	htmlStr = string(html)
183	is.True(!hasElementWithTag(htmlStr, "img") || elementAttrAbsent(htmlStr, "img", "src"))
184
185	// Test https:// image is allowed
186	md = []byte(`![img](https://example.com/image.png)`)
187	html, err = renderMarkdown(md, ctx)
188	is.NoErr(err)
189	htmlStr = string(html)
190	is.True(elementHasAttr(htmlStr, "img", "src"))
191	is.True(elementAttrContains(htmlStr, "img", "src", "https://example.com/image.png"))
192}
193
194func TestSanitizerNoFollowNoReferrer(t *testing.T) {
195	is := is.New(t)
196
197	ctx := &ReadmeContext{
198		RepoName:   "test-repo",
199		CommitHash: "abc123",
200		ReadmePath: "README.md",
201	}
202
203	// Test that external links get rel="nofollow noreferrer"
204	md := []byte(`[external](https://example.com)`)
205	html, err := renderMarkdown(md, ctx)
206	is.NoErr(err)
207	htmlStr := string(html)
208	is.True(elementHasAttr(htmlStr, "a", "rel"))
209	relAttr := getElementAttr(htmlStr, "a", "rel")
210	is.True(strings.Contains(relAttr, "nofollow"))
211	is.True(strings.Contains(relAttr, "noreferrer"))
212
213	// Test that relative/internal links also get rel="nofollow noreferrer"
214	md = []byte(`[internal](docs/README.md)`)
215	html, err = renderMarkdown(md, ctx)
216	is.NoErr(err)
217	htmlStr = string(html)
218	is.True(elementHasAttr(htmlStr, "a", "rel"))
219	relAttr = getElementAttr(htmlStr, "a", "rel")
220	is.True(strings.Contains(relAttr, "nofollow"))
221	is.True(strings.Contains(relAttr, "noreferrer"))
222}
223
224func TestSanitizerAdditionalSchemes(t *testing.T) {
225	is := is.New(t)
226
227	ctx := &ReadmeContext{
228		RepoName:   "test-repo",
229		CommitHash: "abc123",
230		ReadmePath: "README.md",
231	}
232
233	// Test protocol-relative URLs are stripped
234	md := []byte(`[protocol-relative](//evil.com/script.js)`)
235	html, err := renderMarkdown(md, ctx)
236	is.NoErr(err)
237	htmlStr := string(html)
238	// Rewriter blocks protocol-relative URLs with empty href, then sanitizer strips empty href links
239	is.True(!hasElementWithTag(htmlStr, "a"))
240
241	// Test mailto: scheme is stripped
242	md = []byte(`[email](mailto:user@example.com)`)
243	html, err = renderMarkdown(md, ctx)
244	is.NoErr(err)
245	htmlStr = string(html)
246	// mailto: is not in allowed schemes, so sanitizer strips the entire link
247	is.True(!hasElementWithTag(htmlStr, "a"))
248
249	// Test ftp: scheme is stripped
250	md = []byte(`[ftp](ftp://ftp.example.com/file.txt)`)
251	html, err = renderMarkdown(md, ctx)
252	is.NoErr(err)
253	htmlStr = string(html)
254	// ftp: is not in allowed schemes, so sanitizer strips the entire link
255	is.True(!hasElementWithTag(htmlStr, "a"))
256}
257
258func TestSanitizerDisallowedAttributes(t *testing.T) {
259	is := is.New(t)
260
261	ctx := &ReadmeContext{
262		RepoName:   "test-repo",
263		CommitHash: "abc123",
264		ReadmePath: "README.md",
265	}
266
267	// Test target attribute is stripped from links
268	md := []byte(`<a href="https://example.com" target="_blank">link</a>`)
269	html, err := renderMarkdown(md, ctx)
270	is.NoErr(err)
271	htmlStr := string(html)
272	is.True(elementAttrAbsent(htmlStr, "a", "target"))
273
274	// Test class attribute is stripped
275	md = []byte(`<a href="https://example.com" class="dangerous">link</a>`)
276	html, err = renderMarkdown(md, ctx)
277	is.NoErr(err)
278	htmlStr = string(html)
279	is.True(elementAttrAbsent(htmlStr, "a", "class"))
280
281	// Test style attribute is stripped (already tested in XSS, but explicit here)
282	md = []byte(`<a href="https://example.com" style="color: red;">link</a>`)
283	html, err = renderMarkdown(md, ctx)
284	is.NoErr(err)
285	htmlStr = string(html)
286	is.True(elementAttrAbsent(htmlStr, "a", "style"))
287
288	// Test onclick is stripped (already tested in XSS, but explicit here)
289	md = []byte(`<a href="#" onclick="alert('xss')">click</a>`)
290	html, err = renderMarkdown(md, ctx)
291	is.NoErr(err)
292	htmlStr = string(html)
293	is.True(elementAttrAbsent(htmlStr, "a", "onclick"))
294
295	// Test onerror is stripped (already tested in XSS, but explicit here)
296	md = []byte(`<img src="x" onerror="alert('xss')">`)
297	html, err = renderMarkdown(md, ctx)
298	is.NoErr(err)
299	htmlStr = string(html)
300	is.True(elementAttrAbsent(htmlStr, "img", "onerror"))
301}
302
303func TestSanitizerImageAttributes(t *testing.T) {
304	is := is.New(t)
305
306	ctx := &ReadmeContext{
307		RepoName:   "test-repo",
308		CommitHash: "abc123",
309		ReadmePath: "README.md",
310	}
311
312	// Test that width and height attributes are preserved on images
313	md := []byte(`<img src="https://example.com/image.png" width="100" height="50" alt="test">`)
314	html, err := renderMarkdown(md, ctx)
315	is.NoErr(err)
316	htmlStr := string(html)
317	is.True(elementAttrEquals(htmlStr, "img", "width", "100"))
318	is.True(elementAttrEquals(htmlStr, "img", "height", "50"))
319	is.True(elementAttrEquals(htmlStr, "img", "alt", "test"))
320}
321
322func TestSanitizerHeadingIDs(t *testing.T) {
323	is := is.New(t)
324
325	ctx := &ReadmeContext{
326		RepoName:   "test-repo",
327		CommitHash: "abc123",
328		ReadmePath: "README.md",
329	}
330
331	// Test that heading IDs are preserved (generated by AutoHeadingID)
332	md := []byte(`# Installation
333
334Jump to [installation](#installation).`)
335	html, err := renderMarkdown(md, ctx)
336	is.NoErr(err)
337	htmlStr := string(html)
338	// Check that the heading has an id attribute set to "installation"
339	is.True(elementAttrEquals(htmlStr, "h1", "id", "installation"))
340	// Check that the anchor link is preserved
341	is.True(elementAttrContains(htmlStr, "a", "href", "#installation"))
342}
343
344// HTML Testing Helpers
345// These utilities help test HTML output by parsing the DOM instead of string matching.
346
347// findElement searches the HTML tree for an element matching the given tag name.
348// Returns the first matching element node, or nil if not found.
349func findElement(n *html.Node, tag string) *html.Node {
350	if n.Type == html.ElementNode && n.Data == tag {
351		return n
352	}
353	for c := n.FirstChild; c != nil; c = c.NextSibling {
354		if found := findElement(c, tag); found != nil {
355			return found
356		}
357	}
358	return nil
359}
360
361// findElementWithAttr searches for an element with a specific attribute value.
362// Returns the first matching element, or nil if not found.
363func findElementWithAttr(n *html.Node, tag, attrKey, attrValue string) *html.Node {
364	if n.Type == html.ElementNode && n.Data == tag {
365		if getAttr(n, attrKey) == attrValue {
366			return n
367		}
368	}
369	for c := n.FirstChild; c != nil; c = c.NextSibling {
370		if found := findElementWithAttr(c, tag, attrKey, attrValue); found != nil {
371			return found
372		}
373	}
374	return nil
375}
376
377// getAttr returns the value of an attribute, or empty string if not present.
378func getAttr(n *html.Node, key string) string {
379	for _, attr := range n.Attr {
380		if attr.Key == key {
381			return attr.Val
382		}
383	}
384	return ""
385}
386
387// hasAttr returns true if the element has the specified attribute (regardless of value).
388func hasAttr(n *html.Node, key string) bool {
389	for _, attr := range n.Attr {
390		if attr.Key == key {
391			return true
392		}
393	}
394	return false
395}
396
397// attrContains checks if an attribute value contains a substring.
398func attrContains(n *html.Node, key, substr string) bool {
399	val := getAttr(n, key)
400	return strings.Contains(val, substr)
401}
402
403// parseHTML parses an HTML string and returns the root node.
404func parseHTML(htmlStr string) (*html.Node, error) {
405	return html.Parse(strings.NewReader(htmlStr))
406}
407
408// hasElementWithTag returns true if the HTML contains an element with the given tag.
409func hasElementWithTag(htmlStr, tag string) bool {
410	doc, err := parseHTML(htmlStr)
411	if err != nil {
412		return false
413	}
414	return findElement(doc, tag) != nil
415}
416
417// getElementAttr finds an element by tag and returns the value of the specified attribute.
418// Returns empty string if element or attribute not found.
419func getElementAttr(htmlStr, tag, attrKey string) string {
420	doc, err := parseHTML(htmlStr)
421	if err != nil {
422		return ""
423	}
424	elem := findElement(doc, tag)
425	if elem == nil {
426		return ""
427	}
428	return getAttr(elem, attrKey)
429}
430
431// elementHasAttr checks if an element has a specific attribute key (regardless of value).
432func elementHasAttr(htmlStr, tag, attrKey string) bool {
433	doc, err := parseHTML(htmlStr)
434	if err != nil {
435		return false
436	}
437	elem := findElement(doc, tag)
438	if elem == nil {
439		return false
440	}
441	return hasAttr(elem, attrKey)
442}
443
444// elementAttrEquals checks if an element's attribute equals a specific value.
445func elementAttrEquals(htmlStr, tag, attrKey, expected string) bool {
446	return getElementAttr(htmlStr, tag, attrKey) == expected
447}
448
449// elementAttrContains checks if an element's attribute contains a substring.
450func elementAttrContains(htmlStr, tag, attrKey, substr string) bool {
451	doc, err := parseHTML(htmlStr)
452	if err != nil {
453		return false
454	}
455	elem := findElement(doc, tag)
456	if elem == nil {
457		return false
458	}
459	return attrContains(elem, attrKey, substr)
460}
461
462// elementAttrAbsent checks that an element does NOT have a specific attribute.
463func elementAttrAbsent(htmlStr, tag, attrKey string) bool {
464	return !elementHasAttr(htmlStr, tag, attrKey)
465}
466
467// elementAttrEmpty checks if an element's attribute is present but empty.
468func elementAttrEmpty(htmlStr, tag, attrKey string) bool {
469	doc, err := parseHTML(htmlStr)
470	if err != nil {
471		return false
472	}
473	elem := findElement(doc, tag)
474	if elem == nil {
475		return false
476	}
477	if !hasAttr(elem, attrKey) {
478		return false
479	}
480	return getAttr(elem, attrKey) == ""
481}