1package web
2
3import (
4 "strings"
5 "testing"
6
7 "github.com/matryer/is"
8 "golang.org/x/net/html"
9)
10
11func TestSanitizerXSSProtection(t *testing.T) {
12 is := is.New(t)
13
14 ctx := &ReadmeContext{
15 RepoName: "test-repo",
16 CommitHash: "abc123",
17 ReadmePath: "README.md",
18 }
19
20 // Test javascript: URL in link
21 md := []byte(`[click me](javascript:alert('xss'))`)
22 html, err := renderMarkdown(md, ctx)
23 is.NoErr(err)
24 is.True(!strings.Contains(string(html), "javascript:"))
25
26 // Test javascript: URL in image
27 md = []byte(`)`)
28 html, err = renderMarkdown(md, ctx)
29 is.NoErr(err)
30 is.True(!strings.Contains(string(html), "javascript:"))
31
32 // Test data: URI in image
33 md = []byte(``)
34 html, err = renderMarkdown(md, ctx)
35 is.NoErr(err)
36 is.True(!strings.Contains(string(html), "data:image"))
37
38 // Test data: URI in link
39 md = []byte(`[link](data:text/html,<script>alert('xss')</script>)`)
40 html, err = renderMarkdown(md, ctx)
41 is.NoErr(err)
42 is.True(!strings.Contains(string(html), "data:text"))
43
44 // Test onerror handler
45 md = []byte(`<img src="x" onerror="alert('xss')">`)
46 html, err = renderMarkdown(md, ctx)
47 is.NoErr(err)
48 is.True(!strings.Contains(string(html), "onerror"))
49
50 // Test onclick handler
51 md = []byte(`<a href="#" onclick="alert('xss')">click</a>`)
52 html, err = renderMarkdown(md, ctx)
53 is.NoErr(err)
54 is.True(!strings.Contains(string(html), "onclick"))
55
56 // Test style attribute
57 md = []byte(`<p style="background:url(javascript:alert('xss'))">test</p>`)
58 html, err = renderMarkdown(md, ctx)
59 is.NoErr(err)
60 is.True(!strings.Contains(string(html), "style="))
61 is.True(!strings.Contains(string(html), "javascript"))
62
63 // Test iframe injection
64 md = []byte(`<iframe src="https://evil.com"></iframe>`)
65 html, err = renderMarkdown(md, ctx)
66 is.NoErr(err)
67 is.True(!strings.Contains(string(html), "<iframe"))
68
69 // Test script tag
70 md = []byte(`<script>alert('xss')</script>`)
71 html, err = renderMarkdown(md, ctx)
72 is.NoErr(err)
73 is.True(!strings.Contains(string(html), "<script"))
74
75 // Test object/embed tags
76 md = []byte(`<object data="https://evil.com"></object>`)
77 html, err = renderMarkdown(md, ctx)
78 is.NoErr(err)
79 is.True(!strings.Contains(string(html), "<object"))
80
81 md = []byte(`<embed src="https://evil.com">`)
82 html, err = renderMarkdown(md, ctx)
83 is.NoErr(err)
84 is.True(!strings.Contains(string(html), "<embed"))
85}
86
87func TestSanitizerAllowedTags(t *testing.T) {
88 is := is.New(t)
89
90 ctx := &ReadmeContext{
91 RepoName: "test-repo",
92 CommitHash: "abc123",
93 ReadmePath: "README.md",
94 }
95
96 // Test allowed basic formatting
97 md := []byte(`**bold** *italic* ~~strikethrough~~`)
98 html, err := renderMarkdown(md, ctx)
99 is.NoErr(err)
100 is.True(strings.Contains(string(html), "<strong>"))
101 is.True(strings.Contains(string(html), "<em>"))
102 is.True(strings.Contains(string(html), "<del>"))
103
104 // Test allowed headings
105 md = []byte(`# H1
106## H2
107### H3`)
108 html, err = renderMarkdown(md, ctx)
109 is.NoErr(err)
110 is.True(strings.Contains(string(html), "<h1"))
111 is.True(strings.Contains(string(html), "<h2"))
112 is.True(strings.Contains(string(html), "<h3"))
113
114 // Test allowed lists
115 md = []byte(`- item 1
116- item 2
117
1181. numbered
1192. list`)
120 html, err = renderMarkdown(md, ctx)
121 is.NoErr(err)
122 is.True(strings.Contains(string(html), "<ul"))
123 is.True(strings.Contains(string(html), "<ol"))
124 is.True(strings.Contains(string(html), "<li"))
125
126 // Test allowed code blocks
127 md = []byte("```go\nfunc main() {}\n```")
128 html, err = renderMarkdown(md, ctx)
129 is.NoErr(err)
130 is.True(strings.Contains(string(html), "<pre"))
131 is.True(strings.Contains(string(html), "<code"))
132
133 // Test allowed tables
134 md = []byte(`| Col1 | Col2 |
135|------|------|
136| A | B |`)
137 html, err = renderMarkdown(md, ctx)
138 is.NoErr(err)
139 is.True(strings.Contains(string(html), "<table"))
140 is.True(strings.Contains(string(html), "<thead"))
141 is.True(strings.Contains(string(html), "<tbody"))
142 is.True(strings.Contains(string(html), "<tr"))
143 is.True(strings.Contains(string(html), "<th"))
144 is.True(strings.Contains(string(html), "<td"))
145
146 // Test allowed blockquote
147 md = []byte(`> quote`)
148 html, err = renderMarkdown(md, ctx)
149 is.NoErr(err)
150 is.True(strings.Contains(string(html), "<blockquote"))
151
152 // Test allowed details/summary (collapsible sections)
153 md = []byte(`<details>
154<summary>Click to expand</summary>
155
156Hidden content here
157
158</details>`)
159 html, err = renderMarkdown(md, ctx)
160 is.NoErr(err)
161 is.True(strings.Contains(string(html), "<details"))
162 is.True(strings.Contains(string(html), "<summary"))
163}
164
165func TestSanitizerHTTPSOnly(t *testing.T) {
166 is := is.New(t)
167
168 ctx := &ReadmeContext{
169 RepoName: "test-repo",
170 CommitHash: "abc123",
171 ReadmePath: "README.md",
172 }
173
174 // Test http:// link is stripped
175 md := []byte(`[insecure](http://example.com)`)
176 html, err := renderMarkdown(md, ctx)
177 is.NoErr(err)
178 htmlStr := string(html)
179 // Either no anchor tag or the anchor has no href attribute
180 is.True(!hasElementWithTag(htmlStr, "a") || elementAttrAbsent(htmlStr, "a", "href"))
181
182 // Test https:// link is allowed
183 md = []byte(`[secure](https://example.com)`)
184 html, err = renderMarkdown(md, ctx)
185 is.NoErr(err)
186 htmlStr = string(html)
187 is.True(elementHasAttr(htmlStr, "a", "href"))
188 is.True(elementAttrContains(htmlStr, "a", "href", "https://example.com"))
189
190 // Test http:// image is stripped
191 md = []byte(``)
192 html, err = renderMarkdown(md, ctx)
193 is.NoErr(err)
194 htmlStr = string(html)
195 is.True(!hasElementWithTag(htmlStr, "img") || elementAttrAbsent(htmlStr, "img", "src"))
196
197 // Test https:// image is allowed
198 md = []byte(``)
199 html, err = renderMarkdown(md, ctx)
200 is.NoErr(err)
201 htmlStr = string(html)
202 is.True(elementHasAttr(htmlStr, "img", "src"))
203 is.True(elementAttrContains(htmlStr, "img", "src", "https://example.com/image.png"))
204}
205
206func TestSanitizerNoFollowNoReferrer(t *testing.T) {
207 is := is.New(t)
208
209 ctx := &ReadmeContext{
210 RepoName: "test-repo",
211 CommitHash: "abc123",
212 ReadmePath: "README.md",
213 }
214
215 // Test that external links get rel="nofollow noreferrer"
216 md := []byte(`[external](https://example.com)`)
217 html, err := renderMarkdown(md, ctx)
218 is.NoErr(err)
219 htmlStr := string(html)
220 is.True(elementHasAttr(htmlStr, "a", "rel"))
221 relAttr := getElementAttr(htmlStr, "a", "rel")
222 is.True(strings.Contains(relAttr, "nofollow"))
223 is.True(strings.Contains(relAttr, "noreferrer"))
224
225 // Test that relative/internal links also get rel="nofollow noreferrer"
226 md = []byte(`[internal](docs/README.md)`)
227 html, err = renderMarkdown(md, ctx)
228 is.NoErr(err)
229 htmlStr = string(html)
230 is.True(elementHasAttr(htmlStr, "a", "rel"))
231 relAttr = getElementAttr(htmlStr, "a", "rel")
232 is.True(strings.Contains(relAttr, "nofollow"))
233 is.True(strings.Contains(relAttr, "noreferrer"))
234}
235
236func TestSanitizerAdditionalSchemes(t *testing.T) {
237 is := is.New(t)
238
239 ctx := &ReadmeContext{
240 RepoName: "test-repo",
241 CommitHash: "abc123",
242 ReadmePath: "README.md",
243 }
244
245 // Test protocol-relative URLs are stripped
246 md := []byte(`[protocol-relative](//evil.com/script.js)`)
247 html, err := renderMarkdown(md, ctx)
248 is.NoErr(err)
249 htmlStr := string(html)
250 // Rewriter blocks protocol-relative URLs with empty href, then sanitizer strips empty href links
251 is.True(!hasElementWithTag(htmlStr, "a"))
252
253 // Test mailto: scheme is stripped
254 md = []byte(`[email](mailto:user@example.com)`)
255 html, err = renderMarkdown(md, ctx)
256 is.NoErr(err)
257 htmlStr = string(html)
258 // mailto: is not in allowed schemes, so sanitizer strips the entire link
259 is.True(!hasElementWithTag(htmlStr, "a"))
260
261 // Test ftp: scheme is stripped
262 md = []byte(`[ftp](ftp://ftp.example.com/file.txt)`)
263 html, err = renderMarkdown(md, ctx)
264 is.NoErr(err)
265 htmlStr = string(html)
266 // ftp: is not in allowed schemes, so sanitizer strips the entire link
267 is.True(!hasElementWithTag(htmlStr, "a"))
268}
269
270func TestSanitizerDisallowedAttributes(t *testing.T) {
271 is := is.New(t)
272
273 ctx := &ReadmeContext{
274 RepoName: "test-repo",
275 CommitHash: "abc123",
276 ReadmePath: "README.md",
277 }
278
279 // Test target attribute is stripped from links
280 md := []byte(`<a href="https://example.com" target="_blank">link</a>`)
281 html, err := renderMarkdown(md, ctx)
282 is.NoErr(err)
283 htmlStr := string(html)
284 is.True(elementAttrAbsent(htmlStr, "a", "target"))
285
286 // Test class attribute is stripped
287 md = []byte(`<a href="https://example.com" class="dangerous">link</a>`)
288 html, err = renderMarkdown(md, ctx)
289 is.NoErr(err)
290 htmlStr = string(html)
291 is.True(elementAttrAbsent(htmlStr, "a", "class"))
292
293 // Test style attribute is stripped (already tested in XSS, but explicit here)
294 md = []byte(`<a href="https://example.com" style="color: red;">link</a>`)
295 html, err = renderMarkdown(md, ctx)
296 is.NoErr(err)
297 htmlStr = string(html)
298 is.True(elementAttrAbsent(htmlStr, "a", "style"))
299
300 // Test onclick is stripped (already tested in XSS, but explicit here)
301 md = []byte(`<a href="#" onclick="alert('xss')">click</a>`)
302 html, err = renderMarkdown(md, ctx)
303 is.NoErr(err)
304 htmlStr = string(html)
305 is.True(elementAttrAbsent(htmlStr, "a", "onclick"))
306
307 // Test onerror is stripped (already tested in XSS, but explicit here)
308 md = []byte(`<img src="x" onerror="alert('xss')">`)
309 html, err = renderMarkdown(md, ctx)
310 is.NoErr(err)
311 htmlStr = string(html)
312 is.True(elementAttrAbsent(htmlStr, "img", "onerror"))
313}
314
315func TestSanitizerImageAttributes(t *testing.T) {
316 is := is.New(t)
317
318 ctx := &ReadmeContext{
319 RepoName: "test-repo",
320 CommitHash: "abc123",
321 ReadmePath: "README.md",
322 }
323
324 // Test that width and height attributes are preserved on images
325 md := []byte(`<img src="https://example.com/image.png" width="100" height="50" alt="test">`)
326 html, err := renderMarkdown(md, ctx)
327 is.NoErr(err)
328 htmlStr := string(html)
329 is.True(elementAttrEquals(htmlStr, "img", "width", "100"))
330 is.True(elementAttrEquals(htmlStr, "img", "height", "50"))
331 is.True(elementAttrEquals(htmlStr, "img", "alt", "test"))
332}
333
334func TestSanitizerHeadingIDs(t *testing.T) {
335 is := is.New(t)
336
337 ctx := &ReadmeContext{
338 RepoName: "test-repo",
339 CommitHash: "abc123",
340 ReadmePath: "README.md",
341 }
342
343 // Test that heading IDs are preserved (generated by AutoHeadingID)
344 md := []byte(`# Installation
345
346Jump to [installation](#installation).`)
347 html, err := renderMarkdown(md, ctx)
348 is.NoErr(err)
349 htmlStr := string(html)
350 // Check that the heading has an id attribute set to "installation"
351 is.True(elementAttrEquals(htmlStr, "h1", "id", "installation"))
352 // Check that the anchor link is preserved
353 is.True(elementAttrContains(htmlStr, "a", "href", "#installation"))
354}
355
356// HTML Testing Helpers
357// These utilities help test HTML output by parsing the DOM instead of string matching.
358
359// findElement searches the HTML tree for an element matching the given tag name.
360// Returns the first matching element node, or nil if not found.
361func findElement(n *html.Node, tag string) *html.Node {
362 if n.Type == html.ElementNode && n.Data == tag {
363 return n
364 }
365 for c := n.FirstChild; c != nil; c = c.NextSibling {
366 if found := findElement(c, tag); found != nil {
367 return found
368 }
369 }
370 return nil
371}
372
373// findElementWithAttr searches for an element with a specific attribute value.
374// Returns the first matching element, or nil if not found.
375func findElementWithAttr(n *html.Node, tag, attrKey, attrValue string) *html.Node {
376 if n.Type == html.ElementNode && n.Data == tag {
377 if getAttr(n, attrKey) == attrValue {
378 return n
379 }
380 }
381 for c := n.FirstChild; c != nil; c = c.NextSibling {
382 if found := findElementWithAttr(c, tag, attrKey, attrValue); found != nil {
383 return found
384 }
385 }
386 return nil
387}
388
389// getAttr returns the value of an attribute, or empty string if not present.
390func getAttr(n *html.Node, key string) string {
391 for _, attr := range n.Attr {
392 if attr.Key == key {
393 return attr.Val
394 }
395 }
396 return ""
397}
398
399// hasAttr returns true if the element has the specified attribute (regardless of value).
400func hasAttr(n *html.Node, key string) bool {
401 for _, attr := range n.Attr {
402 if attr.Key == key {
403 return true
404 }
405 }
406 return false
407}
408
409// attrContains checks if an attribute value contains a substring.
410func attrContains(n *html.Node, key, substr string) bool {
411 val := getAttr(n, key)
412 return strings.Contains(val, substr)
413}
414
415// parseHTML parses an HTML string and returns the root node.
416func parseHTML(htmlStr string) (*html.Node, error) {
417 return html.Parse(strings.NewReader(htmlStr))
418}
419
420// hasElementWithTag returns true if the HTML contains an element with the given tag.
421func hasElementWithTag(htmlStr, tag string) bool {
422 doc, err := parseHTML(htmlStr)
423 if err != nil {
424 return false
425 }
426 return findElement(doc, tag) != nil
427}
428
429// getElementAttr finds an element by tag and returns the value of the specified attribute.
430// Returns empty string if element or attribute not found.
431func getElementAttr(htmlStr, tag, attrKey string) string {
432 doc, err := parseHTML(htmlStr)
433 if err != nil {
434 return ""
435 }
436 elem := findElement(doc, tag)
437 if elem == nil {
438 return ""
439 }
440 return getAttr(elem, attrKey)
441}
442
443// elementHasAttr checks if an element has a specific attribute key (regardless of value).
444func elementHasAttr(htmlStr, tag, attrKey string) bool {
445 doc, err := parseHTML(htmlStr)
446 if err != nil {
447 return false
448 }
449 elem := findElement(doc, tag)
450 if elem == nil {
451 return false
452 }
453 return hasAttr(elem, attrKey)
454}
455
456// elementAttrEquals checks if an element's attribute equals a specific value.
457func elementAttrEquals(htmlStr, tag, attrKey, expected string) bool {
458 return getElementAttr(htmlStr, tag, attrKey) == expected
459}
460
461// elementAttrContains checks if an element's attribute contains a substring.
462func elementAttrContains(htmlStr, tag, attrKey, substr string) bool {
463 doc, err := parseHTML(htmlStr)
464 if err != nil {
465 return false
466 }
467 elem := findElement(doc, tag)
468 if elem == nil {
469 return false
470 }
471 return attrContains(elem, attrKey, substr)
472}
473
474// elementAttrAbsent checks that an element does NOT have a specific attribute.
475func elementAttrAbsent(htmlStr, tag, attrKey string) bool {
476 return !elementHasAttr(htmlStr, tag, attrKey)
477}
478
479// elementAttrEmpty checks if an element's attribute is present but empty.
480func elementAttrEmpty(htmlStr, tag, attrKey string) bool {
481 doc, err := parseHTML(htmlStr)
482 if err != nil {
483 return false
484 }
485 elem := findElement(doc, tag)
486 if elem == nil {
487 return false
488 }
489 if !hasAttr(elem, attrKey) {
490 return false
491 }
492 return getAttr(elem, attrKey) == ""
493}