1package web
2
3import (
4 "strings"
5 "testing"
6
7 "github.com/matryer/is"
8 "golang.org/x/net/html"
9)
10
11func TestSanitizerXSSProtection(t *testing.T) {
12 is := is.New(t)
13
14 ctx := &ReadmeContext{
15 RepoName: "test-repo",
16 CommitHash: "abc123",
17 ReadmePath: "README.md",
18 }
19
20 // Test javascript: URL in link
21 md := []byte(`[click me](javascript:alert('xss'))`)
22 html, err := renderMarkdown(md, ctx)
23 is.NoErr(err)
24 is.True(!strings.Contains(string(html), "javascript:"))
25
26 // Test javascript: URL in image
27 md = []byte(`)`)
28 html, err = renderMarkdown(md, ctx)
29 is.NoErr(err)
30 is.True(!strings.Contains(string(html), "javascript:"))
31
32 // Test data: URI in image
33 md = []byte(``)
34 html, err = renderMarkdown(md, ctx)
35 is.NoErr(err)
36 is.True(!strings.Contains(string(html), "data:image"))
37
38 // Test data: URI in link
39 md = []byte(`[link](data:text/html,<script>alert('xss')</script>)`)
40 html, err = renderMarkdown(md, ctx)
41 is.NoErr(err)
42 is.True(!strings.Contains(string(html), "data:text"))
43
44 // Test onerror handler
45 md = []byte(`<img src="x" onerror="alert('xss')">`)
46 html, err = renderMarkdown(md, ctx)
47 is.NoErr(err)
48 is.True(!strings.Contains(string(html), "onerror"))
49
50 // Test onclick handler
51 md = []byte(`<a href="#" onclick="alert('xss')">click</a>`)
52 html, err = renderMarkdown(md, ctx)
53 is.NoErr(err)
54 is.True(!strings.Contains(string(html), "onclick"))
55
56 // Test style attribute
57 md = []byte(`<p style="background:url(javascript:alert('xss'))">test</p>`)
58 html, err = renderMarkdown(md, ctx)
59 is.NoErr(err)
60 is.True(!strings.Contains(string(html), "style="))
61 is.True(!strings.Contains(string(html), "javascript"))
62
63 // Test iframe injection
64 md = []byte(`<iframe src="https://evil.com"></iframe>`)
65 html, err = renderMarkdown(md, ctx)
66 is.NoErr(err)
67 is.True(!strings.Contains(string(html), "<iframe"))
68
69 // Test script tag
70 md = []byte(`<script>alert('xss')</script>`)
71 html, err = renderMarkdown(md, ctx)
72 is.NoErr(err)
73 is.True(!strings.Contains(string(html), "<script"))
74
75 // Test object/embed tags
76 md = []byte(`<object data="https://evil.com"></object>`)
77 html, err = renderMarkdown(md, ctx)
78 is.NoErr(err)
79 is.True(!strings.Contains(string(html), "<object"))
80
81 md = []byte(`<embed src="https://evil.com">`)
82 html, err = renderMarkdown(md, ctx)
83 is.NoErr(err)
84 is.True(!strings.Contains(string(html), "<embed"))
85}
86
87func TestSanitizerAllowedTags(t *testing.T) {
88 is := is.New(t)
89
90 ctx := &ReadmeContext{
91 RepoName: "test-repo",
92 CommitHash: "abc123",
93 ReadmePath: "README.md",
94 }
95
96 // Test allowed basic formatting
97 md := []byte(`**bold** *italic* ~~strikethrough~~`)
98 html, err := renderMarkdown(md, ctx)
99 is.NoErr(err)
100 is.True(strings.Contains(string(html), "<strong>"))
101 is.True(strings.Contains(string(html), "<em>"))
102 is.True(strings.Contains(string(html), "<del>"))
103
104 // Test allowed headings
105 md = []byte(`# H1
106## H2
107### H3`)
108 html, err = renderMarkdown(md, ctx)
109 is.NoErr(err)
110 is.True(strings.Contains(string(html), "<h1"))
111 is.True(strings.Contains(string(html), "<h2"))
112 is.True(strings.Contains(string(html), "<h3"))
113
114 // Test allowed lists
115 md = []byte(`- item 1
116- item 2
117
1181. numbered
1192. list`)
120 html, err = renderMarkdown(md, ctx)
121 is.NoErr(err)
122 is.True(strings.Contains(string(html), "<ul"))
123 is.True(strings.Contains(string(html), "<ol"))
124 is.True(strings.Contains(string(html), "<li"))
125
126 // Test allowed code blocks
127 md = []byte("```go\nfunc main() {}\n```")
128 html, err = renderMarkdown(md, ctx)
129 is.NoErr(err)
130 is.True(strings.Contains(string(html), "<pre"))
131 is.True(strings.Contains(string(html), "<code"))
132
133 // Test allowed tables
134 md = []byte(`| Col1 | Col2 |
135|------|------|
136| A | B |`)
137 html, err = renderMarkdown(md, ctx)
138 is.NoErr(err)
139 is.True(strings.Contains(string(html), "<table"))
140 is.True(strings.Contains(string(html), "<thead"))
141 is.True(strings.Contains(string(html), "<tbody"))
142 is.True(strings.Contains(string(html), "<tr"))
143 is.True(strings.Contains(string(html), "<th"))
144 is.True(strings.Contains(string(html), "<td"))
145
146 // Test allowed blockquote
147 md = []byte(`> quote`)
148 html, err = renderMarkdown(md, ctx)
149 is.NoErr(err)
150 is.True(strings.Contains(string(html), "<blockquote"))
151}
152
153func TestSanitizerHTTPSOnly(t *testing.T) {
154 is := is.New(t)
155
156 ctx := &ReadmeContext{
157 RepoName: "test-repo",
158 CommitHash: "abc123",
159 ReadmePath: "README.md",
160 }
161
162 // Test http:// link is stripped
163 md := []byte(`[insecure](http://example.com)`)
164 html, err := renderMarkdown(md, ctx)
165 is.NoErr(err)
166 htmlStr := string(html)
167 // Either no anchor tag or the anchor has no href attribute
168 is.True(!hasElementWithTag(htmlStr, "a") || elementAttrAbsent(htmlStr, "a", "href"))
169
170 // Test https:// link is allowed
171 md = []byte(`[secure](https://example.com)`)
172 html, err = renderMarkdown(md, ctx)
173 is.NoErr(err)
174 htmlStr = string(html)
175 is.True(elementHasAttr(htmlStr, "a", "href"))
176 is.True(elementAttrContains(htmlStr, "a", "href", "https://example.com"))
177
178 // Test http:// image is stripped
179 md = []byte(``)
180 html, err = renderMarkdown(md, ctx)
181 is.NoErr(err)
182 htmlStr = string(html)
183 is.True(!hasElementWithTag(htmlStr, "img") || elementAttrAbsent(htmlStr, "img", "src"))
184
185 // Test https:// image is allowed
186 md = []byte(``)
187 html, err = renderMarkdown(md, ctx)
188 is.NoErr(err)
189 htmlStr = string(html)
190 is.True(elementHasAttr(htmlStr, "img", "src"))
191 is.True(elementAttrContains(htmlStr, "img", "src", "https://example.com/image.png"))
192}
193
194func TestSanitizerNoFollowNoReferrer(t *testing.T) {
195 is := is.New(t)
196
197 ctx := &ReadmeContext{
198 RepoName: "test-repo",
199 CommitHash: "abc123",
200 ReadmePath: "README.md",
201 }
202
203 // Test that external links get rel="nofollow noreferrer"
204 md := []byte(`[external](https://example.com)`)
205 html, err := renderMarkdown(md, ctx)
206 is.NoErr(err)
207 htmlStr := string(html)
208 is.True(elementHasAttr(htmlStr, "a", "rel"))
209 relAttr := getElementAttr(htmlStr, "a", "rel")
210 is.True(strings.Contains(relAttr, "nofollow"))
211 is.True(strings.Contains(relAttr, "noreferrer"))
212
213 // Test that relative/internal links also get rel="nofollow noreferrer"
214 md = []byte(`[internal](docs/README.md)`)
215 html, err = renderMarkdown(md, ctx)
216 is.NoErr(err)
217 htmlStr = string(html)
218 is.True(elementHasAttr(htmlStr, "a", "rel"))
219 relAttr = getElementAttr(htmlStr, "a", "rel")
220 is.True(strings.Contains(relAttr, "nofollow"))
221 is.True(strings.Contains(relAttr, "noreferrer"))
222}
223
224func TestSanitizerAdditionalSchemes(t *testing.T) {
225 is := is.New(t)
226
227 ctx := &ReadmeContext{
228 RepoName: "test-repo",
229 CommitHash: "abc123",
230 ReadmePath: "README.md",
231 }
232
233 // Test protocol-relative URLs are stripped
234 md := []byte(`[protocol-relative](//evil.com/script.js)`)
235 html, err := renderMarkdown(md, ctx)
236 is.NoErr(err)
237 htmlStr := string(html)
238 // Rewriter blocks protocol-relative URLs with empty href, then sanitizer strips empty href links
239 is.True(!hasElementWithTag(htmlStr, "a"))
240
241 // Test mailto: scheme is stripped
242 md = []byte(`[email](mailto:user@example.com)`)
243 html, err = renderMarkdown(md, ctx)
244 is.NoErr(err)
245 htmlStr = string(html)
246 // mailto: is not in allowed schemes, so sanitizer strips the entire link
247 is.True(!hasElementWithTag(htmlStr, "a"))
248
249 // Test ftp: scheme is stripped
250 md = []byte(`[ftp](ftp://ftp.example.com/file.txt)`)
251 html, err = renderMarkdown(md, ctx)
252 is.NoErr(err)
253 htmlStr = string(html)
254 // ftp: is not in allowed schemes, so sanitizer strips the entire link
255 is.True(!hasElementWithTag(htmlStr, "a"))
256}
257
258func TestSanitizerDisallowedAttributes(t *testing.T) {
259 is := is.New(t)
260
261 ctx := &ReadmeContext{
262 RepoName: "test-repo",
263 CommitHash: "abc123",
264 ReadmePath: "README.md",
265 }
266
267 // Test target attribute is stripped from links
268 md := []byte(`<a href="https://example.com" target="_blank">link</a>`)
269 html, err := renderMarkdown(md, ctx)
270 is.NoErr(err)
271 htmlStr := string(html)
272 is.True(elementAttrAbsent(htmlStr, "a", "target"))
273
274 // Test class attribute is stripped
275 md = []byte(`<a href="https://example.com" class="dangerous">link</a>`)
276 html, err = renderMarkdown(md, ctx)
277 is.NoErr(err)
278 htmlStr = string(html)
279 is.True(elementAttrAbsent(htmlStr, "a", "class"))
280
281 // Test style attribute is stripped (already tested in XSS, but explicit here)
282 md = []byte(`<a href="https://example.com" style="color: red;">link</a>`)
283 html, err = renderMarkdown(md, ctx)
284 is.NoErr(err)
285 htmlStr = string(html)
286 is.True(elementAttrAbsent(htmlStr, "a", "style"))
287
288 // Test onclick is stripped (already tested in XSS, but explicit here)
289 md = []byte(`<a href="#" onclick="alert('xss')">click</a>`)
290 html, err = renderMarkdown(md, ctx)
291 is.NoErr(err)
292 htmlStr = string(html)
293 is.True(elementAttrAbsent(htmlStr, "a", "onclick"))
294
295 // Test onerror is stripped (already tested in XSS, but explicit here)
296 md = []byte(`<img src="x" onerror="alert('xss')">`)
297 html, err = renderMarkdown(md, ctx)
298 is.NoErr(err)
299 htmlStr = string(html)
300 is.True(elementAttrAbsent(htmlStr, "img", "onerror"))
301}
302
303func TestSanitizerImageAttributes(t *testing.T) {
304 is := is.New(t)
305
306 ctx := &ReadmeContext{
307 RepoName: "test-repo",
308 CommitHash: "abc123",
309 ReadmePath: "README.md",
310 }
311
312 // Test that width and height attributes are preserved on images
313 md := []byte(`<img src="https://example.com/image.png" width="100" height="50" alt="test">`)
314 html, err := renderMarkdown(md, ctx)
315 is.NoErr(err)
316 htmlStr := string(html)
317 is.True(elementAttrEquals(htmlStr, "img", "width", "100"))
318 is.True(elementAttrEquals(htmlStr, "img", "height", "50"))
319 is.True(elementAttrEquals(htmlStr, "img", "alt", "test"))
320}
321
322func TestSanitizerHeadingIDs(t *testing.T) {
323 is := is.New(t)
324
325 ctx := &ReadmeContext{
326 RepoName: "test-repo",
327 CommitHash: "abc123",
328 ReadmePath: "README.md",
329 }
330
331 // Test that heading IDs are preserved (generated by AutoHeadingID)
332 md := []byte(`# Installation
333
334Jump to [installation](#installation).`)
335 html, err := renderMarkdown(md, ctx)
336 is.NoErr(err)
337 htmlStr := string(html)
338 // Check that the heading has an id attribute set to "installation"
339 is.True(elementAttrEquals(htmlStr, "h1", "id", "installation"))
340 // Check that the anchor link is preserved
341 is.True(elementAttrContains(htmlStr, "a", "href", "#installation"))
342}
343
344// HTML Testing Helpers
345// These utilities help test HTML output by parsing the DOM instead of string matching.
346
347// findElement searches the HTML tree for an element matching the given tag name.
348// Returns the first matching element node, or nil if not found.
349func findElement(n *html.Node, tag string) *html.Node {
350 if n.Type == html.ElementNode && n.Data == tag {
351 return n
352 }
353 for c := n.FirstChild; c != nil; c = c.NextSibling {
354 if found := findElement(c, tag); found != nil {
355 return found
356 }
357 }
358 return nil
359}
360
361// findElementWithAttr searches for an element with a specific attribute value.
362// Returns the first matching element, or nil if not found.
363func findElementWithAttr(n *html.Node, tag, attrKey, attrValue string) *html.Node {
364 if n.Type == html.ElementNode && n.Data == tag {
365 if getAttr(n, attrKey) == attrValue {
366 return n
367 }
368 }
369 for c := n.FirstChild; c != nil; c = c.NextSibling {
370 if found := findElementWithAttr(c, tag, attrKey, attrValue); found != nil {
371 return found
372 }
373 }
374 return nil
375}
376
377// getAttr returns the value of an attribute, or empty string if not present.
378func getAttr(n *html.Node, key string) string {
379 for _, attr := range n.Attr {
380 if attr.Key == key {
381 return attr.Val
382 }
383 }
384 return ""
385}
386
387// hasAttr returns true if the element has the specified attribute (regardless of value).
388func hasAttr(n *html.Node, key string) bool {
389 for _, attr := range n.Attr {
390 if attr.Key == key {
391 return true
392 }
393 }
394 return false
395}
396
397// attrContains checks if an attribute value contains a substring.
398func attrContains(n *html.Node, key, substr string) bool {
399 val := getAttr(n, key)
400 return strings.Contains(val, substr)
401}
402
403// parseHTML parses an HTML string and returns the root node.
404func parseHTML(htmlStr string) (*html.Node, error) {
405 return html.Parse(strings.NewReader(htmlStr))
406}
407
408// hasElementWithTag returns true if the HTML contains an element with the given tag.
409func hasElementWithTag(htmlStr, tag string) bool {
410 doc, err := parseHTML(htmlStr)
411 if err != nil {
412 return false
413 }
414 return findElement(doc, tag) != nil
415}
416
417// getElementAttr finds an element by tag and returns the value of the specified attribute.
418// Returns empty string if element or attribute not found.
419func getElementAttr(htmlStr, tag, attrKey string) string {
420 doc, err := parseHTML(htmlStr)
421 if err != nil {
422 return ""
423 }
424 elem := findElement(doc, tag)
425 if elem == nil {
426 return ""
427 }
428 return getAttr(elem, attrKey)
429}
430
431// elementHasAttr checks if an element has a specific attribute key (regardless of value).
432func elementHasAttr(htmlStr, tag, attrKey string) bool {
433 doc, err := parseHTML(htmlStr)
434 if err != nil {
435 return false
436 }
437 elem := findElement(doc, tag)
438 if elem == nil {
439 return false
440 }
441 return hasAttr(elem, attrKey)
442}
443
444// elementAttrEquals checks if an element's attribute equals a specific value.
445func elementAttrEquals(htmlStr, tag, attrKey, expected string) bool {
446 return getElementAttr(htmlStr, tag, attrKey) == expected
447}
448
449// elementAttrContains checks if an element's attribute contains a substring.
450func elementAttrContains(htmlStr, tag, attrKey, substr string) bool {
451 doc, err := parseHTML(htmlStr)
452 if err != nil {
453 return false
454 }
455 elem := findElement(doc, tag)
456 if elem == nil {
457 return false
458 }
459 return attrContains(elem, attrKey, substr)
460}
461
462// elementAttrAbsent checks that an element does NOT have a specific attribute.
463func elementAttrAbsent(htmlStr, tag, attrKey string) bool {
464 return !elementHasAttr(htmlStr, tag, attrKey)
465}
466
467// elementAttrEmpty checks if an element's attribute is present but empty.
468func elementAttrEmpty(htmlStr, tag, attrKey string) bool {
469 doc, err := parseHTML(htmlStr)
470 if err != nil {
471 return false
472 }
473 elem := findElement(doc, tag)
474 if elem == nil {
475 return false
476 }
477 if !hasAttr(elem, attrKey) {
478 return false
479 }
480 return getAttr(elem, attrKey) == ""
481}