Detailed changes
@@ -0,0 +1,147 @@
+package web
+
+import (
+ "net/url"
+ "path"
+ "strings"
+
+ "github.com/yuin/goldmark/ast"
+ "github.com/yuin/goldmark/parser"
+ "github.com/yuin/goldmark/text"
+)
+
+// ReadmeContext contains information needed to rewrite URLs in README files.
+type ReadmeContext struct {
+ RepoName string // Repository name (e.g., "myrepo")
+ CommitHash string // Full commit hash to pin URLs to
+ ReadmePath string // Path to README file (e.g., "docs/README.md")
+}
+
+// urlRewriter is a Goldmark AST transformer that rewrites relative URLs in markdown.
+type urlRewriter struct {
+ ctx ReadmeContext
+}
+
+// newURLRewriter creates a new URL rewriter transformer.
+func newURLRewriter(ctx ReadmeContext) *urlRewriter {
+ return &urlRewriter{ctx: ctx}
+}
+
+// Transform implements ast.Transformer.
+func (r *urlRewriter) Transform(node *ast.Document, reader text.Reader, pc parser.Context) {
+ // Get the directory containing the README
+ readmeDir := path.Dir(r.ctx.ReadmePath)
+ if readmeDir == "." {
+ readmeDir = ""
+ }
+
+ // Walk the AST and rewrite links and images
+ ast.Walk(node, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
+ if !entering {
+ return ast.WalkContinue, nil
+ }
+
+ switch v := n.(type) {
+ case *ast.Link:
+ v.Destination = r.rewriteURL(v.Destination, readmeDir, false)
+ case *ast.Image:
+ v.Destination = r.rewriteURL(v.Destination, readmeDir, true)
+ }
+
+ return ast.WalkContinue, nil
+ })
+}
+
+// rewriteURL rewrites a single URL destination.
+func (r *urlRewriter) rewriteURL(dest []byte, readmeDir string, isImage bool) []byte {
+ destStr := string(dest)
+
+ // Skip empty URLs
+ if destStr == "" {
+ return dest
+ }
+
+ // Parse the URL
+ u, err := url.Parse(destStr)
+ if err != nil {
+ // Invalid URL, fail closed by returning empty
+ return []byte("")
+ }
+
+ // Skip absolute URLs (http://, https://, mailto:, etc.)
+ if u.Scheme != "" {
+ return dest
+ }
+
+ // Skip protocol-relative URLs (//example.com/path)
+ // These have no scheme but do have a host
+ if u.Host != "" {
+ return []byte("")
+ }
+
+ // Skip anchor-only links (#section)
+ if strings.HasPrefix(destStr, "#") {
+ return dest
+ }
+
+ // Skip absolute paths (starting with /)
+ if strings.HasPrefix(destStr, "/") {
+ return dest
+ }
+
+ // Now we have a relative URL - resolve it against the README directory
+ // Join README dir with the relative path
+ var resolvedPath string
+ if readmeDir != "" {
+ resolvedPath = path.Join(readmeDir, u.Path)
+ } else {
+ resolvedPath = u.Path
+ }
+
+ // Clean the path to resolve .. and .
+ resolvedPath = path.Clean(resolvedPath)
+
+ // Security check: reject any path that escapes the repository root
+ // After path.Clean, a path starting with ../ indicates traversal outside the repo
+ if strings.HasPrefix(resolvedPath, "../") || resolvedPath == ".." {
+ // Path tries to escape repo, return empty/invalid
+ return []byte("")
+ }
+
+ // For images, always rewrite to blob endpoint with ?raw=1
+ // For links, check if it's a file (has extension) or could be a directory/markdown file
+ if isImage {
+ // Build URL: /{repo}/blob/{commit}/{path}?raw=1
+ rewritten := "/" + r.ctx.RepoName + "/blob/" + r.ctx.CommitHash + "/" + resolvedPath + "?raw=1"
+ // Preserve fragment if present
+ if u.Fragment != "" {
+ rewritten += "#" + u.Fragment
+ }
+ return []byte(rewritten)
+ }
+
+ // For links, determine if it's a file or navigation
+ ext := path.Ext(resolvedPath)
+ if ext != "" && ext != ".md" && ext != ".markdown" {
+ // It's a file (not markdown), serve raw
+ rewritten := "/" + r.ctx.RepoName + "/blob/" + r.ctx.CommitHash + "/" + resolvedPath + "?raw=1"
+ if u.Fragment != "" {
+ rewritten += "#" + u.Fragment
+ }
+ return []byte(rewritten)
+ } else if ext == ".md" || ext == ".markdown" {
+ // It's a markdown file, link to blob view (rendered)
+ rewritten := "/" + r.ctx.RepoName + "/blob/" + r.ctx.CommitHash + "/" + resolvedPath
+ if u.Fragment != "" {
+ rewritten += "#" + u.Fragment
+ }
+ return []byte(rewritten)
+ } else {
+ // No extension, could be a directory - link to tree view
+ rewritten := "/" + r.ctx.RepoName + "/tree/" + r.ctx.CommitHash + "/" + resolvedPath
+ if u.Fragment != "" {
+ rewritten += "#" + u.Fragment
+ }
+ return []byte(rewritten)
+ }
+}
@@ -0,0 +1,241 @@
+package web
+
+import (
+ "strings"
+ "testing"
+
+ "github.com/matryer/is"
+)
+
+func TestRenderMarkdownWithURLRewriting(t *testing.T) {
+ is := is.New(t)
+
+ ctx := &ReadmeContext{
+ RepoName: "test-repo",
+ CommitHash: "abc123def456",
+ ReadmePath: "README.md",
+ }
+
+ // Test relative image in root README
+ md := []byte(``)
+ html, err := renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(string(html) != "")
+ is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/image.png?raw=1"))
+
+ // Test relative link to markdown file
+ md = []byte(`[docs](docs/README.md)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/docs/README.md"))
+
+ // Test relative link to non-markdown file
+ md = []byte(`[download](file.tar.gz)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/file.tar.gz?raw=1"))
+
+ // Test relative link to directory
+ md = []byte(`[folder](docs)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "/test-repo/tree/abc123def456/docs"))
+
+ // Test absolute https URL unchanged
+ md = []byte(``)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "https://example.com/image.png"))
+
+ // Test anchor-only link unchanged
+ md = []byte(`[section](#heading)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "#heading"))
+
+ // Test absolute path unchanged (starts with /)
+ md = []byte(`[root](/other-repo/file)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "/other-repo/file"))
+}
+
+func TestRenderMarkdownNestedReadme(t *testing.T) {
+ is := is.New(t)
+
+ ctx := &ReadmeContext{
+ RepoName: "test-repo",
+ CommitHash: "abc123def456",
+ ReadmePath: "docs/README.md",
+ }
+
+ // Test relative image in nested README
+ md := []byte(``)
+ html, err := renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/docs/image.png?raw=1"))
+
+ // Test going up with ../
+ md = []byte(``)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/root.png?raw=1"))
+
+ // Test deep nesting
+ md = []byte(`[deep](subdir/file.md)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/docs/subdir/file.md"))
+}
+
+func TestRenderMarkdownTraversalAttempts(t *testing.T) {
+ is := is.New(t)
+
+ ctx := &ReadmeContext{
+ RepoName: "test-repo",
+ CommitHash: "abc123def456",
+ ReadmePath: "README.md",
+ }
+
+ // Test ../ traversal outside repo - should be blocked (empty src/href)
+ md := []byte(``)
+ html, err := renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr := string(html)
+ // Should not contain the path in src attribute
+ is.True(!elementAttrContains(htmlStr, "img", "src", "etc/passwd"))
+
+ // Test absolute path traversal
+ md = []byte(``)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr = string(html)
+ // Absolute paths starting with / are left unchanged by rewriter
+ is.True(elementAttrContains(htmlStr, "img", "src", "/../../../etc/passwd"))
+
+ // Test that fragments don't leak when path traversal is blocked
+ md = []byte(``)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr = string(html)
+ // Should not contain the path or fragment in src attribute
+ is.True(!elementAttrContains(htmlStr, "img", "src", "secret.png") && !elementAttrContains(htmlStr, "img", "src", "#anchor"))
+}
+
+func TestRenderMarkdownURLDetails(t *testing.T) {
+ is := is.New(t)
+
+ ctx := &ReadmeContext{
+ RepoName: "test-repo",
+ CommitHash: "abc123def456",
+ ReadmePath: "README.md",
+ }
+
+ // Test fragments preserved for links
+ md := []byte(`[section](docs/guide.md#installation)`)
+ html, err := renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/docs/guide.md#installation"))
+
+ // Test fragments preserved for images
+ md = []byte(``)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/docs/arch.png?raw=1#diagram"))
+
+ // Test directories link to /tree without trailing slash
+ md = []byte(`[docs](docs/)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "/test-repo/tree/abc123def456/docs"))
+ is.True(!strings.Contains(string(html), "docs/\"")) // no trailing slash in href
+
+ // Test non-md files get ?raw=1
+ md = []byte(`[archive](file.zip)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/file.zip?raw=1"))
+
+ md = []byte(`[pdf](docs/manual.pdf)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/docs/manual.pdf?raw=1"))
+
+ // Test uppercase .MD/.MARKDOWN files are treated as non-markdown
+ // (document current behavior - filepath.Ext is case-sensitive)
+ md = []byte(`[upper](README.MD)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ // .MD is not .md, so treated as regular file requiring ?raw=1
+ is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/README.MD?raw=1"))
+
+ md = []byte(`[markdown](README.MARKDOWN)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ // .MARKDOWN is not .markdown, so treated as regular file requiring ?raw=1
+ is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/README.MARKDOWN?raw=1"))
+}
+
+func TestRenderMarkdownSpecialCharacters(t *testing.T) {
+ is := is.New(t)
+
+ ctx := &ReadmeContext{
+ RepoName: "test-repo",
+ CommitHash: "abc123def456",
+ ReadmePath: "README.md",
+ }
+
+ // Test spaces in filename - bluemonday may strip invalid URLs
+ md := []byte(``)
+ html, err := renderMarkdown(md, ctx)
+ is.NoErr(err)
+ // Should contain the repo path
+ is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/"))
+
+ // Test unicode
+ md = []byte(``)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(string(html) != "")
+}
+
+func TestRenderMarkdownDangerousSchemes(t *testing.T) {
+ is := is.New(t)
+
+ ctx := &ReadmeContext{
+ RepoName: "test-repo",
+ CommitHash: "abc123def456",
+ ReadmePath: "README.md",
+ }
+
+ // Test javascript: scheme - should be stripped by sanitizer
+ md := []byte(`[xss](javascript:alert('xss'))`)
+ html, err := renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(!strings.Contains(string(html), "javascript:"))
+
+ // Test data: scheme - should be stripped
+ md = []byte(``)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(!strings.Contains(string(html), "data:image"))
+
+ // Test http: scheme - should be stripped (only https allowed)
+ md = []byte(`[http](http://example.com)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ // Sanitizer should strip non-https schemes
+ is.True(!strings.Contains(string(html), "http://example.com") || !strings.Contains(string(html), "href="))
+}
+
+func TestRenderMarkdownWithoutContext(t *testing.T) {
+ is := is.New(t)
+
+ // Test that rendering without context works (no rewriting)
+ md := []byte(``)
+ html, err := renderMarkdown(md, nil)
+ is.NoErr(err)
+ is.True(string(html) != "")
+ // Without context, relative URLs stay relative
+ is.True(strings.Contains(string(html), "image.png"))
+}
@@ -0,0 +1,481 @@
+package web
+
+import (
+ "strings"
+ "testing"
+
+ "github.com/matryer/is"
+ "golang.org/x/net/html"
+)
+
+func TestSanitizerXSSProtection(t *testing.T) {
+ is := is.New(t)
+
+ ctx := &ReadmeContext{
+ RepoName: "test-repo",
+ CommitHash: "abc123",
+ ReadmePath: "README.md",
+ }
+
+ // Test javascript: URL in link
+ md := []byte(`[click me](javascript:alert('xss'))`)
+ html, err := renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(!strings.Contains(string(html), "javascript:"))
+
+ // Test javascript: URL in image
+ md = []byte(`)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(!strings.Contains(string(html), "javascript:"))
+
+ // Test data: URI in image
+ md = []byte(``)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(!strings.Contains(string(html), "data:image"))
+
+ // Test data: URI in link
+ md = []byte(`[link](data:text/html,<script>alert('xss')</script>)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(!strings.Contains(string(html), "data:text"))
+
+ // Test onerror handler
+ md = []byte(`<img src="x" onerror="alert('xss')">`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(!strings.Contains(string(html), "onerror"))
+
+ // Test onclick handler
+ md = []byte(`<a href="#" onclick="alert('xss')">click</a>`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(!strings.Contains(string(html), "onclick"))
+
+ // Test style attribute
+ md = []byte(`<p style="background:url(javascript:alert('xss'))">test</p>`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(!strings.Contains(string(html), "style="))
+ is.True(!strings.Contains(string(html), "javascript"))
+
+ // Test iframe injection
+ md = []byte(`<iframe src="https://evil.com"></iframe>`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(!strings.Contains(string(html), "<iframe"))
+
+ // Test script tag
+ md = []byte(`<script>alert('xss')</script>`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(!strings.Contains(string(html), "<script"))
+
+ // Test object/embed tags
+ md = []byte(`<object data="https://evil.com"></object>`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(!strings.Contains(string(html), "<object"))
+
+ md = []byte(`<embed src="https://evil.com">`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(!strings.Contains(string(html), "<embed"))
+}
+
+func TestSanitizerAllowedTags(t *testing.T) {
+ is := is.New(t)
+
+ ctx := &ReadmeContext{
+ RepoName: "test-repo",
+ CommitHash: "abc123",
+ ReadmePath: "README.md",
+ }
+
+ // Test allowed basic formatting
+ md := []byte(`**bold** *italic* ~~strikethrough~~`)
+ html, err := renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "<strong>"))
+ is.True(strings.Contains(string(html), "<em>"))
+ is.True(strings.Contains(string(html), "<del>"))
+
+ // Test allowed headings
+ md = []byte(`# H1
+## H2
+### H3`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "<h1"))
+ is.True(strings.Contains(string(html), "<h2"))
+ is.True(strings.Contains(string(html), "<h3"))
+
+ // Test allowed lists
+ md = []byte(`- item 1
+- item 2
+
+1. numbered
+2. list`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "<ul"))
+ is.True(strings.Contains(string(html), "<ol"))
+ is.True(strings.Contains(string(html), "<li"))
+
+ // Test allowed code blocks
+ md = []byte("```go\nfunc main() {}\n```")
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "<pre"))
+ is.True(strings.Contains(string(html), "<code"))
+
+ // Test allowed tables
+ md = []byte(`| Col1 | Col2 |
+|------|------|
+| A | B |`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "<table"))
+ is.True(strings.Contains(string(html), "<thead"))
+ is.True(strings.Contains(string(html), "<tbody"))
+ is.True(strings.Contains(string(html), "<tr"))
+ is.True(strings.Contains(string(html), "<th"))
+ is.True(strings.Contains(string(html), "<td"))
+
+ // Test allowed blockquote
+ md = []byte(`> quote`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ is.True(strings.Contains(string(html), "<blockquote"))
+}
+
+func TestSanitizerHTTPSOnly(t *testing.T) {
+ is := is.New(t)
+
+ ctx := &ReadmeContext{
+ RepoName: "test-repo",
+ CommitHash: "abc123",
+ ReadmePath: "README.md",
+ }
+
+ // Test http:// link is stripped
+ md := []byte(`[insecure](http://example.com)`)
+ html, err := renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr := string(html)
+ // Either no anchor tag or the anchor has no href attribute
+ is.True(!hasElementWithTag(htmlStr, "a") || elementAttrAbsent(htmlStr, "a", "href"))
+
+ // Test https:// link is allowed
+ md = []byte(`[secure](https://example.com)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr = string(html)
+ is.True(elementHasAttr(htmlStr, "a", "href"))
+ is.True(elementAttrContains(htmlStr, "a", "href", "https://example.com"))
+
+ // Test http:// image is stripped
+ md = []byte(``)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr = string(html)
+ is.True(!hasElementWithTag(htmlStr, "img") || elementAttrAbsent(htmlStr, "img", "src"))
+
+ // Test https:// image is allowed
+ md = []byte(``)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr = string(html)
+ is.True(elementHasAttr(htmlStr, "img", "src"))
+ is.True(elementAttrContains(htmlStr, "img", "src", "https://example.com/image.png"))
+}
+
+func TestSanitizerNoFollowNoReferrer(t *testing.T) {
+ is := is.New(t)
+
+ ctx := &ReadmeContext{
+ RepoName: "test-repo",
+ CommitHash: "abc123",
+ ReadmePath: "README.md",
+ }
+
+ // Test that external links get rel="nofollow noreferrer"
+ md := []byte(`[external](https://example.com)`)
+ html, err := renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr := string(html)
+ is.True(elementHasAttr(htmlStr, "a", "rel"))
+ relAttr := getElementAttr(htmlStr, "a", "rel")
+ is.True(strings.Contains(relAttr, "nofollow"))
+ is.True(strings.Contains(relAttr, "noreferrer"))
+
+ // Test that relative/internal links also get rel="nofollow noreferrer"
+ md = []byte(`[internal](docs/README.md)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr = string(html)
+ is.True(elementHasAttr(htmlStr, "a", "rel"))
+ relAttr = getElementAttr(htmlStr, "a", "rel")
+ is.True(strings.Contains(relAttr, "nofollow"))
+ is.True(strings.Contains(relAttr, "noreferrer"))
+}
+
+func TestSanitizerAdditionalSchemes(t *testing.T) {
+ is := is.New(t)
+
+ ctx := &ReadmeContext{
+ RepoName: "test-repo",
+ CommitHash: "abc123",
+ ReadmePath: "README.md",
+ }
+
+ // Test protocol-relative URLs are stripped
+ md := []byte(`[protocol-relative](//evil.com/script.js)`)
+ html, err := renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr := string(html)
+ // Rewriter blocks protocol-relative URLs with empty href, then sanitizer strips empty href links
+ is.True(!hasElementWithTag(htmlStr, "a"))
+
+ // Test mailto: scheme is stripped
+ md = []byte(`[email](mailto:user@example.com)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr = string(html)
+ // mailto: is not in allowed schemes, so sanitizer strips the entire link
+ is.True(!hasElementWithTag(htmlStr, "a"))
+
+ // Test ftp: scheme is stripped
+ md = []byte(`[ftp](ftp://ftp.example.com/file.txt)`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr = string(html)
+ // ftp: is not in allowed schemes, so sanitizer strips the entire link
+ is.True(!hasElementWithTag(htmlStr, "a"))
+}
+
+func TestSanitizerDisallowedAttributes(t *testing.T) {
+ is := is.New(t)
+
+ ctx := &ReadmeContext{
+ RepoName: "test-repo",
+ CommitHash: "abc123",
+ ReadmePath: "README.md",
+ }
+
+ // Test target attribute is stripped from links
+ md := []byte(`<a href="https://example.com" target="_blank">link</a>`)
+ html, err := renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr := string(html)
+ is.True(elementAttrAbsent(htmlStr, "a", "target"))
+
+ // Test class attribute is stripped
+ md = []byte(`<a href="https://example.com" class="dangerous">link</a>`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr = string(html)
+ is.True(elementAttrAbsent(htmlStr, "a", "class"))
+
+ // Test style attribute is stripped (already tested in XSS, but explicit here)
+ md = []byte(`<a href="https://example.com" style="color: red;">link</a>`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr = string(html)
+ is.True(elementAttrAbsent(htmlStr, "a", "style"))
+
+ // Test onclick is stripped (already tested in XSS, but explicit here)
+ md = []byte(`<a href="#" onclick="alert('xss')">click</a>`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr = string(html)
+ is.True(elementAttrAbsent(htmlStr, "a", "onclick"))
+
+ // Test onerror is stripped (already tested in XSS, but explicit here)
+ md = []byte(`<img src="x" onerror="alert('xss')">`)
+ html, err = renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr = string(html)
+ is.True(elementAttrAbsent(htmlStr, "img", "onerror"))
+}
+
+func TestSanitizerImageAttributes(t *testing.T) {
+ is := is.New(t)
+
+ ctx := &ReadmeContext{
+ RepoName: "test-repo",
+ CommitHash: "abc123",
+ ReadmePath: "README.md",
+ }
+
+ // Test that width and height attributes are preserved on images
+ md := []byte(`<img src="https://example.com/image.png" width="100" height="50" alt="test">`)
+ html, err := renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr := string(html)
+ is.True(elementAttrEquals(htmlStr, "img", "width", "100"))
+ is.True(elementAttrEquals(htmlStr, "img", "height", "50"))
+ is.True(elementAttrEquals(htmlStr, "img", "alt", "test"))
+}
+
+func TestSanitizerHeadingIDs(t *testing.T) {
+ is := is.New(t)
+
+ ctx := &ReadmeContext{
+ RepoName: "test-repo",
+ CommitHash: "abc123",
+ ReadmePath: "README.md",
+ }
+
+ // Test that heading IDs are preserved (generated by AutoHeadingID)
+ md := []byte(`# Installation
+
+Jump to [installation](#installation).`)
+ html, err := renderMarkdown(md, ctx)
+ is.NoErr(err)
+ htmlStr := string(html)
+ // Check that the heading has an id attribute set to "installation"
+ is.True(elementAttrEquals(htmlStr, "h1", "id", "installation"))
+ // Check that the anchor link is preserved
+ is.True(elementAttrContains(htmlStr, "a", "href", "#installation"))
+}
+
+// HTML Testing Helpers
+// These utilities help test HTML output by parsing the DOM instead of string matching.
+
+// findElement searches the HTML tree for an element matching the given tag name.
+// Returns the first matching element node, or nil if not found.
+func findElement(n *html.Node, tag string) *html.Node {
+ if n.Type == html.ElementNode && n.Data == tag {
+ return n
+ }
+ for c := n.FirstChild; c != nil; c = c.NextSibling {
+ if found := findElement(c, tag); found != nil {
+ return found
+ }
+ }
+ return nil
+}
+
+// findElementWithAttr searches for an element with a specific attribute value.
+// Returns the first matching element, or nil if not found.
+func findElementWithAttr(n *html.Node, tag, attrKey, attrValue string) *html.Node {
+ if n.Type == html.ElementNode && n.Data == tag {
+ if getAttr(n, attrKey) == attrValue {
+ return n
+ }
+ }
+ for c := n.FirstChild; c != nil; c = c.NextSibling {
+ if found := findElementWithAttr(c, tag, attrKey, attrValue); found != nil {
+ return found
+ }
+ }
+ return nil
+}
+
+// getAttr returns the value of an attribute, or empty string if not present.
+func getAttr(n *html.Node, key string) string {
+ for _, attr := range n.Attr {
+ if attr.Key == key {
+ return attr.Val
+ }
+ }
+ return ""
+}
+
+// hasAttr returns true if the element has the specified attribute (regardless of value).
+func hasAttr(n *html.Node, key string) bool {
+ for _, attr := range n.Attr {
+ if attr.Key == key {
+ return true
+ }
+ }
+ return false
+}
+
+// attrContains checks if an attribute value contains a substring.
+func attrContains(n *html.Node, key, substr string) bool {
+ val := getAttr(n, key)
+ return strings.Contains(val, substr)
+}
+
+// parseHTML parses an HTML string and returns the root node.
+func parseHTML(htmlStr string) (*html.Node, error) {
+ return html.Parse(strings.NewReader(htmlStr))
+}
+
+// hasElementWithTag returns true if the HTML contains an element with the given tag.
+func hasElementWithTag(htmlStr, tag string) bool {
+ doc, err := parseHTML(htmlStr)
+ if err != nil {
+ return false
+ }
+ return findElement(doc, tag) != nil
+}
+
+// getElementAttr finds an element by tag and returns the value of the specified attribute.
+// Returns empty string if element or attribute not found.
+func getElementAttr(htmlStr, tag, attrKey string) string {
+ doc, err := parseHTML(htmlStr)
+ if err != nil {
+ return ""
+ }
+ elem := findElement(doc, tag)
+ if elem == nil {
+ return ""
+ }
+ return getAttr(elem, attrKey)
+}
+
+// elementHasAttr checks if an element has a specific attribute key (regardless of value).
+func elementHasAttr(htmlStr, tag, attrKey string) bool {
+ doc, err := parseHTML(htmlStr)
+ if err != nil {
+ return false
+ }
+ elem := findElement(doc, tag)
+ if elem == nil {
+ return false
+ }
+ return hasAttr(elem, attrKey)
+}
+
+// elementAttrEquals checks if an element's attribute equals a specific value.
+func elementAttrEquals(htmlStr, tag, attrKey, expected string) bool {
+ return getElementAttr(htmlStr, tag, attrKey) == expected
+}
+
+// elementAttrContains checks if an element's attribute contains a substring.
+func elementAttrContains(htmlStr, tag, attrKey, substr string) bool {
+ doc, err := parseHTML(htmlStr)
+ if err != nil {
+ return false
+ }
+ elem := findElement(doc, tag)
+ if elem == nil {
+ return false
+ }
+ return attrContains(elem, attrKey, substr)
+}
+
+// elementAttrAbsent checks that an element does NOT have a specific attribute.
+func elementAttrAbsent(htmlStr, tag, attrKey string) bool {
+ return !elementHasAttr(htmlStr, tag, attrKey)
+}
+
+// elementAttrEmpty checks if an element's attribute is present but empty.
+func elementAttrEmpty(htmlStr, tag, attrKey string) bool {
+ doc, err := parseHTML(htmlStr)
+ if err != nil {
+ return false
+ }
+ elem := findElement(doc, tag)
+ if elem == nil {
+ return false
+ }
+ if !hasAttr(elem, attrKey) {
+ return false
+ }
+ return getAttr(elem, attrKey) == ""
+}
@@ -240,6 +240,13 @@ func renderHTML(w http.ResponseWriter, templateName string, data interface{}) {
}
w.Header().Set("Content-Type", "text/html; charset=utf-8")
+
+ // Security headers
+ // Note: style-src 'unsafe-inline' is required for inline styles in templates (tree.html, overview.html)
+ w.Header().Set("Content-Security-Policy", "default-src 'self'; img-src 'self' https:; style-src 'self' 'unsafe-inline'; script-src 'self'; object-src 'none'; frame-ancestors 'self'; base-uri 'none'")
+ w.Header().Set("Referrer-Policy", "no-referrer")
+ w.Header().Set("X-Content-Type-Options", "nosniff")
+
if err := tmpl.ExecuteTemplate(w, "layout", data); err != nil {
log.Debug("template execution failed", "template", templateName, "err", err)
// Already started writing response, so we can't render an error page
@@ -94,7 +94,16 @@ func repoBlob(w http.ResponseWriter, r *http.Request) {
var renderedHTML template.HTML
if isMarkdown && !isBinaryContent(content) && !showSource {
- renderedHTML, _ = renderMarkdown(content)
+ ctx := &ReadmeContext{
+ RepoName: repo.Name(),
+ CommitHash: refObj.Reference.ID,
+ ReadmePath: path,
+ }
+ var err error
+ renderedHTML, err = renderMarkdown(content, ctx)
+ if err != nil {
+ logger.Debug("failed to render markdown", "repo", repo.Name(), "path", path, "err", err)
+ }
} else if !isBinaryContent(content) {
renderedHTML = highlightCode(path, content)
}
@@ -5,6 +5,7 @@ import (
"context"
"html/template"
"strings"
+ "sync"
"github.com/charmbracelet/soft-serve/git"
"github.com/charmbracelet/soft-serve/pkg/backend"
@@ -14,12 +15,58 @@ import (
extension "github.com/yuin/goldmark/extension"
"github.com/yuin/goldmark/parser"
goldmarkhtml "github.com/yuin/goldmark/renderer/html"
+ "github.com/yuin/goldmark/util"
)
+var (
+ sanitizerPolicy *bluemonday.Policy
+ sanitizerPolicyOnce sync.Once
+)
+
+// getSanitizerPolicy returns a cached sanitizer policy.
+// The policy is safe for concurrent use after initialization.
+// Note: Tests should not use t.Parallel() with functions that call renderMarkdown
+// until we verify bluemonday.Policy initialization is thread-safe during sync.Once.
+func getSanitizerPolicy() *bluemonday.Policy {
+ sanitizerPolicyOnce.Do(func() {
+ policy := bluemonday.NewPolicy()
+
+ // Allowed tags
+ policy.AllowElements("p", "h1", "h2", "h3", "h4", "h5", "h6",
+ "ul", "ol", "li", "pre", "code", "blockquote",
+ "strong", "em", "del", "br", "hr",
+ "table", "thead", "tbody", "tr", "th", "td",
+ "center", "a", "img")
+
+ // Allow id attributes on headings (generated by AutoHeadingID for anchor links)
+ policy.AllowAttrs("id").OnElements("h1", "h2", "h3", "h4", "h5", "h6")
+
+ // Links: only https scheme and relative URLs, add nofollow/noreferrer
+ policy.AllowAttrs("href").OnElements("a")
+ policy.RequireNoFollowOnLinks(true)
+ policy.RequireNoReferrerOnLinks(true)
+
+ // Images: only https scheme or repo-relative paths, allow descriptive attrs
+ policy.AllowAttrs("src", "alt", "title", "width", "height").OnElements("img")
+
+ // URL schemes: only HTTPS for external URLs, relative URLs for repo paths
+ policy.AllowURLSchemes("https")
+ policy.AllowRelativeURLs(true)
+
+ // Table alignment attributes
+ policy.AllowAttrs("align").Matching(bluemonday.SpaceSeparatedTokens).OnElements("th", "td")
+
+ sanitizerPolicy = policy
+ })
+ return sanitizerPolicy
+}
+
// renderMarkdown converts markdown content to sanitized HTML.
-func renderMarkdown(content []byte) (template.HTML, error) {
+// If ctx is provided, relative URLs will be rewritten to point to repository files.
+func renderMarkdown(content []byte, ctx *ReadmeContext) (template.HTML, error) {
var buf bytes.Buffer
- md := goldmark.New(
+
+ mdOpts := []goldmark.Option{
goldmark.WithExtensions(extension.GFM),
goldmark.WithParserOptions(
parser.WithAutoHeadingID(),
@@ -27,17 +74,24 @@ func renderMarkdown(content []byte) (template.HTML, error) {
goldmark.WithRendererOptions(
goldmarkhtml.WithUnsafe(),
),
- )
+ }
+
+ // Add URL rewriter if context is provided
+ if ctx != nil {
+ rewriter := newURLRewriter(*ctx)
+ mdOpts = append(mdOpts, goldmark.WithParserOptions(
+ parser.WithASTTransformers(util.Prioritized(rewriter, 500)),
+ ))
+ }
+
+ md := goldmark.New(mdOpts...)
if err := md.Convert(content, &buf); err != nil {
return "", err
}
- policy := bluemonday.UGCPolicy()
- policy.AllowStyling()
- policy.RequireNoFollowOnLinks(false)
- policy.AllowElements("center")
- sanitized := policy.SanitizeBytes(buf.Bytes())
+ // Use cached sanitizer policy
+ sanitized := getSanitizerPolicy().SanitizeBytes(buf.Bytes())
return template.HTML(sanitized), nil
}
@@ -57,7 +111,7 @@ func getServerReadme(ctx context.Context, be *backend.Backend) (raw string, html
return "", "", err
}
if readme != "" {
- html, err := renderMarkdown([]byte(readme))
+ html, err := renderMarkdown([]byte(readme), nil)
if err != nil {
return "", "", err
}
@@ -44,9 +44,18 @@ func repoOverview(w http.ResponseWriter, r *http.Request) {
var readmeHTML template.HTML
if !isEmpty {
head, _ := gr.HEAD()
- readmeContent, _, err := backend.Readme(repo, head)
+ readmeContent, readmePath, err := backend.Readme(repo, head)
if err == nil && readmeContent != "" {
- readmeHTML, _ = renderMarkdown([]byte(readmeContent))
+ ctx := &ReadmeContext{
+ RepoName: repo.Name(),
+ CommitHash: head.Reference.ID,
+ ReadmePath: readmePath,
+ }
+ var mdErr error
+ readmeHTML, mdErr = renderMarkdown([]byte(readmeContent), ctx)
+ if mdErr != nil {
+ logger.Debug("failed to render readme markdown", "repo", repo.Name(), "path", readmePath, "err", mdErr)
+ }
}
}