diff --git a/pkg/web/markdown_rewriter.go b/pkg/web/markdown_rewriter.go new file mode 100644 index 0000000000000000000000000000000000000000..2ee5a9bb7353be89163b97b9c71290799c722efd --- /dev/null +++ b/pkg/web/markdown_rewriter.go @@ -0,0 +1,147 @@ +package web + +import ( + "net/url" + "path" + "strings" + + "github.com/yuin/goldmark/ast" + "github.com/yuin/goldmark/parser" + "github.com/yuin/goldmark/text" +) + +// ReadmeContext contains information needed to rewrite URLs in README files. +type ReadmeContext struct { + RepoName string // Repository name (e.g., "myrepo") + CommitHash string // Full commit hash to pin URLs to + ReadmePath string // Path to README file (e.g., "docs/README.md") +} + +// urlRewriter is a Goldmark AST transformer that rewrites relative URLs in markdown. +type urlRewriter struct { + ctx ReadmeContext +} + +// newURLRewriter creates a new URL rewriter transformer. +func newURLRewriter(ctx ReadmeContext) *urlRewriter { + return &urlRewriter{ctx: ctx} +} + +// Transform implements ast.Transformer. +func (r *urlRewriter) Transform(node *ast.Document, reader text.Reader, pc parser.Context) { + // Get the directory containing the README + readmeDir := path.Dir(r.ctx.ReadmePath) + if readmeDir == "." { + readmeDir = "" + } + + // Walk the AST and rewrite links and images + ast.Walk(node, func(n ast.Node, entering bool) (ast.WalkStatus, error) { + if !entering { + return ast.WalkContinue, nil + } + + switch v := n.(type) { + case *ast.Link: + v.Destination = r.rewriteURL(v.Destination, readmeDir, false) + case *ast.Image: + v.Destination = r.rewriteURL(v.Destination, readmeDir, true) + } + + return ast.WalkContinue, nil + }) +} + +// rewriteURL rewrites a single URL destination. +func (r *urlRewriter) rewriteURL(dest []byte, readmeDir string, isImage bool) []byte { + destStr := string(dest) + + // Skip empty URLs + if destStr == "" { + return dest + } + + // Parse the URL + u, err := url.Parse(destStr) + if err != nil { + // Invalid URL, fail closed by returning empty + return []byte("") + } + + // Skip absolute URLs (http://, https://, mailto:, etc.) + if u.Scheme != "" { + return dest + } + + // Skip protocol-relative URLs (//example.com/path) + // These have no scheme but do have a host + if u.Host != "" { + return []byte("") + } + + // Skip anchor-only links (#section) + if strings.HasPrefix(destStr, "#") { + return dest + } + + // Skip absolute paths (starting with /) + if strings.HasPrefix(destStr, "/") { + return dest + } + + // Now we have a relative URL - resolve it against the README directory + // Join README dir with the relative path + var resolvedPath string + if readmeDir != "" { + resolvedPath = path.Join(readmeDir, u.Path) + } else { + resolvedPath = u.Path + } + + // Clean the path to resolve .. and . + resolvedPath = path.Clean(resolvedPath) + + // Security check: reject any path that escapes the repository root + // After path.Clean, a path starting with ../ indicates traversal outside the repo + if strings.HasPrefix(resolvedPath, "../") || resolvedPath == ".." { + // Path tries to escape repo, return empty/invalid + return []byte("") + } + + // For images, always rewrite to blob endpoint with ?raw=1 + // For links, check if it's a file (has extension) or could be a directory/markdown file + if isImage { + // Build URL: /{repo}/blob/{commit}/{path}?raw=1 + rewritten := "/" + r.ctx.RepoName + "/blob/" + r.ctx.CommitHash + "/" + resolvedPath + "?raw=1" + // Preserve fragment if present + if u.Fragment != "" { + rewritten += "#" + u.Fragment + } + return []byte(rewritten) + } + + // For links, determine if it's a file or navigation + ext := path.Ext(resolvedPath) + if ext != "" && ext != ".md" && ext != ".markdown" { + // It's a file (not markdown), serve raw + rewritten := "/" + r.ctx.RepoName + "/blob/" + r.ctx.CommitHash + "/" + resolvedPath + "?raw=1" + if u.Fragment != "" { + rewritten += "#" + u.Fragment + } + return []byte(rewritten) + } else if ext == ".md" || ext == ".markdown" { + // It's a markdown file, link to blob view (rendered) + rewritten := "/" + r.ctx.RepoName + "/blob/" + r.ctx.CommitHash + "/" + resolvedPath + if u.Fragment != "" { + rewritten += "#" + u.Fragment + } + return []byte(rewritten) + } else { + // No extension, could be a directory - link to tree view + rewritten := "/" + r.ctx.RepoName + "/tree/" + r.ctx.CommitHash + "/" + resolvedPath + if u.Fragment != "" { + rewritten += "#" + u.Fragment + } + return []byte(rewritten) + } +} diff --git a/pkg/web/markdown_rewriter_test.go b/pkg/web/markdown_rewriter_test.go new file mode 100644 index 0000000000000000000000000000000000000000..fbf802e7314858fbe0edc50b1c57414b1fd4aa57 --- /dev/null +++ b/pkg/web/markdown_rewriter_test.go @@ -0,0 +1,241 @@ +package web + +import ( + "strings" + "testing" + + "github.com/matryer/is" +) + +func TestRenderMarkdownWithURLRewriting(t *testing.T) { + is := is.New(t) + + ctx := &ReadmeContext{ + RepoName: "test-repo", + CommitHash: "abc123def456", + ReadmePath: "README.md", + } + + // Test relative image in root README + md := []byte(`![image](image.png)`) + html, err := renderMarkdown(md, ctx) + is.NoErr(err) + is.True(string(html) != "") + is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/image.png?raw=1")) + + // Test relative link to markdown file + md = []byte(`[docs](docs/README.md)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/docs/README.md")) + + // Test relative link to non-markdown file + md = []byte(`[download](file.tar.gz)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/file.tar.gz?raw=1")) + + // Test relative link to directory + md = []byte(`[folder](docs)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(strings.Contains(string(html), "/test-repo/tree/abc123def456/docs")) + + // Test absolute https URL unchanged + md = []byte(`![remote](https://example.com/image.png)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(strings.Contains(string(html), "https://example.com/image.png")) + + // Test anchor-only link unchanged + md = []byte(`[section](#heading)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(strings.Contains(string(html), "#heading")) + + // Test absolute path unchanged (starts with /) + md = []byte(`[root](/other-repo/file)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(strings.Contains(string(html), "/other-repo/file")) +} + +func TestRenderMarkdownNestedReadme(t *testing.T) { + is := is.New(t) + + ctx := &ReadmeContext{ + RepoName: "test-repo", + CommitHash: "abc123def456", + ReadmePath: "docs/README.md", + } + + // Test relative image in nested README + md := []byte(`![image](image.png)`) + html, err := renderMarkdown(md, ctx) + is.NoErr(err) + is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/docs/image.png?raw=1")) + + // Test going up with ../ + md = []byte(`![up](../root.png)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/root.png?raw=1")) + + // Test deep nesting + md = []byte(`[deep](subdir/file.md)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/docs/subdir/file.md")) +} + +func TestRenderMarkdownTraversalAttempts(t *testing.T) { + is := is.New(t) + + ctx := &ReadmeContext{ + RepoName: "test-repo", + CommitHash: "abc123def456", + ReadmePath: "README.md", + } + + // Test ../ traversal outside repo - should be blocked (empty src/href) + md := []byte(`![escape](../../etc/passwd)`) + html, err := renderMarkdown(md, ctx) + is.NoErr(err) + htmlStr := string(html) + // Should not contain the path in src attribute + is.True(!elementAttrContains(htmlStr, "img", "src", "etc/passwd")) + + // Test absolute path traversal + md = []byte(`![abs](/../../../etc/passwd)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + htmlStr = string(html) + // Absolute paths starting with / are left unchanged by rewriter + is.True(elementAttrContains(htmlStr, "img", "src", "/../../../etc/passwd")) + + // Test that fragments don't leak when path traversal is blocked + md = []byte(`![escape](../../secret.png#anchor)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + htmlStr = string(html) + // Should not contain the path or fragment in src attribute + is.True(!elementAttrContains(htmlStr, "img", "src", "secret.png") && !elementAttrContains(htmlStr, "img", "src", "#anchor")) +} + +func TestRenderMarkdownURLDetails(t *testing.T) { + is := is.New(t) + + ctx := &ReadmeContext{ + RepoName: "test-repo", + CommitHash: "abc123def456", + ReadmePath: "README.md", + } + + // Test fragments preserved for links + md := []byte(`[section](docs/guide.md#installation)`) + html, err := renderMarkdown(md, ctx) + is.NoErr(err) + is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/docs/guide.md#installation")) + + // Test fragments preserved for images + md = []byte(`![diagram](docs/arch.png#diagram)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/docs/arch.png?raw=1#diagram")) + + // Test directories link to /tree without trailing slash + md = []byte(`[docs](docs/)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(strings.Contains(string(html), "/test-repo/tree/abc123def456/docs")) + is.True(!strings.Contains(string(html), "docs/\"")) // no trailing slash in href + + // Test non-md files get ?raw=1 + md = []byte(`[archive](file.zip)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/file.zip?raw=1")) + + md = []byte(`[pdf](docs/manual.pdf)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/docs/manual.pdf?raw=1")) + + // Test uppercase .MD/.MARKDOWN files are treated as non-markdown + // (document current behavior - filepath.Ext is case-sensitive) + md = []byte(`[upper](README.MD)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + // .MD is not .md, so treated as regular file requiring ?raw=1 + is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/README.MD?raw=1")) + + md = []byte(`[markdown](README.MARKDOWN)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + // .MARKDOWN is not .markdown, so treated as regular file requiring ?raw=1 + is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/README.MARKDOWN?raw=1")) +} + +func TestRenderMarkdownSpecialCharacters(t *testing.T) { + is := is.New(t) + + ctx := &ReadmeContext{ + RepoName: "test-repo", + CommitHash: "abc123def456", + ReadmePath: "README.md", + } + + // Test spaces in filename - bluemonday may strip invalid URLs + md := []byte(`![spaces](my_image.png)`) + html, err := renderMarkdown(md, ctx) + is.NoErr(err) + // Should contain the repo path + is.True(strings.Contains(string(html), "/test-repo/blob/abc123def456/")) + + // Test unicode + md = []byte(`![unicode](文件.png)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(string(html) != "") +} + +func TestRenderMarkdownDangerousSchemes(t *testing.T) { + is := is.New(t) + + ctx := &ReadmeContext{ + RepoName: "test-repo", + CommitHash: "abc123def456", + ReadmePath: "README.md", + } + + // Test javascript: scheme - should be stripped by sanitizer + md := []byte(`[xss](javascript:alert('xss'))`) + html, err := renderMarkdown(md, ctx) + is.NoErr(err) + is.True(!strings.Contains(string(html), "javascript:")) + + // Test data: scheme - should be stripped + md = []byte(`![data]()`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(!strings.Contains(string(html), "data:image")) + + // Test http: scheme - should be stripped (only https allowed) + md = []byte(`[http](http://example.com)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + // Sanitizer should strip non-https schemes + is.True(!strings.Contains(string(html), "http://example.com") || !strings.Contains(string(html), "href=")) +} + +func TestRenderMarkdownWithoutContext(t *testing.T) { + is := is.New(t) + + // Test that rendering without context works (no rewriting) + md := []byte(`![image](image.png)`) + html, err := renderMarkdown(md, nil) + is.NoErr(err) + is.True(string(html) != "") + // Without context, relative URLs stay relative + is.True(strings.Contains(string(html), "image.png")) +} diff --git a/pkg/web/sanitizer_test.go b/pkg/web/sanitizer_test.go new file mode 100644 index 0000000000000000000000000000000000000000..2228522ec786c43f5bb5b530478982b4ec2d54ca --- /dev/null +++ b/pkg/web/sanitizer_test.go @@ -0,0 +1,481 @@ +package web + +import ( + "strings" + "testing" + + "github.com/matryer/is" + "golang.org/x/net/html" +) + +func TestSanitizerXSSProtection(t *testing.T) { + is := is.New(t) + + ctx := &ReadmeContext{ + RepoName: "test-repo", + CommitHash: "abc123", + ReadmePath: "README.md", + } + + // Test javascript: URL in link + md := []byte(`[click me](javascript:alert('xss'))`) + html, err := renderMarkdown(md, ctx) + is.NoErr(err) + is.True(!strings.Contains(string(html), "javascript:")) + + // Test javascript: URL in image + md = []byte(`![img](javascript:alert('xss'))`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(!strings.Contains(string(html), "javascript:")) + + // Test data: URI in image + md = []byte(`![img]()`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(!strings.Contains(string(html), "data:image")) + + // Test data: URI in link + md = []byte(`[link](data:text/html,)`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(!strings.Contains(string(html), "data:text")) + + // Test onerror handler + md = []byte(``) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(!strings.Contains(string(html), "onerror")) + + // Test onclick handler + md = []byte(`click`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(!strings.Contains(string(html), "onclick")) + + // Test style attribute + md = []byte(`

test

`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(!strings.Contains(string(html), "style=")) + is.True(!strings.Contains(string(html), "javascript")) + + // Test iframe injection + md = []byte(``) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(!strings.Contains(string(html), "alert('xss')`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(!strings.Contains(string(html), "`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(!strings.Contains(string(html), "`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(!strings.Contains(string(html), "")) + is.True(strings.Contains(string(html), "")) + is.True(strings.Contains(string(html), "")) + + // Test allowed headings + md = []byte(`# H1 +## H2 +### H3`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(strings.Contains(string(html), " quote`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + is.True(strings.Contains(string(html), "link`) + html, err := renderMarkdown(md, ctx) + is.NoErr(err) + htmlStr := string(html) + is.True(elementAttrAbsent(htmlStr, "a", "target")) + + // Test class attribute is stripped + md = []byte(`link`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + htmlStr = string(html) + is.True(elementAttrAbsent(htmlStr, "a", "class")) + + // Test style attribute is stripped (already tested in XSS, but explicit here) + md = []byte(`link`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + htmlStr = string(html) + is.True(elementAttrAbsent(htmlStr, "a", "style")) + + // Test onclick is stripped (already tested in XSS, but explicit here) + md = []byte(`click`) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + htmlStr = string(html) + is.True(elementAttrAbsent(htmlStr, "a", "onclick")) + + // Test onerror is stripped (already tested in XSS, but explicit here) + md = []byte(``) + html, err = renderMarkdown(md, ctx) + is.NoErr(err) + htmlStr = string(html) + is.True(elementAttrAbsent(htmlStr, "img", "onerror")) +} + +func TestSanitizerImageAttributes(t *testing.T) { + is := is.New(t) + + ctx := &ReadmeContext{ + RepoName: "test-repo", + CommitHash: "abc123", + ReadmePath: "README.md", + } + + // Test that width and height attributes are preserved on images + md := []byte(`test`) + html, err := renderMarkdown(md, ctx) + is.NoErr(err) + htmlStr := string(html) + is.True(elementAttrEquals(htmlStr, "img", "width", "100")) + is.True(elementAttrEquals(htmlStr, "img", "height", "50")) + is.True(elementAttrEquals(htmlStr, "img", "alt", "test")) +} + +func TestSanitizerHeadingIDs(t *testing.T) { + is := is.New(t) + + ctx := &ReadmeContext{ + RepoName: "test-repo", + CommitHash: "abc123", + ReadmePath: "README.md", + } + + // Test that heading IDs are preserved (generated by AutoHeadingID) + md := []byte(`# Installation + +Jump to [installation](#installation).`) + html, err := renderMarkdown(md, ctx) + is.NoErr(err) + htmlStr := string(html) + // Check that the heading has an id attribute set to "installation" + is.True(elementAttrEquals(htmlStr, "h1", "id", "installation")) + // Check that the anchor link is preserved + is.True(elementAttrContains(htmlStr, "a", "href", "#installation")) +} + +// HTML Testing Helpers +// These utilities help test HTML output by parsing the DOM instead of string matching. + +// findElement searches the HTML tree for an element matching the given tag name. +// Returns the first matching element node, or nil if not found. +func findElement(n *html.Node, tag string) *html.Node { + if n.Type == html.ElementNode && n.Data == tag { + return n + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + if found := findElement(c, tag); found != nil { + return found + } + } + return nil +} + +// findElementWithAttr searches for an element with a specific attribute value. +// Returns the first matching element, or nil if not found. +func findElementWithAttr(n *html.Node, tag, attrKey, attrValue string) *html.Node { + if n.Type == html.ElementNode && n.Data == tag { + if getAttr(n, attrKey) == attrValue { + return n + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + if found := findElementWithAttr(c, tag, attrKey, attrValue); found != nil { + return found + } + } + return nil +} + +// getAttr returns the value of an attribute, or empty string if not present. +func getAttr(n *html.Node, key string) string { + for _, attr := range n.Attr { + if attr.Key == key { + return attr.Val + } + } + return "" +} + +// hasAttr returns true if the element has the specified attribute (regardless of value). +func hasAttr(n *html.Node, key string) bool { + for _, attr := range n.Attr { + if attr.Key == key { + return true + } + } + return false +} + +// attrContains checks if an attribute value contains a substring. +func attrContains(n *html.Node, key, substr string) bool { + val := getAttr(n, key) + return strings.Contains(val, substr) +} + +// parseHTML parses an HTML string and returns the root node. +func parseHTML(htmlStr string) (*html.Node, error) { + return html.Parse(strings.NewReader(htmlStr)) +} + +// hasElementWithTag returns true if the HTML contains an element with the given tag. +func hasElementWithTag(htmlStr, tag string) bool { + doc, err := parseHTML(htmlStr) + if err != nil { + return false + } + return findElement(doc, tag) != nil +} + +// getElementAttr finds an element by tag and returns the value of the specified attribute. +// Returns empty string if element or attribute not found. +func getElementAttr(htmlStr, tag, attrKey string) string { + doc, err := parseHTML(htmlStr) + if err != nil { + return "" + } + elem := findElement(doc, tag) + if elem == nil { + return "" + } + return getAttr(elem, attrKey) +} + +// elementHasAttr checks if an element has a specific attribute key (regardless of value). +func elementHasAttr(htmlStr, tag, attrKey string) bool { + doc, err := parseHTML(htmlStr) + if err != nil { + return false + } + elem := findElement(doc, tag) + if elem == nil { + return false + } + return hasAttr(elem, attrKey) +} + +// elementAttrEquals checks if an element's attribute equals a specific value. +func elementAttrEquals(htmlStr, tag, attrKey, expected string) bool { + return getElementAttr(htmlStr, tag, attrKey) == expected +} + +// elementAttrContains checks if an element's attribute contains a substring. +func elementAttrContains(htmlStr, tag, attrKey, substr string) bool { + doc, err := parseHTML(htmlStr) + if err != nil { + return false + } + elem := findElement(doc, tag) + if elem == nil { + return false + } + return attrContains(elem, attrKey, substr) +} + +// elementAttrAbsent checks that an element does NOT have a specific attribute. +func elementAttrAbsent(htmlStr, tag, attrKey string) bool { + return !elementHasAttr(htmlStr, tag, attrKey) +} + +// elementAttrEmpty checks if an element's attribute is present but empty. +func elementAttrEmpty(htmlStr, tag, attrKey string) bool { + doc, err := parseHTML(htmlStr) + if err != nil { + return false + } + elem := findElement(doc, tag) + if elem == nil { + return false + } + if !hasAttr(elem, attrKey) { + return false + } + return getAttr(elem, attrKey) == "" +} diff --git a/pkg/web/webui.go b/pkg/web/webui.go index a0d7e51918c609b53896e91545fd621f2a007283..03a48eae5f64469b8e98fe475a288d82591b65f4 100644 --- a/pkg/web/webui.go +++ b/pkg/web/webui.go @@ -240,6 +240,13 @@ func renderHTML(w http.ResponseWriter, templateName string, data interface{}) { } w.Header().Set("Content-Type", "text/html; charset=utf-8") + + // Security headers + // Note: style-src 'unsafe-inline' is required for inline styles in templates (tree.html, overview.html) + w.Header().Set("Content-Security-Policy", "default-src 'self'; img-src 'self' https:; style-src 'self' 'unsafe-inline'; script-src 'self'; object-src 'none'; frame-ancestors 'self'; base-uri 'none'") + w.Header().Set("Referrer-Policy", "no-referrer") + w.Header().Set("X-Content-Type-Options", "nosniff") + if err := tmpl.ExecuteTemplate(w, "layout", data); err != nil { log.Debug("template execution failed", "template", templateName, "err", err) // Already started writing response, so we can't render an error page diff --git a/pkg/web/webui_blob.go b/pkg/web/webui_blob.go index 878156497e637ab1f047d9c593d72f8db044632b..3af2373e4271ffa131b3e7d72378f0f07b105796 100644 --- a/pkg/web/webui_blob.go +++ b/pkg/web/webui_blob.go @@ -94,7 +94,16 @@ func repoBlob(w http.ResponseWriter, r *http.Request) { var renderedHTML template.HTML if isMarkdown && !isBinaryContent(content) && !showSource { - renderedHTML, _ = renderMarkdown(content) + ctx := &ReadmeContext{ + RepoName: repo.Name(), + CommitHash: refObj.Reference.ID, + ReadmePath: path, + } + var err error + renderedHTML, err = renderMarkdown(content, ctx) + if err != nil { + logger.Debug("failed to render markdown", "repo", repo.Name(), "path", path, "err", err) + } } else if !isBinaryContent(content) { renderedHTML = highlightCode(path, content) } diff --git a/pkg/web/webui_helpers.go b/pkg/web/webui_helpers.go index 7beae31a84e4e9368d968463ddfb016935ea4b38..94d5b821cc8ef1fab14997d84f13bc436c0a9623 100644 --- a/pkg/web/webui_helpers.go +++ b/pkg/web/webui_helpers.go @@ -5,6 +5,7 @@ import ( "context" "html/template" "strings" + "sync" "github.com/charmbracelet/soft-serve/git" "github.com/charmbracelet/soft-serve/pkg/backend" @@ -14,12 +15,58 @@ import ( extension "github.com/yuin/goldmark/extension" "github.com/yuin/goldmark/parser" goldmarkhtml "github.com/yuin/goldmark/renderer/html" + "github.com/yuin/goldmark/util" ) +var ( + sanitizerPolicy *bluemonday.Policy + sanitizerPolicyOnce sync.Once +) + +// getSanitizerPolicy returns a cached sanitizer policy. +// The policy is safe for concurrent use after initialization. +// Note: Tests should not use t.Parallel() with functions that call renderMarkdown +// until we verify bluemonday.Policy initialization is thread-safe during sync.Once. +func getSanitizerPolicy() *bluemonday.Policy { + sanitizerPolicyOnce.Do(func() { + policy := bluemonday.NewPolicy() + + // Allowed tags + policy.AllowElements("p", "h1", "h2", "h3", "h4", "h5", "h6", + "ul", "ol", "li", "pre", "code", "blockquote", + "strong", "em", "del", "br", "hr", + "table", "thead", "tbody", "tr", "th", "td", + "center", "a", "img") + + // Allow id attributes on headings (generated by AutoHeadingID for anchor links) + policy.AllowAttrs("id").OnElements("h1", "h2", "h3", "h4", "h5", "h6") + + // Links: only https scheme and relative URLs, add nofollow/noreferrer + policy.AllowAttrs("href").OnElements("a") + policy.RequireNoFollowOnLinks(true) + policy.RequireNoReferrerOnLinks(true) + + // Images: only https scheme or repo-relative paths, allow descriptive attrs + policy.AllowAttrs("src", "alt", "title", "width", "height").OnElements("img") + + // URL schemes: only HTTPS for external URLs, relative URLs for repo paths + policy.AllowURLSchemes("https") + policy.AllowRelativeURLs(true) + + // Table alignment attributes + policy.AllowAttrs("align").Matching(bluemonday.SpaceSeparatedTokens).OnElements("th", "td") + + sanitizerPolicy = policy + }) + return sanitizerPolicy +} + // renderMarkdown converts markdown content to sanitized HTML. -func renderMarkdown(content []byte) (template.HTML, error) { +// If ctx is provided, relative URLs will be rewritten to point to repository files. +func renderMarkdown(content []byte, ctx *ReadmeContext) (template.HTML, error) { var buf bytes.Buffer - md := goldmark.New( + + mdOpts := []goldmark.Option{ goldmark.WithExtensions(extension.GFM), goldmark.WithParserOptions( parser.WithAutoHeadingID(), @@ -27,17 +74,24 @@ func renderMarkdown(content []byte) (template.HTML, error) { goldmark.WithRendererOptions( goldmarkhtml.WithUnsafe(), ), - ) + } + + // Add URL rewriter if context is provided + if ctx != nil { + rewriter := newURLRewriter(*ctx) + mdOpts = append(mdOpts, goldmark.WithParserOptions( + parser.WithASTTransformers(util.Prioritized(rewriter, 500)), + )) + } + + md := goldmark.New(mdOpts...) if err := md.Convert(content, &buf); err != nil { return "", err } - policy := bluemonday.UGCPolicy() - policy.AllowStyling() - policy.RequireNoFollowOnLinks(false) - policy.AllowElements("center") - sanitized := policy.SanitizeBytes(buf.Bytes()) + // Use cached sanitizer policy + sanitized := getSanitizerPolicy().SanitizeBytes(buf.Bytes()) return template.HTML(sanitized), nil } @@ -57,7 +111,7 @@ func getServerReadme(ctx context.Context, be *backend.Backend) (raw string, html return "", "", err } if readme != "" { - html, err := renderMarkdown([]byte(readme)) + html, err := renderMarkdown([]byte(readme), nil) if err != nil { return "", "", err } diff --git a/pkg/web/webui_overview.go b/pkg/web/webui_overview.go index 7b14bc97f29038e66a916da635a4ff2029129807..be0a0d77f80a843550c8cc9e12f1bb1288d5547f 100644 --- a/pkg/web/webui_overview.go +++ b/pkg/web/webui_overview.go @@ -44,9 +44,18 @@ func repoOverview(w http.ResponseWriter, r *http.Request) { var readmeHTML template.HTML if !isEmpty { head, _ := gr.HEAD() - readmeContent, _, err := backend.Readme(repo, head) + readmeContent, readmePath, err := backend.Readme(repo, head) if err == nil && readmeContent != "" { - readmeHTML, _ = renderMarkdown([]byte(readmeContent)) + ctx := &ReadmeContext{ + RepoName: repo.Name(), + CommitHash: head.Reference.ID, + ReadmePath: readmePath, + } + var mdErr error + readmeHTML, mdErr = renderMarkdown([]byte(readmeContent), ctx) + if mdErr != nil { + logger.Debug("failed to render readme markdown", "repo", repo.Name(), "path", readmePath, "err", mdErr) + } } }