1package tools
2
3import (
4 "bytes"
5 "context"
6 "encoding/json"
7 "errors"
8 "fmt"
9 "io"
10 "net/http"
11 "regexp"
12 "strings"
13 "unicode/utf8"
14
15 md "github.com/JohannesKaufmann/html-to-markdown"
16 "golang.org/x/net/html"
17)
18
// BrowserUserAgent is a realistic browser User-Agent for better compatibility.
// The value identifies as Chrome 120 on 64-bit Windows 10. FetchURLAndConvert
// sends it as the User-Agent header of the requests it issues.
const BrowserUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

// multipleNewlinesRe matches a run of three or more consecutive newline
// characters. cleanupMarkdown replaces each such run with exactly two
// newlines, i.e. a single blank line.
var multipleNewlinesRe = regexp.MustCompile(`\n{3,}`)
23
24// FetchURLAndConvert fetches a URL and converts HTML content to markdown.
25func FetchURLAndConvert(ctx context.Context, client *http.Client, url string) (string, error) {
26 req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
27 if err != nil {
28 return "", fmt.Errorf("failed to create request: %w", err)
29 }
30
31 // Use realistic browser headers for better compatibility.
32 req.Header.Set("User-Agent", BrowserUserAgent)
33 req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
34 req.Header.Set("Accept-Language", "en-US,en;q=0.5")
35
36 resp, err := client.Do(req)
37 if err != nil {
38 return "", fmt.Errorf("failed to fetch URL: %w", err)
39 }
40 defer resp.Body.Close()
41
42 if resp.StatusCode != http.StatusOK {
43 return "", fmt.Errorf("request failed with status code: %d", resp.StatusCode)
44 }
45
46 maxSize := int64(5 * 1024 * 1024) // 5MB
47 body, err := io.ReadAll(io.LimitReader(resp.Body, maxSize))
48 if err != nil {
49 return "", fmt.Errorf("failed to read response body: %w", err)
50 }
51
52 content := string(body)
53
54 if !utf8.ValidString(content) {
55 return "", errors.New("response content is not valid UTF-8")
56 }
57
58 contentType := resp.Header.Get("Content-Type")
59
60 // Convert HTML to markdown for better AI processing.
61 if strings.Contains(contentType, "text/html") {
62 // Remove noisy elements before conversion.
63 cleanedHTML := removeNoisyElements(content)
64 markdown, err := ConvertHTMLToMarkdown(cleanedHTML)
65 if err != nil {
66 return "", fmt.Errorf("failed to convert HTML to markdown: %w", err)
67 }
68 content = cleanupMarkdown(markdown)
69 } else if strings.Contains(contentType, "application/json") || strings.Contains(contentType, "text/json") {
70 // Format JSON for better readability.
71 formatted, err := FormatJSON(content)
72 if err == nil {
73 content = formatted
74 }
75 // If formatting fails, keep original content.
76 }
77
78 return content, nil
79}
80
81// removeNoisyElements removes script, style, nav, header, footer, and other
82// noisy elements from HTML to improve content extraction.
83func removeNoisyElements(htmlContent string) string {
84 doc, err := html.Parse(strings.NewReader(htmlContent))
85 if err != nil {
86 // If parsing fails, return original content.
87 return htmlContent
88 }
89
90 // Elements to remove entirely.
91 noisyTags := map[string]bool{
92 "script": true,
93 "style": true,
94 "nav": true,
95 "header": true,
96 "footer": true,
97 "aside": true,
98 "noscript": true,
99 "iframe": true,
100 "svg": true,
101 }
102
103 var removeNodes func(*html.Node)
104 removeNodes = func(n *html.Node) {
105 var toRemove []*html.Node
106
107 for c := n.FirstChild; c != nil; c = c.NextSibling {
108 if c.Type == html.ElementNode && noisyTags[c.Data] {
109 toRemove = append(toRemove, c)
110 } else {
111 removeNodes(c)
112 }
113 }
114
115 for _, node := range toRemove {
116 n.RemoveChild(node)
117 }
118 }
119
120 removeNodes(doc)
121
122 var buf bytes.Buffer
123 if err := html.Render(&buf, doc); err != nil {
124 return htmlContent
125 }
126
127 return buf.String()
128}
129
130// cleanupMarkdown removes excessive whitespace and blank lines from markdown.
131func cleanupMarkdown(content string) string {
132 // Collapse multiple blank lines into at most two.
133 content = multipleNewlinesRe.ReplaceAllString(content, "\n\n")
134
135 // Remove trailing whitespace from each line.
136 lines := strings.Split(content, "\n")
137 for i, line := range lines {
138 lines[i] = strings.TrimRight(line, " \t")
139 }
140 content = strings.Join(lines, "\n")
141
142 // Trim leading/trailing whitespace.
143 content = strings.TrimSpace(content)
144
145 return content
146}
147
148// ConvertHTMLToMarkdown converts HTML content to markdown format.
149func ConvertHTMLToMarkdown(htmlContent string) (string, error) {
150 converter := md.NewConverter("", true, nil)
151
152 markdown, err := converter.ConvertString(htmlContent)
153 if err != nil {
154 return "", err
155 }
156
157 return markdown, nil
158}
159
// FormatJSON re-indents the JSON document in content and returns it with a
// trailing newline. It returns an error if content is not exactly one valid
// JSON value (empty input and trailing data are rejected).
//
// The text is re-indented directly rather than decoded into Go values and
// encoded again, so the document is preserved as written: large integers keep
// every digit, object keys keep their order, and characters such as '<', '>'
// and '&' are not rewritten as \u escapes.
func FormatJSON(content string) (string, error) {
	// json.Indent copies trailing whitespace through to the output; strip
	// the JSON whitespace characters so the result ends in a single newline.
	src := bytes.Trim([]byte(content), " \t\r\n")

	var buf bytes.Buffer
	if err := json.Indent(&buf, src, "", " "); err != nil {
		return "", err
	}
	buf.WriteByte('\n')

	return buf.String(), nil
}