1package tools
2
3import (
4 "bytes"
5 "context"
6 "encoding/json"
7 "errors"
8 "fmt"
9 "io"
10 "net/http"
11 "regexp"
12 "strings"
13 "unicode/utf8"
14
15 md "github.com/JohannesKaufmann/html-to-markdown"
16 "golang.org/x/net/html"
17)
18
// BrowserUserAgent is the User-Agent header value sent by FetchURLAndConvert.
// It imitates a desktop Chrome browser on Windows for better compatibility
// than a default client identifier.
const BrowserUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
21
22// FetchURLAndConvert fetches a URL and converts HTML content to markdown.
23func FetchURLAndConvert(ctx context.Context, client *http.Client, url string) (string, error) {
24 req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
25 if err != nil {
26 return "", fmt.Errorf("failed to create request: %w", err)
27 }
28
29 // Use realistic browser headers for better compatibility.
30 req.Header.Set("User-Agent", BrowserUserAgent)
31 req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
32 req.Header.Set("Accept-Language", "en-US,en;q=0.5")
33
34 resp, err := client.Do(req)
35 if err != nil {
36 return "", fmt.Errorf("failed to fetch URL: %w", err)
37 }
38 defer resp.Body.Close()
39
40 if resp.StatusCode != http.StatusOK {
41 return "", fmt.Errorf("request failed with status code: %d", resp.StatusCode)
42 }
43
44 maxSize := int64(5 * 1024 * 1024) // 5MB
45 body, err := io.ReadAll(io.LimitReader(resp.Body, maxSize))
46 if err != nil {
47 return "", fmt.Errorf("failed to read response body: %w", err)
48 }
49
50 content := string(body)
51
52 if !utf8.ValidString(content) {
53 return "", errors.New("response content is not valid UTF-8")
54 }
55
56 contentType := resp.Header.Get("Content-Type")
57
58 // Convert HTML to markdown for better AI processing.
59 if strings.Contains(contentType, "text/html") {
60 // Remove noisy elements before conversion.
61 cleanedHTML := removeNoisyElements(content)
62 markdown, err := ConvertHTMLToMarkdown(cleanedHTML)
63 if err != nil {
64 return "", fmt.Errorf("failed to convert HTML to markdown: %w", err)
65 }
66 content = cleanupMarkdown(markdown)
67 } else if strings.Contains(contentType, "application/json") || strings.Contains(contentType, "text/json") {
68 // Format JSON for better readability.
69 formatted, err := FormatJSON(content)
70 if err == nil {
71 content = formatted
72 }
73 // If formatting fails, keep original content.
74 }
75
76 return content, nil
77}
78
79// removeNoisyElements removes script, style, nav, header, footer, and other
80// noisy elements from HTML to improve content extraction.
81func removeNoisyElements(htmlContent string) string {
82 doc, err := html.Parse(strings.NewReader(htmlContent))
83 if err != nil {
84 // If parsing fails, return original content.
85 return htmlContent
86 }
87
88 // Elements to remove entirely.
89 noisyTags := map[string]bool{
90 "script": true,
91 "style": true,
92 "nav": true,
93 "header": true,
94 "footer": true,
95 "aside": true,
96 "noscript": true,
97 "iframe": true,
98 "svg": true,
99 }
100
101 var removeNodes func(*html.Node)
102 removeNodes = func(n *html.Node) {
103 var toRemove []*html.Node
104
105 for c := n.FirstChild; c != nil; c = c.NextSibling {
106 if c.Type == html.ElementNode && noisyTags[c.Data] {
107 toRemove = append(toRemove, c)
108 } else {
109 removeNodes(c)
110 }
111 }
112
113 for _, node := range toRemove {
114 n.RemoveChild(node)
115 }
116 }
117
118 removeNodes(doc)
119
120 var buf bytes.Buffer
121 if err := html.Render(&buf, doc); err != nil {
122 return htmlContent
123 }
124
125 return buf.String()
126}
127
// excessNewlines matches runs of three or more consecutive newlines. It is
// compiled once at package scope instead of on every cleanupMarkdown call.
var excessNewlines = regexp.MustCompile(`\n{3,}`)

// cleanupMarkdown tidies whitespace in markdown text. It strips trailing
// spaces and tabs from every line, collapses runs of blank lines into a single
// blank line, and trims leading and trailing whitespace from the result.
func cleanupMarkdown(content string) string {
	// Strip trailing whitespace first so that lines holding only spaces or
	// tabs become truly empty and take part in the collapse below.
	lines := strings.Split(content, "\n")
	for i, line := range lines {
		lines[i] = strings.TrimRight(line, " \t")
	}
	content = strings.Join(lines, "\n")

	// Three or more newlines in a row mean two or more blank lines; keep one.
	content = excessNewlines.ReplaceAllString(content, "\n\n")

	return strings.TrimSpace(content)
}
146
147// ConvertHTMLToMarkdown converts HTML content to markdown format.
148func ConvertHTMLToMarkdown(htmlContent string) (string, error) {
149 converter := md.NewConverter("", true, nil)
150
151 markdown, err := converter.ConvertString(htmlContent)
152 if err != nil {
153 return "", err
154 }
155
156 return markdown, nil
157}
158
// FormatJSON re-encodes a single JSON document with indentation.
//
// Numbers are decoded as json.Number so their original text is written back
// verbatim; decoding into float64 would silently corrupt large integers and
// high-precision decimals. Object keys are emitted in sorted order and the
// output ends with a newline, as produced by json.Encoder.
//
// An error is returned when content is empty, is not valid JSON, or contains
// anything other than whitespace after the first top-level value.
func FormatJSON(content string) (string, error) {
	dec := json.NewDecoder(strings.NewReader(content))
	dec.UseNumber()

	var data any
	if err := dec.Decode(&data); err != nil {
		return "", err
	}

	// Unlike json.Unmarshal, a Decoder stops after the first value, so check
	// explicitly that only whitespace remains.
	if _, err := dec.Token(); !errors.Is(err, io.EOF) {
		if err == nil {
			err = errors.New("invalid JSON: unexpected data after top-level value")
		}
		return "", err
	}

	var buf bytes.Buffer
	encoder := json.NewEncoder(&buf)
	encoder.SetIndent("", " ")
	if err := encoder.Encode(data); err != nil {
		return "", err
	}

	return buf.String(), nil
}