fetch_helpers.go

  1package tools
  2
  3import (
  4	"bytes"
  5	"context"
  6	"encoding/json"
  7	"errors"
  8	"fmt"
  9	"io"
 10	"net/http"
 11	"regexp"
 12	"strings"
 13	"unicode/utf8"
 14
 15	md "github.com/JohannesKaufmann/html-to-markdown"
 16	"golang.org/x/net/html"
 17)
 18
// BrowserUserAgent is a realistic browser User-Agent string (Chrome 120 on
// 64-bit Windows 10). FetchURLAndConvert sends it as the User-Agent header of
// every request for better compatibility.
const BrowserUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
 21
 22// FetchURLAndConvert fetches a URL and converts HTML content to markdown.
 23func FetchURLAndConvert(ctx context.Context, client *http.Client, url string) (string, error) {
 24	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
 25	if err != nil {
 26		return "", fmt.Errorf("failed to create request: %w", err)
 27	}
 28
 29	// Use realistic browser headers for better compatibility.
 30	req.Header.Set("User-Agent", BrowserUserAgent)
 31	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
 32	req.Header.Set("Accept-Language", "en-US,en;q=0.5")
 33
 34	resp, err := client.Do(req)
 35	if err != nil {
 36		return "", fmt.Errorf("failed to fetch URL: %w", err)
 37	}
 38	defer resp.Body.Close()
 39
 40	if resp.StatusCode != http.StatusOK {
 41		return "", fmt.Errorf("request failed with status code: %d", resp.StatusCode)
 42	}
 43
 44	maxSize := int64(5 * 1024 * 1024) // 5MB
 45	body, err := io.ReadAll(io.LimitReader(resp.Body, maxSize))
 46	if err != nil {
 47		return "", fmt.Errorf("failed to read response body: %w", err)
 48	}
 49
 50	content := string(body)
 51
 52	if !utf8.ValidString(content) {
 53		return "", errors.New("response content is not valid UTF-8")
 54	}
 55
 56	contentType := resp.Header.Get("Content-Type")
 57
 58	// Convert HTML to markdown for better AI processing.
 59	if strings.Contains(contentType, "text/html") {
 60		// Remove noisy elements before conversion.
 61		cleanedHTML := removeNoisyElements(content)
 62		markdown, err := ConvertHTMLToMarkdown(cleanedHTML)
 63		if err != nil {
 64			return "", fmt.Errorf("failed to convert HTML to markdown: %w", err)
 65		}
 66		content = cleanupMarkdown(markdown)
 67	} else if strings.Contains(contentType, "application/json") || strings.Contains(contentType, "text/json") {
 68		// Format JSON for better readability.
 69		formatted, err := FormatJSON(content)
 70		if err == nil {
 71			content = formatted
 72		}
 73		// If formatting fails, keep original content.
 74	}
 75
 76	return content, nil
 77}
 78
 79// removeNoisyElements removes script, style, nav, header, footer, and other
 80// noisy elements from HTML to improve content extraction.
 81func removeNoisyElements(htmlContent string) string {
 82	doc, err := html.Parse(strings.NewReader(htmlContent))
 83	if err != nil {
 84		// If parsing fails, return original content.
 85		return htmlContent
 86	}
 87
 88	// Elements to remove entirely.
 89	noisyTags := map[string]bool{
 90		"script":   true,
 91		"style":    true,
 92		"nav":      true,
 93		"header":   true,
 94		"footer":   true,
 95		"aside":    true,
 96		"noscript": true,
 97		"iframe":   true,
 98		"svg":      true,
 99	}
100
101	var removeNodes func(*html.Node)
102	removeNodes = func(n *html.Node) {
103		var toRemove []*html.Node
104
105		for c := n.FirstChild; c != nil; c = c.NextSibling {
106			if c.Type == html.ElementNode && noisyTags[c.Data] {
107				toRemove = append(toRemove, c)
108			} else {
109				removeNodes(c)
110			}
111		}
112
113		for _, node := range toRemove {
114			n.RemoveChild(node)
115		}
116	}
117
118	removeNodes(doc)
119
120	var buf bytes.Buffer
121	if err := html.Render(&buf, doc); err != nil {
122		return htmlContent
123	}
124
125	return buf.String()
126}
127
// excessBlankLines matches three or more consecutive newlines, i.e. two or
// more blank lines in a row. It is compiled once at package scope instead of
// on every cleanupMarkdown call.
var excessBlankLines = regexp.MustCompile(`\n{3,}`)

// cleanupMarkdown removes excessive whitespace and blank lines from markdown.
// It strips trailing spaces and tabs from every line, collapses runs of blank
// lines into a single blank line, and trims leading and trailing whitespace
// from the text as a whole.
func cleanupMarkdown(content string) string {
	// Strip trailing whitespace first: lines that hold only spaces or tabs
	// become truly empty and are then caught by the blank-line collapse.
	lines := strings.Split(content, "\n")
	for i, line := range lines {
		lines[i] = strings.TrimRight(line, " \t")
	}
	content = strings.Join(lines, "\n")

	// Collapse multiple blank lines into at most one.
	content = excessBlankLines.ReplaceAllString(content, "\n\n")

	// Trim leading/trailing whitespace.
	return strings.TrimSpace(content)
}
146
147// ConvertHTMLToMarkdown converts HTML content to markdown format.
148func ConvertHTMLToMarkdown(htmlContent string) (string, error) {
149	converter := md.NewConverter("", true, nil)
150
151	markdown, err := converter.ConvertString(htmlContent)
152	if err != nil {
153		return "", err
154	}
155
156	return markdown, nil
157}
158
// FormatJSON formats JSON content with two-space indentation.
//
// content must hold exactly one JSON value, optionally surrounded by
// whitespace; anything else yields an error and an empty string. Object keys
// are emitted in sorted order and the result ends with a newline.
//
// Numbers are carried through as their original literals rather than being
// converted to float64, so large integers keep every digit. The characters
// <, > and & inside strings are written as-is instead of as \u00XX escapes,
// since the output is meant to be read, not embedded in HTML.
func FormatJSON(content string) (string, error) {
	dec := json.NewDecoder(strings.NewReader(content))
	// Keep numbers as json.Number so re-encoding cannot lose precision.
	dec.UseNumber()

	var data any
	if err := dec.Decode(&data); err != nil {
		return "", err
	}

	// Decode stops after the first value; reject trailing data so that
	// malformed input is still reported as an error.
	if _, err := dec.Token(); !errors.Is(err, io.EOF) {
		if err == nil {
			err = errors.New("invalid JSON: unexpected data after top-level value")
		}
		return "", err
	}

	var buf bytes.Buffer
	enc := json.NewEncoder(&buf)
	enc.SetEscapeHTML(false)
	enc.SetIndent("", "  ")
	if err := enc.Encode(data); err != nil {
		return "", err
	}

	return buf.String(), nil
}