fetch_helpers.go

  1package tools
  2
  3import (
  4	"bytes"
  5	"context"
  6	"encoding/json"
  7	"errors"
  8	"fmt"
  9	"io"
 10	"net/http"
 11	"regexp"
 12	"strings"
 13	"unicode/utf8"
 14
 15	md "github.com/JohannesKaufmann/html-to-markdown"
 16	"golang.org/x/net/html"
 17)
 18
// BrowserUserAgent is a realistic browser User-Agent for better compatibility.
// FetchURLAndConvert sends it as the User-Agent header of its request.
const BrowserUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

// multipleNewlinesRe matches a run of three or more consecutive newlines.
// cleanupMarkdown replaces each such run with exactly two newlines.
var multipleNewlinesRe = regexp.MustCompile(`\n{3,}`)
 23
 24// FetchURLAndConvert fetches a URL and converts HTML content to markdown.
 25func FetchURLAndConvert(ctx context.Context, client *http.Client, url string) (string, error) {
 26	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
 27	if err != nil {
 28		return "", fmt.Errorf("failed to create request: %w", err)
 29	}
 30
 31	// Use realistic browser headers for better compatibility.
 32	req.Header.Set("User-Agent", BrowserUserAgent)
 33	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
 34	req.Header.Set("Accept-Language", "en-US,en;q=0.5")
 35
 36	resp, err := client.Do(req)
 37	if err != nil {
 38		return "", fmt.Errorf("failed to fetch URL: %w", err)
 39	}
 40	defer resp.Body.Close()
 41
 42	if resp.StatusCode != http.StatusOK {
 43		return "", fmt.Errorf("request failed with status code: %d", resp.StatusCode)
 44	}
 45
 46	maxSize := int64(5 * 1024 * 1024) // 5MB
 47	body, err := io.ReadAll(io.LimitReader(resp.Body, maxSize))
 48	if err != nil {
 49		return "", fmt.Errorf("failed to read response body: %w", err)
 50	}
 51
 52	content := string(body)
 53
 54	if !utf8.ValidString(content) {
 55		return "", errors.New("response content is not valid UTF-8")
 56	}
 57
 58	contentType := resp.Header.Get("Content-Type")
 59
 60	// Convert HTML to markdown for better AI processing.
 61	if strings.Contains(contentType, "text/html") {
 62		// Remove noisy elements before conversion.
 63		cleanedHTML := removeNoisyElements(content)
 64		markdown, err := ConvertHTMLToMarkdown(cleanedHTML)
 65		if err != nil {
 66			return "", fmt.Errorf("failed to convert HTML to markdown: %w", err)
 67		}
 68		content = cleanupMarkdown(markdown)
 69	} else if strings.Contains(contentType, "application/json") || strings.Contains(contentType, "text/json") {
 70		// Format JSON for better readability.
 71		formatted, err := FormatJSON(content)
 72		if err == nil {
 73			content = formatted
 74		}
 75		// If formatting fails, keep original content.
 76	}
 77
 78	return content, nil
 79}
 80
 81// removeNoisyElements removes script, style, nav, header, footer, and other
 82// noisy elements from HTML to improve content extraction.
 83func removeNoisyElements(htmlContent string) string {
 84	doc, err := html.Parse(strings.NewReader(htmlContent))
 85	if err != nil {
 86		// If parsing fails, return original content.
 87		return htmlContent
 88	}
 89
 90	// Elements to remove entirely.
 91	noisyTags := map[string]bool{
 92		"script":   true,
 93		"style":    true,
 94		"nav":      true,
 95		"header":   true,
 96		"footer":   true,
 97		"aside":    true,
 98		"noscript": true,
 99		"iframe":   true,
100		"svg":      true,
101	}
102
103	var removeNodes func(*html.Node)
104	removeNodes = func(n *html.Node) {
105		var toRemove []*html.Node
106
107		for c := n.FirstChild; c != nil; c = c.NextSibling {
108			if c.Type == html.ElementNode && noisyTags[c.Data] {
109				toRemove = append(toRemove, c)
110			} else {
111				removeNodes(c)
112			}
113		}
114
115		for _, node := range toRemove {
116			n.RemoveChild(node)
117		}
118	}
119
120	removeNodes(doc)
121
122	var buf bytes.Buffer
123	if err := html.Render(&buf, doc); err != nil {
124		return htmlContent
125	}
126
127	return buf.String()
128}
129
130// cleanupMarkdown removes excessive whitespace and blank lines from markdown.
131func cleanupMarkdown(content string) string {
132	// Collapse multiple blank lines into at most two.
133	content = multipleNewlinesRe.ReplaceAllString(content, "\n\n")
134
135	// Remove trailing whitespace from each line.
136	lines := strings.Split(content, "\n")
137	for i, line := range lines {
138		lines[i] = strings.TrimRight(line, " \t")
139	}
140	content = strings.Join(lines, "\n")
141
142	// Trim leading/trailing whitespace.
143	content = strings.TrimSpace(content)
144
145	return content
146}
147
148// ConvertHTMLToMarkdown converts HTML content to markdown format.
149func ConvertHTMLToMarkdown(htmlContent string) (string, error) {
150	converter := md.NewConverter("", true, nil)
151
152	markdown, err := converter.ConvertString(htmlContent)
153	if err != nil {
154		return "", err
155	}
156
157	return markdown, nil
158}
159
// FormatJSON re-indents a JSON document with two spaces per level and returns
// it followed by a newline.
//
// Numbers are carried through as their original literals, so large integers
// and exact decimals are not rounded through float64. The characters <, > and
// & inside strings are written as-is instead of as \u00XX escapes. Object keys
// are emitted in sorted order.
//
// An error is returned when content is not exactly one valid JSON value
// (surrounding whitespace is allowed) or when the value cannot be re-encoded.
func FormatJSON(content string) (string, error) {
	dec := json.NewDecoder(strings.NewReader(content))
	// Keep numeric literals verbatim rather than converting them to float64.
	dec.UseNumber()

	var data any
	if err := dec.Decode(&data); err != nil {
		return "", err
	}

	// A decoder stops after the first value; reject anything but whitespace
	// after it so that trailing garbage is still reported as invalid JSON.
	switch _, err := dec.Token(); {
	case err == nil:
		return "", errors.New("invalid JSON: unexpected data after top-level value")
	case !errors.Is(err, io.EOF):
		return "", err
	}

	var out strings.Builder
	enc := json.NewEncoder(&out)
	// The output is meant for reading, not for embedding in HTML.
	enc.SetEscapeHTML(false)
	enc.SetIndent("", "  ")
	if err := enc.Encode(data); err != nil {
		return "", err
	}

	return out.String(), nil
}