fetch_helpers.go

 1package tools
 2
 3import (
 4	"context"
 5	"errors"
 6	"fmt"
 7	"io"
 8	"net/http"
 9	"strings"
10	"unicode/utf8"
11
12	md "github.com/JohannesKaufmann/html-to-markdown"
13)
14
15// FetchURLAndConvert fetches a URL and converts HTML content to markdown.
16func FetchURLAndConvert(ctx context.Context, client *http.Client, url string) (string, error) {
17	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
18	if err != nil {
19		return "", fmt.Errorf("failed to create request: %w", err)
20	}
21
22	req.Header.Set("User-Agent", "crush/1.0")
23
24	resp, err := client.Do(req)
25	if err != nil {
26		return "", fmt.Errorf("failed to fetch URL: %w", err)
27	}
28	defer resp.Body.Close()
29
30	if resp.StatusCode != http.StatusOK {
31		return "", fmt.Errorf("request failed with status code: %d", resp.StatusCode)
32	}
33
34	maxSize := int64(5 * 1024 * 1024) // 5MB
35	body, err := io.ReadAll(io.LimitReader(resp.Body, maxSize))
36	if err != nil {
37		return "", fmt.Errorf("failed to read response body: %w", err)
38	}
39
40	content := string(body)
41
42	if !utf8.ValidString(content) {
43		return "", errors.New("response content is not valid UTF-8")
44	}
45
46	contentType := resp.Header.Get("Content-Type")
47
48	// Convert HTML to markdown for better AI processing.
49	if strings.Contains(contentType, "text/html") {
50		markdown, err := ConvertHTMLToMarkdown(content)
51		if err != nil {
52			return "", fmt.Errorf("failed to convert HTML to markdown: %w", err)
53		}
54		content = markdown
55	}
56
57	return content, nil
58}
59
60// ConvertHTMLToMarkdown converts HTML content to markdown format.
61func ConvertHTMLToMarkdown(html string) (string, error) {
62	converter := md.NewConverter("", true, nil)
63
64	markdown, err := converter.ConvertString(html)
65	if err != nil {
66		return "", err
67	}
68
69	return markdown, nil
70}