1package tools
2
3import (
4 "context"
5 "errors"
6 "fmt"
7 "io"
8 "net/http"
9 "strings"
10 "unicode/utf8"
11
12 md "github.com/JohannesKaufmann/html-to-markdown"
13)
14
15// FetchURLAndConvert fetches a URL and converts HTML content to markdown.
16func FetchURLAndConvert(ctx context.Context, client *http.Client, url string) (string, error) {
17 req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
18 if err != nil {
19 return "", fmt.Errorf("failed to create request: %w", err)
20 }
21
22 req.Header.Set("User-Agent", "crush/1.0")
23
24 resp, err := client.Do(req)
25 if err != nil {
26 return "", fmt.Errorf("failed to fetch URL: %w", err)
27 }
28 defer resp.Body.Close()
29
30 if resp.StatusCode != http.StatusOK {
31 return "", fmt.Errorf("request failed with status code: %d", resp.StatusCode)
32 }
33
34 maxSize := int64(5 * 1024 * 1024) // 5MB
35 body, err := io.ReadAll(io.LimitReader(resp.Body, maxSize))
36 if err != nil {
37 return "", fmt.Errorf("failed to read response body: %w", err)
38 }
39
40 content := string(body)
41
42 if !utf8.ValidString(content) {
43 return "", errors.New("response content is not valid UTF-8")
44 }
45
46 contentType := resp.Header.Get("Content-Type")
47
48 // Convert HTML to markdown for better AI processing.
49 if strings.Contains(contentType, "text/html") {
50 markdown, err := ConvertHTMLToMarkdown(content)
51 if err != nil {
52 return "", fmt.Errorf("failed to convert HTML to markdown: %w", err)
53 }
54 content = markdown
55 }
56
57 return content, nil
58}
59
60// ConvertHTMLToMarkdown converts HTML content to markdown format.
61func ConvertHTMLToMarkdown(html string) (string, error) {
62 converter := md.NewConverter("", true, nil)
63
64 markdown, err := converter.ConvertString(html)
65 if err != nil {
66 return "", err
67 }
68
69 return markdown, nil
70}