fetch.go

  1package tools
  2
  3import (
  4	"context"
  5	"fmt"
  6	"io"
  7	"net/http"
  8	"strings"
  9	"time"
 10	"unicode/utf8"
 11
 12	md "github.com/JohannesKaufmann/html-to-markdown"
 13	"github.com/PuerkitoBio/goquery"
 14	"github.com/charmbracelet/crush/internal/ai"
 15	"github.com/charmbracelet/crush/internal/permission"
 16)
 17
 18type FetchParams struct {
 19	URL     string `json:"url" description:"The URL to fetch content from"`
 20	Format  string `json:"format" description:"The format to return the content in (text, markdown, or html)"`
 21	Timeout int    `json:"timeout,omitempty" description:"Optional timeout in seconds (max 120)"`
 22}
 23
 24type FetchPermissionsParams struct {
 25	URL     string `json:"url"`
 26	Format  string `json:"format"`
 27	Timeout int    `json:"timeout,omitempty"`
 28}
 29
 30const (
 31	FetchToolName = "fetch"
 32)
 33
 34func NewFetchTool(permissions permission.Service, workingDir string) ai.AgentTool {
 35	client := &http.Client{
 36		Timeout: 30 * time.Second,
 37		Transport: &http.Transport{
 38			MaxIdleConns:        100,
 39			MaxIdleConnsPerHost: 10,
 40			IdleConnTimeout:     90 * time.Second,
 41		},
 42	}
 43	return ai.NewTypedToolFunc(
 44		FetchToolName,
 45		`Fetches content from a URL and returns it in the specified format.
 46
 47WHEN TO USE THIS TOOL:
 48- Use when you need to download content from a URL
 49- Helpful for retrieving documentation, API responses, or web content
 50- Useful for getting external information to assist with tasks
 51
 52HOW TO USE:
 53- Provide the URL to fetch content from
 54- Specify the desired output format (text, markdown, or html)
 55- Optionally set a timeout for the request
 56
 57FEATURES:
 58- Supports three output formats: text, markdown, and html
 59- Automatically handles HTTP redirects
 60- Sets reasonable timeouts to prevent hanging
 61- Validates input parameters before making requests
 62
 63LIMITATIONS:
 64- Maximum response size is 5MB
 65- Only supports HTTP and HTTPS protocols
 66- Cannot handle authentication or cookies
 67- Some websites may block automated requests
 68
 69TIPS:
 70- Use text format for plain text content or simple API responses
 71- Use markdown format for content that should be rendered with formatting
 72- Use html format when you need the raw HTML structure
 73- Set appropriate timeouts for potentially slow websites`,
 74		func(ctx context.Context, params FetchParams, call ai.ToolCall) (ai.ToolResponse, error) {
 75			if params.URL == "" {
 76				return ai.NewTextErrorResponse("URL parameter is required"), nil
 77			}
 78
 79			format := strings.ToLower(params.Format)
 80			if format != "text" && format != "markdown" && format != "html" {
 81				return ai.NewTextErrorResponse("Format must be one of: text, markdown, html"), nil
 82			}
 83
 84			if !strings.HasPrefix(params.URL, "http://") && !strings.HasPrefix(params.URL, "https://") {
 85				return ai.NewTextErrorResponse("URL must start with http:// or https://"), nil
 86			}
 87
 88			sessionID, messageID := GetContextValues(ctx)
 89			if sessionID == "" || messageID == "" {
 90				return ai.ToolResponse{}, fmt.Errorf("session ID and message ID are required for creating a new file")
 91			}
 92
 93			granted := permissions.Request(
 94				permission.CreatePermissionRequest{
 95					SessionID:   sessionID,
 96					Path:        workingDir,
 97					ToolCallID:  call.ID,
 98					ToolName:    FetchToolName,
 99					Action:      "fetch",
100					Description: fmt.Sprintf("Fetch content from URL: %s", params.URL),
101					Params:      FetchPermissionsParams(params),
102				},
103			)
104
105			if !granted {
106				return ai.ToolResponse{}, permission.ErrorPermissionDenied
107			}
108
109			// Handle timeout with context
110			requestCtx := ctx
111			if params.Timeout > 0 {
112				maxTimeout := 120 // 2 minutes
113				if params.Timeout > maxTimeout {
114					params.Timeout = maxTimeout
115				}
116				var cancel context.CancelFunc
117				requestCtx, cancel = context.WithTimeout(ctx, time.Duration(params.Timeout)*time.Second)
118				defer cancel()
119			}
120
121			req, err := http.NewRequestWithContext(requestCtx, "GET", params.URL, nil)
122			if err != nil {
123				return ai.ToolResponse{}, fmt.Errorf("failed to create request: %w", err)
124			}
125
126			req.Header.Set("User-Agent", "crush/1.0")
127
128			resp, err := client.Do(req)
129			if err != nil {
130				return ai.ToolResponse{}, fmt.Errorf("failed to fetch URL: %w", err)
131			}
132			defer resp.Body.Close()
133
134			if resp.StatusCode != http.StatusOK {
135				return ai.NewTextErrorResponse(fmt.Sprintf("Request failed with status code: %d", resp.StatusCode)), nil
136			}
137
138			maxSize := int64(5 * 1024 * 1024) // 5MB
139			body, err := io.ReadAll(io.LimitReader(resp.Body, maxSize))
140			if err != nil {
141				return ai.NewTextErrorResponse("Failed to read response body: " + err.Error()), nil
142			}
143
144			content := string(body)
145
146			isValidUt8 := utf8.ValidString(content)
147			if !isValidUt8 {
148				return ai.NewTextErrorResponse("Response content is not valid UTF-8"), nil
149			}
150			contentType := resp.Header.Get("Content-Type")
151
152			switch format {
153			case "text":
154				if strings.Contains(contentType, "text/html") {
155					text, err := extractTextFromHTML(content)
156					if err != nil {
157						return ai.NewTextErrorResponse("Failed to extract text from HTML: " + err.Error()), nil
158					}
159					content = text
160				}
161
162			case "markdown":
163				if strings.Contains(contentType, "text/html") {
164					markdown, err := convertHTMLToMarkdown(content)
165					if err != nil {
166						return ai.NewTextErrorResponse("Failed to convert HTML to Markdown: " + err.Error()), nil
167					}
168					content = markdown
169				}
170
171				content = "```\n" + content + "\n```"
172
173			case "html":
174				// return only the body of the HTML document
175				if strings.Contains(contentType, "text/html") {
176					doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
177					if err != nil {
178						return ai.NewTextErrorResponse("Failed to parse HTML: " + err.Error()), nil
179					}
180					body, err := doc.Find("body").Html()
181					if err != nil {
182						return ai.NewTextErrorResponse("Failed to extract body from HTML: " + err.Error()), nil
183					}
184					if body == "" {
185						return ai.NewTextErrorResponse("No body content found in HTML"), nil
186					}
187					content = "<html>\n<body>\n" + body + "\n</body>\n</html>"
188				}
189			}
190			// calculate byte size of content
191			contentSize := int64(len(content))
192			if contentSize > MaxReadSize {
193				content = content[:MaxReadSize]
194				content += fmt.Sprintf("\n\n[Content truncated to %d bytes]", MaxReadSize)
195			}
196
197			return ai.NewTextResponse(content), nil
198		})
199}
200
201func extractTextFromHTML(html string) (string, error) {
202	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
203	if err != nil {
204		return "", err
205	}
206
207	text := doc.Find("body").Text()
208	text = strings.Join(strings.Fields(text), " ")
209
210	return text, nil
211}
212
213func convertHTMLToMarkdown(html string) (string, error) {
214	converter := md.NewConverter("", true, nil)
215
216	markdown, err := converter.ConvertString(html)
217	if err != nil {
218		return "", err
219	}
220
221	return markdown, nil
222}