fetch.go

  1package tools
  2
  3import (
  4	"context"
  5	"encoding/json"
  6	"fmt"
  7	"io"
  8	"net/http"
  9	"strings"
 10	"time"
 11	"unicode/utf8"
 12
 13	md "github.com/JohannesKaufmann/html-to-markdown"
 14	"github.com/PuerkitoBio/goquery"
 15	"github.com/charmbracelet/crush/internal/permission"
 16)
 17
 18type FetchParams struct {
 19	URL     string `json:"url"`
 20	Format  string `json:"format"`
 21	Timeout int    `json:"timeout,omitempty"`
 22}
 23
 24type FetchPermissionsParams struct {
 25	URL     string `json:"url"`
 26	Format  string `json:"format"`
 27	Timeout int    `json:"timeout,omitempty"`
 28}
 29
 30type fetchTool struct {
 31	client      *http.Client
 32	permissions permission.Service
 33	workingDir  string
 34}
 35
 36const (
 37	FetchToolName        = "fetch"
 38	fetchToolDescription = `Fetches content from a URL and returns it in the specified format.
 39
 40WHEN TO USE THIS TOOL:
 41- Use when you need to download content from a URL
 42- Helpful for retrieving documentation, API responses, or web content
 43- Useful for getting external information to assist with tasks
 44
 45HOW TO USE:
 46- Provide the URL to fetch content from
 47- Specify the desired output format (text, markdown, or html)
 48- Optionally set a timeout for the request
 49
 50FEATURES:
 51- Supports three output formats: text, markdown, and html
 52- Automatically handles HTTP redirects
 53- Sets reasonable timeouts to prevent hanging
 54- Validates input parameters before making requests
 55
 56LIMITATIONS:
 57- Maximum response size is 5MB
 58- Only supports HTTP and HTTPS protocols
 59- Cannot handle authentication or cookies
 60- Some websites may block automated requests
 61
 62TIPS:
 63- Use text format for plain text content or simple API responses
 64- Use markdown format for content that should be rendered with formatting
 65- Use html format when you need the raw HTML structure
 66- Set appropriate timeouts for potentially slow websites`
 67)
 68
 69func NewFetchTool(permissions permission.Service, workingDir string) BaseTool {
 70	return &fetchTool{
 71		client: &http.Client{
 72			Timeout: 30 * time.Second,
 73			Transport: &http.Transport{
 74				MaxIdleConns:        100,
 75				MaxIdleConnsPerHost: 10,
 76				IdleConnTimeout:     90 * time.Second,
 77			},
 78		},
 79		permissions: permissions,
 80		workingDir:  workingDir,
 81	}
 82}
 83
 84func (t *fetchTool) Info() ToolInfo {
 85	return ToolInfo{
 86		Name:        FetchToolName,
 87		Description: fetchToolDescription,
 88		Parameters: map[string]any{
 89			"url": map[string]any{
 90				"type":        "string",
 91				"description": "The URL to fetch content from",
 92			},
 93			"format": map[string]any{
 94				"type":        "string",
 95				"description": "The format to return the content in (text, markdown, or html)",
 96				"enum":        []string{"text", "markdown", "html"},
 97			},
 98			"timeout": map[string]any{
 99				"type":        "number",
100				"description": "Optional timeout in seconds (max 120)",
101			},
102		},
103		Required: []string{"url", "format"},
104	}
105}
106
107func (t *fetchTool) Run(ctx context.Context, call ToolCall) (ToolResponse, error) {
108	var params FetchParams
109	if err := json.Unmarshal([]byte(call.Input), &params); err != nil {
110		return NewTextErrorResponse("Failed to parse fetch parameters: " + err.Error()), nil
111	}
112
113	if params.URL == "" {
114		return NewTextErrorResponse("URL parameter is required"), nil
115	}
116
117	format := strings.ToLower(params.Format)
118	if format != "text" && format != "markdown" && format != "html" {
119		return NewTextErrorResponse("Format must be one of: text, markdown, html"), nil
120	}
121
122	if !strings.HasPrefix(params.URL, "http://") && !strings.HasPrefix(params.URL, "https://") {
123		return NewTextErrorResponse("URL must start with http:// or https://"), nil
124	}
125
126	sessionID, messageID := GetContextValues(ctx)
127	if sessionID == "" || messageID == "" {
128		return ToolResponse{}, fmt.Errorf("session ID and message ID are required for creating a new file")
129	}
130
131	p := t.permissions.Request(
132		permission.CreatePermissionRequest{
133			SessionID:   sessionID,
134			Path:        t.workingDir,
135			ToolCallID:  call.ID,
136			ToolName:    FetchToolName,
137			Action:      "fetch",
138			Description: fmt.Sprintf("Fetch content from URL: %s", params.URL),
139			Params:      FetchPermissionsParams(params),
140		},
141	)
142
143	if !p {
144		return ToolResponse{}, permission.ErrorPermissionDenied
145	}
146
147	// Handle timeout with context
148	requestCtx := ctx
149	if params.Timeout > 0 {
150		maxTimeout := 120 // 2 minutes
151		if params.Timeout > maxTimeout {
152			params.Timeout = maxTimeout
153		}
154		var cancel context.CancelFunc
155		requestCtx, cancel = context.WithTimeout(ctx, time.Duration(params.Timeout)*time.Second)
156		defer cancel()
157	}
158
159	req, err := http.NewRequestWithContext(requestCtx, "GET", params.URL, nil)
160	if err != nil {
161		return ToolResponse{}, fmt.Errorf("failed to create request: %w", err)
162	}
163
164	req.Header.Set("User-Agent", "crush/1.0")
165
166	resp, err := t.client.Do(req)
167	if err != nil {
168		return ToolResponse{}, fmt.Errorf("failed to fetch URL: %w", err)
169	}
170	defer resp.Body.Close()
171
172	if resp.StatusCode != http.StatusOK {
173		return NewTextErrorResponse(fmt.Sprintf("Request failed with status code: %d", resp.StatusCode)), nil
174	}
175
176	maxSize := int64(5 * 1024 * 1024) // 5MB
177	body, err := io.ReadAll(io.LimitReader(resp.Body, maxSize))
178	if err != nil {
179		return NewTextErrorResponse("Failed to read response body: " + err.Error()), nil
180	}
181
182	content := string(body)
183
184	isValidUt8 := utf8.ValidString(content)
185	if !isValidUt8 {
186		return NewTextErrorResponse("Response content is not valid UTF-8"), nil
187	}
188	contentType := resp.Header.Get("Content-Type")
189
190	switch format {
191	case "text":
192		if strings.Contains(contentType, "text/html") {
193			text, err := extractTextFromHTML(content)
194			if err != nil {
195				return NewTextErrorResponse("Failed to extract text from HTML: " + err.Error()), nil
196			}
197			content = text
198		}
199
200	case "markdown":
201		if strings.Contains(contentType, "text/html") {
202			markdown, err := convertHTMLToMarkdown(content)
203			if err != nil {
204				return NewTextErrorResponse("Failed to convert HTML to Markdown: " + err.Error()), nil
205			}
206			content = markdown
207		}
208
209		content = "```\n" + content + "\n```"
210
211	case "html":
212		// return only the body of the HTML document
213		if strings.Contains(contentType, "text/html") {
214			doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
215			if err != nil {
216				return NewTextErrorResponse("Failed to parse HTML: " + err.Error()), nil
217			}
218			body, err := doc.Find("body").Html()
219			if err != nil {
220				return NewTextErrorResponse("Failed to extract body from HTML: " + err.Error()), nil
221			}
222			if body == "" {
223				return NewTextErrorResponse("No body content found in HTML"), nil
224			}
225			content = "<html>\n<body>\n" + body + "\n</body>\n</html>"
226		}
227	}
228	// calculate byte size of content
229	contentSize := int64(len(content))
230	if contentSize > MaxReadSize {
231		content = content[:MaxReadSize]
232		content += fmt.Sprintf("\n\n[Content truncated to %d bytes]", MaxReadSize)
233	}
234
235	return NewTextResponse(content), nil
236}
237
238func extractTextFromHTML(html string) (string, error) {
239	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
240	if err != nil {
241		return "", err
242	}
243
244	text := doc.Find("body").Text()
245	text = strings.Join(strings.Fields(text), " ")
246
247	return text, nil
248}
249
250func convertHTMLToMarkdown(html string) (string, error) {
251	converter := md.NewConverter("", true, nil)
252
253	markdown, err := converter.ConvertString(html)
254	if err != nil {
255		return "", err
256	}
257
258	return markdown, nil
259}