fetch.go

  1package tools
  2
  3import (
  4	"context"
  5	"encoding/json"
  6	"fmt"
  7	"io"
  8	"net/http"
  9	"strings"
 10	"time"
 11	"unicode/utf8"
 12
 13	md "github.com/JohannesKaufmann/html-to-markdown"
 14	"github.com/PuerkitoBio/goquery"
 15	"github.com/charmbracelet/crush/internal/permission"
 16)
 17
 18type FetchParams struct {
 19	URL     string `json:"url"`
 20	Format  string `json:"format"`
 21	Timeout int    `json:"timeout,omitempty"`
 22}
 23
 24type FetchPermissionsParams struct {
 25	URL     string `json:"url"`
 26	Format  string `json:"format"`
 27	Timeout int    `json:"timeout,omitempty"`
 28}
 29
 30type fetchTool struct {
 31	client      *http.Client
 32	permissions permission.Service
 33	workingDir  string
 34}
 35
 36const (
 37	FetchToolName        = "fetch"
 38	fetchToolDescription = `Fetches content from a URL and returns it in the specified format.
 39
 40WHEN TO USE THIS TOOL:
 41- Use when you need to download content from a URL
 42- Helpful for retrieving documentation, API responses, or web content
 43- Useful for getting external information to assist with tasks
 44
 45HOW TO USE:
 46- Provide the URL to fetch content from
 47- Specify the desired output format (text, markdown, or html)
 48- Optionally set a timeout for the request
 49
 50FEATURES:
 51- Supports three output formats: text, markdown, and html
 52- Automatically handles HTTP redirects
 53- Sets reasonable timeouts to prevent hanging
 54- Validates input parameters before making requests
 55
 56LIMITATIONS:
 57- Maximum response size is 5MB
 58- Only supports HTTP and HTTPS protocols
 59- Cannot handle authentication or cookies
 60- Some websites may block automated requests
 61
 62TIPS:
 63- Use text format for plain text content or simple API responses
 64- Use markdown format for content that should be rendered with formatting
 65- Use html format when you need the raw HTML structure
 66- Set appropriate timeouts for potentially slow websites`
 67)
 68
 69func NewFetchTool(permissions permission.Service, workingDir string) BaseTool {
 70	return &fetchTool{
 71		client: &http.Client{
 72			Timeout: 30 * time.Second,
 73			Transport: &http.Transport{
 74				MaxIdleConns:        100,
 75				MaxIdleConnsPerHost: 10,
 76				IdleConnTimeout:     90 * time.Second,
 77			},
 78		},
 79		permissions: permissions,
 80		workingDir:  workingDir,
 81	}
 82}
 83
 84func (t *fetchTool) Name() string {
 85	return FetchToolName
 86}
 87
 88func (t *fetchTool) Info() ToolInfo {
 89	return ToolInfo{
 90		Name:        FetchToolName,
 91		Description: fetchToolDescription,
 92		Parameters: map[string]any{
 93			"url": map[string]any{
 94				"type":        "string",
 95				"description": "The URL to fetch content from",
 96			},
 97			"format": map[string]any{
 98				"type":        "string",
 99				"description": "The format to return the content in (text, markdown, or html)",
100				"enum":        []string{"text", "markdown", "html"},
101			},
102			"timeout": map[string]any{
103				"type":        "number",
104				"description": "Optional timeout in seconds (max 120)",
105			},
106		},
107		Required: []string{"url", "format"},
108	}
109}
110
111func (t *fetchTool) Run(ctx context.Context, call ToolCall) (ToolResponse, error) {
112	var params FetchParams
113	if err := json.Unmarshal([]byte(call.Input), &params); err != nil {
114		return NewTextErrorResponse("Failed to parse fetch parameters: " + err.Error()), nil
115	}
116
117	if params.URL == "" {
118		return NewTextErrorResponse("URL parameter is required"), nil
119	}
120
121	format := strings.ToLower(params.Format)
122	if format != "text" && format != "markdown" && format != "html" {
123		return NewTextErrorResponse("Format must be one of: text, markdown, html"), nil
124	}
125
126	if !strings.HasPrefix(params.URL, "http://") && !strings.HasPrefix(params.URL, "https://") {
127		return NewTextErrorResponse("URL must start with http:// or https://"), nil
128	}
129
130	sessionID, messageID := GetContextValues(ctx)
131	if sessionID == "" || messageID == "" {
132		return ToolResponse{}, fmt.Errorf("session ID and message ID are required for creating a new file")
133	}
134
135	p := t.permissions.Request(
136		permission.CreatePermissionRequest{
137			SessionID:   sessionID,
138			Path:        t.workingDir,
139			ToolName:    FetchToolName,
140			Action:      "fetch",
141			Description: fmt.Sprintf("Fetch content from URL: %s", params.URL),
142			Params:      FetchPermissionsParams(params),
143		},
144	)
145
146	if !p {
147		return ToolResponse{}, permission.ErrorPermissionDenied
148	}
149
150	// Handle timeout with context
151	requestCtx := ctx
152	if params.Timeout > 0 {
153		maxTimeout := 120 // 2 minutes
154		if params.Timeout > maxTimeout {
155			params.Timeout = maxTimeout
156		}
157		var cancel context.CancelFunc
158		requestCtx, cancel = context.WithTimeout(ctx, time.Duration(params.Timeout)*time.Second)
159		defer cancel()
160	}
161
162	req, err := http.NewRequestWithContext(requestCtx, "GET", params.URL, nil)
163	if err != nil {
164		return ToolResponse{}, fmt.Errorf("failed to create request: %w", err)
165	}
166
167	req.Header.Set("User-Agent", "crush/1.0")
168
169	resp, err := t.client.Do(req)
170	if err != nil {
171		return ToolResponse{}, fmt.Errorf("failed to fetch URL: %w", err)
172	}
173	defer resp.Body.Close()
174
175	if resp.StatusCode != http.StatusOK {
176		return NewTextErrorResponse(fmt.Sprintf("Request failed with status code: %d", resp.StatusCode)), nil
177	}
178
179	maxSize := int64(5 * 1024 * 1024) // 5MB
180	body, err := io.ReadAll(io.LimitReader(resp.Body, maxSize))
181	if err != nil {
182		return NewTextErrorResponse("Failed to read response body: " + err.Error()), nil
183	}
184
185	content := string(body)
186
187	isValidUt8 := utf8.ValidString(content)
188	if !isValidUt8 {
189		return NewTextErrorResponse("Response content is not valid UTF-8"), nil
190	}
191	contentType := resp.Header.Get("Content-Type")
192
193	switch format {
194	case "text":
195		if strings.Contains(contentType, "text/html") {
196			text, err := extractTextFromHTML(content)
197			if err != nil {
198				return NewTextErrorResponse("Failed to extract text from HTML: " + err.Error()), nil
199			}
200			content = text
201		}
202
203	case "markdown":
204		if strings.Contains(contentType, "text/html") {
205			markdown, err := convertHTMLToMarkdown(content)
206			if err != nil {
207				return NewTextErrorResponse("Failed to convert HTML to Markdown: " + err.Error()), nil
208			}
209			content = markdown
210		}
211
212		content = "```\n" + content + "\n```"
213
214	case "html":
215		// return only the body of the HTML document
216		if strings.Contains(contentType, "text/html") {
217			doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
218			if err != nil {
219				return NewTextErrorResponse("Failed to parse HTML: " + err.Error()), nil
220			}
221			body, err := doc.Find("body").Html()
222			if err != nil {
223				return NewTextErrorResponse("Failed to extract body from HTML: " + err.Error()), nil
224			}
225			if body == "" {
226				return NewTextErrorResponse("No body content found in HTML"), nil
227			}
228			content = "<html>\n<body>\n" + body + "\n</body>\n</html>"
229		}
230	}
231	// calculate byte size of content
232	contentSize := int64(len(content))
233	if contentSize > MaxReadSize {
234		content = content[:MaxReadSize]
235		content += fmt.Sprintf("\n\n[Content truncated to %d bytes]", MaxReadSize)
236	}
237
238	return NewTextResponse(content), nil
239}
240
241func extractTextFromHTML(html string) (string, error) {
242	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
243	if err != nil {
244		return "", err
245	}
246
247	text := doc.Find("body").Text()
248	text = strings.Join(strings.Fields(text), " ")
249
250	return text, nil
251}
252
253func convertHTMLToMarkdown(html string) (string, error) {
254	converter := md.NewConverter("", true, nil)
255
256	markdown, err := converter.ConvertString(html)
257	if err != nil {
258		return "", err
259	}
260
261	return markdown, nil
262}