fetch.go

  1package tools
  2
  3import (
  4	"context"
  5	"encoding/json"
  6	"fmt"
  7	"io"
  8	"net/http"
  9	"strings"
 10	"sync"
 11	"time"
 12
 13	md "github.com/JohannesKaufmann/html-to-markdown"
 14	"github.com/PuerkitoBio/goquery"
 15	"github.com/charmbracelet/crush/internal/config"
 16	"github.com/charmbracelet/crush/internal/permission"
 17)
 18
// FetchParams holds the arguments of a fetch tool call, decoded from the
// call's JSON input.
//
// URL is the address to fetch; Run rejects values that do not start with
// http:// or https://. Format selects the output representation and must be
// "text", "markdown", or "html" (Run lowercases it before checking). Timeout
// is an optional request timeout in seconds; values <= 0 select the default
// client and larger values are capped at 120 by getClientForTimeout.
type FetchParams struct {
	URL     string `json:"url"`
	Format  string `json:"format"`
	Timeout int    `json:"timeout,omitempty"`
}
 24
// FetchPermissionsParams is the parameter payload attached to the permission
// request that Run issues before fetching. Its fields mirror FetchParams
// one-for-one, which is what allows Run to produce it with a direct type
// conversion from FetchParams.
type FetchPermissionsParams struct {
	URL     string `json:"url"`
	Format  string `json:"format"`
	Timeout int    `json:"timeout,omitempty"`
}
 30
// fetchTool implements the fetch tool.
//
// client is the default HTTP client, used when a call does not request a
// positive timeout. clientPool caches additional clients keyed by timeout in
// seconds, and clientPoolMu guards every access to clientPool. permissions is
// the service Run asks for approval before issuing a request.
type fetchTool struct {
	client       *http.Client
	clientPool   map[int]*http.Client
	clientPoolMu sync.RWMutex
	permissions  permission.Service
}
 37
// FetchToolName is the name the fetch tool reports from Info and uses as the
// ToolName of its permission requests. fetchToolDescription is the usage text
// returned as the tool's Description from Info.
const (
	FetchToolName        = "fetch"
	fetchToolDescription = `Fetches content from a URL and returns it in the specified format.

WHEN TO USE THIS TOOL:
- Use when you need to download content from a URL
- Helpful for retrieving documentation, API responses, or web content
- Useful for getting external information to assist with tasks

HOW TO USE:
- Provide the URL to fetch content from
- Specify the desired output format (text, markdown, or html)
- Optionally set a timeout for the request

FEATURES:
- Supports three output formats: text, markdown, and html
- Automatically handles HTTP redirects
- Sets reasonable timeouts to prevent hanging
- Validates input parameters before making requests

LIMITATIONS:
- Maximum response size is 5MB
- Only supports HTTP and HTTPS protocols
- Cannot handle authentication or cookies
- Some websites may block automated requests

TIPS:
- Use text format for plain text content or simple API responses
- Use markdown format for content that should be rendered with formatting
- Use html format when you need the raw HTML structure
- Set appropriate timeouts for potentially slow websites`
)
 70
 71func NewFetchTool(permissions permission.Service) BaseTool {
 72	return &fetchTool{
 73		client: &http.Client{
 74			Timeout: 30 * time.Second,
 75			Transport: &http.Transport{
 76				MaxIdleConns:        100,
 77				MaxIdleConnsPerHost: 10,
 78				IdleConnTimeout:     90 * time.Second,
 79			},
 80		},
 81		clientPool:  make(map[int]*http.Client),
 82		permissions: permissions,
 83	}
 84}
 85
 86// getClientForTimeout returns a cached client for the given timeout or the default client
 87func (t *fetchTool) getClientForTimeout(timeout int) *http.Client {
 88	if timeout <= 0 {
 89		return t.client
 90	}
 91
 92	maxTimeout := 120 // 2 minutes
 93	if timeout > maxTimeout {
 94		timeout = maxTimeout
 95	}
 96
 97	// Check if we have a cached client for this timeout
 98	t.clientPoolMu.RLock()
 99	if client, exists := t.clientPool[timeout]; exists {
100		t.clientPoolMu.RUnlock()
101		return client
102	}
103	t.clientPoolMu.RUnlock()
104
105	// Create and cache a new client
106	t.clientPoolMu.Lock()
107	defer t.clientPoolMu.Unlock()
108
109	// Double-check in case another goroutine created it
110	if client, exists := t.clientPool[timeout]; exists {
111		return client
112	}
113
114	client := &http.Client{
115		Timeout: time.Duration(timeout) * time.Second,
116		Transport: &http.Transport{
117			MaxIdleConns:        100,
118			MaxIdleConnsPerHost: 10,
119			IdleConnTimeout:     90 * time.Second,
120		},
121	}
122	t.clientPool[timeout] = client
123	return client
124}
125
126func (t *fetchTool) Info() ToolInfo {
127	return ToolInfo{
128		Name:        FetchToolName,
129		Description: fetchToolDescription,
130		Parameters: map[string]any{
131			"url": map[string]any{
132				"type":        "string",
133				"description": "The URL to fetch content from",
134			},
135			"format": map[string]any{
136				"type":        "string",
137				"description": "The format to return the content in (text, markdown, or html)",
138				"enum":        []string{"text", "markdown", "html"},
139			},
140			"timeout": map[string]any{
141				"type":        "number",
142				"description": "Optional timeout in seconds (max 120)",
143			},
144		},
145		Required: []string{"url", "format"},
146	}
147}
148
149func (t *fetchTool) Run(ctx context.Context, call ToolCall) (ToolResponse, error) {
150	var params FetchParams
151	if err := json.Unmarshal([]byte(call.Input), &params); err != nil {
152		return NewTextErrorResponse("Failed to parse fetch parameters: " + err.Error()), nil
153	}
154
155	if params.URL == "" {
156		return NewTextErrorResponse("URL parameter is required"), nil
157	}
158
159	format := strings.ToLower(params.Format)
160	if format != "text" && format != "markdown" && format != "html" {
161		return NewTextErrorResponse("Format must be one of: text, markdown, html"), nil
162	}
163
164	if !strings.HasPrefix(params.URL, "http://") && !strings.HasPrefix(params.URL, "https://") {
165		return NewTextErrorResponse("URL must start with http:// or https://"), nil
166	}
167
168	sessionID, messageID := GetContextValues(ctx)
169	if sessionID == "" || messageID == "" {
170		return ToolResponse{}, fmt.Errorf("session ID and message ID are required for creating a new file")
171	}
172
173	p := t.permissions.Request(
174		permission.CreatePermissionRequest{
175			SessionID:   sessionID,
176			Path:        config.WorkingDirectory(),
177			ToolName:    FetchToolName,
178			Action:      "fetch",
179			Description: fmt.Sprintf("Fetch content from URL: %s", params.URL),
180			Params:      FetchPermissionsParams(params),
181		},
182	)
183
184	if !p {
185		return ToolResponse{}, permission.ErrorPermissionDenied
186	}
187
188	client := t.getClientForTimeout(params.Timeout)
189
190	req, err := http.NewRequestWithContext(ctx, "GET", params.URL, nil)
191	if err != nil {
192		return ToolResponse{}, fmt.Errorf("failed to create request: %w", err)
193	}
194
195	req.Header.Set("User-Agent", "crush/1.0")
196
197	resp, err := client.Do(req)
198	if err != nil {
199		return ToolResponse{}, fmt.Errorf("failed to fetch URL: %w", err)
200	}
201	defer resp.Body.Close()
202
203	if resp.StatusCode != http.StatusOK {
204		return NewTextErrorResponse(fmt.Sprintf("Request failed with status code: %d", resp.StatusCode)), nil
205	}
206
207	maxSize := int64(5 * 1024 * 1024) // 5MB
208	body, err := io.ReadAll(io.LimitReader(resp.Body, maxSize))
209	if err != nil {
210		return NewTextErrorResponse("Failed to read response body: " + err.Error()), nil
211	}
212
213	content := string(body)
214	contentType := resp.Header.Get("Content-Type")
215
216	switch format {
217	case "text":
218		if strings.Contains(contentType, "text/html") {
219			text, err := extractTextFromHTML(content)
220			if err != nil {
221				return NewTextErrorResponse("Failed to extract text from HTML: " + err.Error()), nil
222			}
223			return NewTextResponse(text), nil
224		}
225		return NewTextResponse(content), nil
226
227	case "markdown":
228		if strings.Contains(contentType, "text/html") {
229			markdown, err := convertHTMLToMarkdown(content)
230			if err != nil {
231				return NewTextErrorResponse("Failed to convert HTML to Markdown: " + err.Error()), nil
232			}
233			return NewTextResponse(markdown), nil
234		}
235
236		return NewTextResponse("```\n" + content + "\n```"), nil
237
238	case "html":
239		return NewTextResponse(content), nil
240
241	default:
242		return NewTextResponse(content), nil
243	}
244}
245
246func extractTextFromHTML(html string) (string, error) {
247	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
248	if err != nil {
249		return "", err
250	}
251
252	text := doc.Text()
253	text = strings.Join(strings.Fields(text), " ")
254
255	return text, nil
256}
257
258func convertHTMLToMarkdown(html string) (string, error) {
259	converter := md.NewConverter("", true, nil)
260
261	markdown, err := converter.ConvertString(html)
262	if err != nil {
263		return "", err
264	}
265
266	return markdown, nil
267}