diff --git a/internal/llm/tools/fetch.go b/internal/llm/tools/fetch.go index 28e15d19cee8219ccc4575ed036f29e8286db229..1e44151b1124c643d2ddd428144e66c5d365e609 100644 --- a/internal/llm/tools/fetch.go +++ b/internal/llm/tools/fetch.go @@ -8,6 +8,7 @@ import ( "net/http" "strings" "time" + "unicode/utf8" md "github.com/JohannesKaufmann/html-to-markdown" "github.com/PuerkitoBio/goquery" @@ -182,6 +183,11 @@ func (t *fetchTool) Run(ctx context.Context, call ToolCall) (ToolResponse, error } content := string(body) + + isValidUt8 := utf8.ValidString(content) + if !isValidUt8 { + return NewTextErrorResponse("Response content is not valid UTF-8"), nil + } contentType := resp.Header.Get("Content-Type") switch format { @@ -191,9 +197,8 @@ func (t *fetchTool) Run(ctx context.Context, call ToolCall) (ToolResponse, error if err != nil { return NewTextErrorResponse("Failed to extract text from HTML: " + err.Error()), nil } - return NewTextResponse(text), nil + content = text } - return NewTextResponse(content), nil case "markdown": if strings.Contains(contentType, "text/html") { @@ -201,17 +206,36 @@ func (t *fetchTool) Run(ctx context.Context, call ToolCall) (ToolResponse, error if err != nil { return NewTextErrorResponse("Failed to convert HTML to Markdown: " + err.Error()), nil } - return NewTextResponse(markdown), nil + content = markdown } - return NewTextResponse("```\n" + content + "\n```"), nil + content = "```\n" + content + "\n```" case "html": - return NewTextResponse(content), nil - - default: - return NewTextResponse(content), nil + // return only the body of the HTML document + if strings.Contains(contentType, "text/html") { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(content)) + if err != nil { + return NewTextErrorResponse("Failed to parse HTML: " + err.Error()), nil + } + body, err := doc.Find("body").Html() + if err != nil { + return NewTextErrorResponse("Failed to extract body from HTML: " + err.Error()), nil + } + if body == "" { + return NewTextErrorResponse("No body content found in HTML"), nil + } + content = "\n\n" + body + "\n\n" + } + } + // calculate byte size of content + contentSize := int64(len(content)) + if contentSize > MaxReadSize { + content = content[:MaxReadSize] + content += fmt.Sprintf("\n\n[Content truncated to %d bytes]", MaxReadSize) } + + return NewTextResponse(content), nil } func extractTextFromHTML(html string) (string, error) { @@ -220,7 +244,7 @@ func extractTextFromHTML(html string) (string, error) { return "", err } - text := doc.Text() + text := doc.Find("body").Text() text = strings.Join(strings.Fields(text), " ") return text, nil diff --git a/internal/llm/tools/view.go b/internal/llm/tools/view.go index 27bbc237209e64637cfefb0f4ff1097f96641c2e..d8ca7e9e8c7880a760e8eb2096c83914a9dc13b5 100644 --- a/internal/llm/tools/view.go +++ b/internal/llm/tools/view.go @@ -9,6 +9,7 @@ import ( "os" "path/filepath" "strings" + "unicode/utf8" "github.com/charmbracelet/crush/internal/lsp" ) @@ -173,11 +174,15 @@ func (v *viewTool) Run(ctx context.Context, call ToolCall) (ToolResponse, error) isImage, imageType := isImageFile(filePath) // TODO: handle images if isImage { - return NewTextErrorResponse(fmt.Sprintf("This is an image file of type: %s\nUse a different tool to process images", imageType)), nil + return NewTextErrorResponse(fmt.Sprintf("This is an image file of type: %s\n", imageType)), nil } // Read the file content content, lineCount, err := readTextFile(filePath, params.Offset, params.Limit) + isValidUt8 := utf8.ValidString(content) + if !isValidUt8 { + return NewTextErrorResponse("File content is not valid UTF-8"), nil + } if err != nil { return ToolResponse{}, fmt.Errorf("error reading file: %w", err) }