Detailed changes
@@ -19,6 +19,7 @@ import (
"github.com/chromedp/chromedp"
"github.com/google/uuid"
"shelley.exe.dev/llm"
+ "shelley.exe.dev/llm/imageutil"
)
// ScreenshotDir is the directory where screenshots are stored
@@ -45,25 +46,27 @@ type BrowseTools struct {
// Idle timeout management
idleTimeout time.Duration
idleTimer *time.Timer
+ // Max image dimension for resizing (0 or less disables resizing)
+ maxImageDimension int
}
-// NewBrowseTools creates a new set of browser automation tools
-func NewBrowseTools(ctx context.Context) *BrowseTools {
- return NewBrowseToolsWithIdleTimeout(ctx, DefaultIdleTimeout)
-}
-
-// NewBrowseToolsWithIdleTimeout creates browser tools with a custom idle timeout
-func NewBrowseToolsWithIdleTimeout(ctx context.Context, idleTimeout time.Duration) *BrowseTools {
- // Ensure the screenshot directory exists
+// NewBrowseTools creates a new set of browser automation tools.
+// idleTimeout is how long to wait before shutting down an idle browser (0 uses default).
+// maxImageDimension is the max pixel dimension for images (0 means unlimited).
+func NewBrowseTools(ctx context.Context, idleTimeout time.Duration, maxImageDimension int) *BrowseTools {
+ if idleTimeout <= 0 {
+ idleTimeout = DefaultIdleTimeout
+ }
if err := os.MkdirAll(ScreenshotDir, 0o755); err != nil {
log.Printf("Failed to create screenshot directory: %v", err)
}
return &BrowseTools{
- ctx: ctx,
- screenshots: make(map[string]time.Time),
- consoleLogs: make([]*runtime.EventConsoleAPICalled, 0),
- maxConsoleLogs: 100,
+ ctx: ctx,
+ screenshots: make(map[string]time.Time),
+ consoleLogs: make([]*runtime.EventConsoleAPICalled, 0),
+ maxConsoleLogs: 100,
+ maxImageDimension: maxImageDimension,
idleTimeout: idleTimeout,
}
}
@@ -454,10 +457,21 @@ func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) llm.
// Get the full path to the screenshot
screenshotPath := GetScreenshotPath(id)
- // Encode the image as base64
- base64Data := base64.StdEncoding.EncodeToString(buf)
+ // Resize image if needed to fit within model's image dimension limits
+ imageData := buf
+ format := "png"
+ resized := false
+ if b.maxImageDimension > 0 {
+ var err error
+ imageData, format, resized, err = imageutil.ResizeImage(buf, b.maxImageDimension)
+ if err != nil {
+ return llm.ErrorToolOut(fmt.Errorf("failed to resize screenshot: %w", err))
+ }
+ }
+
+ base64Data := base64.StdEncoding.EncodeToString(imageData)
+ mediaType := "image/" + format
- // Prepare display data for the UI
display := map[string]any{
"type": "screenshot",
"id": id,
@@ -466,15 +480,19 @@ func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) llm.
"selector": input.Selector,
}
- // Return the screenshot directly to the LLM and provide display metadata for the UI
+ description := fmt.Sprintf("Screenshot taken (saved as %s)", screenshotPath)
+ if resized {
+ description += " [resized]"
+ }
+
return llm.ToolOut{LLMContent: []llm.Content{
{
Type: llm.ContentTypeText,
- Text: fmt.Sprintf("Screenshot taken (saved as %s)", screenshotPath),
+ Text: description,
},
{
- Type: llm.ContentTypeText, // Will be mapped to image in content array
- MediaType: "image/png",
+ Type: llm.ContentTypeText,
+ MediaType: mediaType,
Data: base64Data,
},
}, Display: display}
@@ -570,24 +588,38 @@ func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) llm.T
return llm.ErrorfToolOut("failed to read image file: %w", err)
}
- // Detect the image type
- imageType := http.DetectContentType(imageData)
- if !strings.HasPrefix(imageType, "image/") {
- return llm.ErrorfToolOut("file is not an image: %s", imageType)
+ detectedType := http.DetectContentType(imageData)
+ if !strings.HasPrefix(detectedType, "image/") {
+ return llm.ErrorfToolOut("file is not an image: %s", detectedType)
+ }
+
+ // Resize image if needed to fit within model's image dimension limits
+ resized := false
+ format := strings.TrimPrefix(detectedType, "image/")
+ if b.maxImageDimension > 0 {
+ var err error
+ imageData, format, resized, err = imageutil.ResizeImage(imageData, b.maxImageDimension)
+ if err != nil {
+ return llm.ErrorToolOut(fmt.Errorf("failed to resize image: %w", err))
+ }
}
- // Encode the image as base64
base64Data := base64.StdEncoding.EncodeToString(imageData)
+ mediaType := "image/" + format
+
+ description := fmt.Sprintf("Image from %s (type: %s)", input.Path, mediaType)
+ if resized {
+ description += " [resized]"
+ }
- // Create a Content object that includes both text and the image
return llm.ToolOut{LLMContent: []llm.Content{
{
Type: llm.ContentTypeText,
- Text: fmt.Sprintf("Image from %s (type: %s)", input.Path, imageType),
+ Text: description,
},
{
- Type: llm.ContentTypeText, // Will be mapped to image in content array
- MediaType: imageType,
+ Type: llm.ContentTypeText,
+ MediaType: mediaType,
Data: base64Data,
},
}}
@@ -1,9 +1,14 @@
package browse
import (
+ "bytes"
"context"
+ "encoding/base64"
"encoding/json"
"fmt"
+ "image"
+ "image/color"
+ "image/png"
"os"
"path/filepath"
"slices"
@@ -17,7 +22,7 @@ import (
func TestToolCreation(t *testing.T) {
// Create browser tools instance
- tools := NewBrowseTools(context.Background())
+ tools := NewBrowseTools(context.Background(), 0, 0)
t.Cleanup(func() {
tools.Close()
})
@@ -66,7 +71,7 @@ func TestToolCreation(t *testing.T) {
func TestGetTools(t *testing.T) {
// Create browser tools instance
- tools := NewBrowseTools(context.Background())
+ tools := NewBrowseTools(context.Background(), 0, 0)
t.Cleanup(func() {
tools.Close()
})
@@ -107,7 +112,7 @@ func TestBrowserInitialization(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
- tools := NewBrowseTools(ctx)
+ tools := NewBrowseTools(ctx, 0, 0)
t.Cleanup(func() {
tools.Close()
})
@@ -145,7 +150,7 @@ func TestNavigateTool(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
- tools := NewBrowseTools(ctx)
+ tools := NewBrowseTools(ctx, 0, 0)
t.Cleanup(func() {
tools.Close()
})
@@ -202,7 +207,7 @@ func TestNavigateTool(t *testing.T) {
func TestScreenshotTool(t *testing.T) {
// Create browser tools instance
ctx := context.Background()
- tools := NewBrowseTools(ctx)
+ tools := NewBrowseTools(ctx, 0, 0)
t.Cleanup(func() {
tools.Close()
})
@@ -239,7 +244,7 @@ func TestScreenshotTool(t *testing.T) {
func TestReadImageTool(t *testing.T) {
// Create a test BrowseTools instance
ctx := context.Background()
- browseTools := NewBrowseTools(ctx)
+ browseTools := NewBrowseTools(ctx, 0, 0)
t.Cleanup(func() {
browseTools.Close()
})
@@ -304,7 +309,7 @@ func TestDefaultViewportSize(t *testing.T) {
t.Skip("Skipping browser test in CI/headless environment")
}
- tools := NewBrowseTools(ctx)
+ tools := NewBrowseTools(ctx, 0, 0)
t.Cleanup(func() {
tools.Close()
})
@@ -364,7 +369,7 @@ func TestBrowserIdleShutdownAndRestart(t *testing.T) {
// Use a short idle timeout for testing
idleTimeout := 100 * time.Millisecond
- tools := NewBrowseToolsWithIdleTimeout(ctx, idleTimeout)
+ tools := NewBrowseTools(ctx, idleTimeout, 0)
t.Cleanup(func() {
tools.Close()
})
@@ -406,3 +411,74 @@ func TestBrowserIdleShutdownAndRestart(t *testing.T) {
t.Fatalf("Navigate failed after restart: %v", toolOut.Error)
}
}
+
+// TestReadImageToolResizesLargeImage verifies that the read-image tool scales an
+// image exceeding the configured max dimension down to fit, and that the text
+// description it returns mentions the resize.
+func TestReadImageToolResizesLargeImage(t *testing.T) {
+	// Create a test BrowseTools instance with max dimension of 2000
+	ctx := context.Background()
+	browseTools := NewBrowseTools(ctx, 0, 2000)
+	t.Cleanup(func() {
+		browseTools.Close()
+	})
+
+	// Create a large, solid-colour test image (3000x2500 pixels)
+	img := image.NewRGBA(image.Rect(0, 0, 3000, 2500))
+	fill := color.RGBA{R: 100, G: 150, B: 200, A: 255}
+	for y := 0; y < 2500; y++ {
+		for x := 0; x < 3000; x++ {
+			img.SetRGBA(x, y, fill)
+		}
+	}
+
+	// Encode in memory and write the file in one call so that write/close
+	// errors are reported instead of being silently dropped.
+	var encoded bytes.Buffer
+	if err := png.Encode(&encoded, img); err != nil {
+		t.Fatalf("Failed to encode test image: %v", err)
+	}
+	testImagePath := filepath.Join(t.TempDir(), "large_image.png")
+	if err := os.WriteFile(testImagePath, encoded.Bytes(), 0o644); err != nil {
+		t.Fatalf("Failed to create test image file: %v", err)
+	}
+
+	// Create the tool
+	readImageTool := browseTools.NewReadImageTool()
+
+	// Marshal the input instead of formatting it by hand: a path containing
+	// backslashes or quotes (e.g. on Windows) would otherwise be invalid JSON.
+	input, err := json.Marshal(map[string]string{"path": testImagePath})
+	if err != nil {
+		t.Fatalf("Failed to marshal tool input: %v", err)
+	}
+
+	// Run the tool
+	toolOut := readImageTool.Run(ctx, json.RawMessage(input))
+	if toolOut.Error != nil {
+		t.Fatalf("Read image tool failed: %v", toolOut.Error)
+	}
+	result := toolOut.LLMContent
+
+	// Check that we got at least two content objects
+	if len(result) < 2 {
+		t.Fatalf("Expected at least 2 content objects, got %d", len(result))
+	}
+
+	// Check that the description mentions resizing
+	if !strings.Contains(result[0].Text, "resized") {
+		t.Errorf("Expected description to mention resizing, got: %s", result[0].Text)
+	}
+
+	// Decode the returned image and verify dimensions are within limits
+	imageData, err := base64.StdEncoding.DecodeString(result[1].Data)
+	if err != nil {
+		t.Fatalf("Failed to decode base64 image: %v", err)
+	}
+
+	config, _, err := image.DecodeConfig(bytes.NewReader(imageData))
+	if err != nil {
+		t.Fatalf("Failed to decode image config: %v", err)
+	}
+
+	if config.Width > 2000 || config.Height > 2000 {
+		t.Errorf("Image dimensions still exceed 2000 pixels: %dx%d", config.Width, config.Height)
+	}
+
+	t.Logf("Large image resized from 3000x2500 to %dx%d", config.Width, config.Height)
+}
@@ -9,8 +9,9 @@ import (
// RegisterBrowserTools returns all browser tools ready to be added to an agent.
// It also returns a cleanup function that should be called when done to properly close the browser.
// The browser will be initialized lazily when a browser tool is first used.
-func RegisterBrowserTools(ctx context.Context, supportsScreenshots bool) ([]*llm.Tool, func()) {
- browserTools := NewBrowseTools(ctx)
+// maxImageDimension is the max pixel dimension for images (0 means unlimited; no resizing is done).
+func RegisterBrowserTools(ctx context.Context, supportsScreenshots bool, maxImageDimension int) ([]*llm.Tool, func()) {
+ browserTools := NewBrowseTools(ctx, 0, maxImageDimension)
return browserTools.GetTools(supportsScreenshots), func() {
browserTools.Close()
@@ -122,7 +122,14 @@ func NewToolSet(ctx context.Context, cfg ToolSetConfig) *ToolSet {
var cleanup func()
if cfg.EnableBrowser {
- browserTools, browserCleanup := browse.RegisterBrowserTools(ctx, true)
+ // Get max image dimension from the LLM service
+ maxImageDimension := 0
+ if cfg.LLMProvider != nil && cfg.ModelID != "" {
+ if svc, err := cfg.LLMProvider.GetService(cfg.ModelID); err == nil {
+ maxImageDimension = svc.MaxImageDimension()
+ }
+ }
+ browserTools, browserCleanup := browse.RegisterBrowserTools(ctx, true, maxImageDimension)
if len(browserTools) > 0 {
tools = append(tools, browserTools...)
}
@@ -63,6 +63,7 @@ require (
go.uber.org/zap v1.27.0 // indirect
golang.org/x/crypto v0.46.0 // indirect
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b // indirect
+ golang.org/x/image v0.34.0 // indirect
golang.org/x/mod v0.31.0 // indirect
golang.org/x/net v0.48.0 // indirect
golang.org/x/text v0.32.0 // indirect
@@ -182,6 +182,8 @@ golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU=
golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0=
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b h1:M2rDM6z3Fhozi9O7NWsxAkg/yqS/lQJ6PmkyIV3YP+o=
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b/go.mod h1:3//PLf8L/X+8b4vuAfHzxeRUl04Adcb341+IGKfnqS8=
+golang.org/x/image v0.34.0 h1:33gCkyw9hmwbZJeZkct8XyR11yH889EQt/QH4VmXMn8=
+golang.org/x/image v0.34.0/go.mod h1:2RNFBZRB+vnwwFil8GkMdRvrJOFd1AzdZI6vOY+eJVU=
golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/mod v0.31.0 h1:HaW9xtz0+kOcWKwli0ZXy79Ix+UW/vOfmWI5QVd2tgI=
golang.org/x/mod v0.31.0/go.mod h1:43JraMp9cGx1Rx3AqioxrbrhNsLl2l/iNAvuBkrezpg=
@@ -74,6 +74,12 @@ func (s *Service) TokenContextWindow() int {
}
}
+// MaxImageDimension returns the maximum allowed image dimension, in pixels, for multi-image requests.
+// Anthropic enforces a 2000 pixel limit when multiple images are in a conversation.
+func (s *Service) MaxImageDimension() int {
+ return 2000
+}
+
// HTTPRecorder is a callback for recording HTTP request/response data for debugging
type HTTPRecorder func(url string, requestBody, responseBody []byte, statusCode int, err error, duration time.Duration)
@@ -461,6 +461,12 @@ func (s *Service) TokenContextWindow() int {
}
}
+// MaxImageDimension returns the maximum allowed image dimension; it currently reports 0 (no limit).
+// TODO: determine actual Gemini image dimension limits
+func (s *Service) MaxImageDimension() int {
+ return 0 // No known limit
+}
+
// Do sends a request to Gemini.
func (s *Service) Do(ctx context.Context, ir *llm.Request) (*llm.Response, error) {
// Log the incoming request for debugging
@@ -0,0 +1,62 @@
+// Package imageutil provides image manipulation utilities.
+package imageutil
+
+import (
+ "bytes"
+ "fmt"
+ "image"
+ "image/jpeg"
+ "image/png"
+ "strings"
+
+ "golang.org/x/image/draw"
+)
+
+// ResizeImage scales an image down so that neither side exceeds maxDimension,
+// preserving the aspect ratio.
+//
+// It returns the image bytes, the image format, and whether a resize happened.
+// When no resize is needed (the image already fits, or maxDimension is not
+// positive, which is treated as "no limit") the original data is returned
+// unchanged together with the format name reported by image.Decode. A resized
+// image is re-encoded as "jpeg" if the input was JPEG and as "png" otherwise.
+//
+// An error is returned if data cannot be decoded or the result cannot be encoded.
+func ResizeImage(data []byte, maxDimension int) (resized []byte, format string, didResize bool, err error) {
+	img, detectedFormat, err := image.Decode(bytes.NewReader(data))
+	if err != nil {
+		return nil, "", false, fmt.Errorf("failed to decode image: %w", err)
+	}
+
+	bounds := img.Bounds()
+	width, height := bounds.Dx(), bounds.Dy()
+
+	// A non-positive limit means "unlimited"; scaling to it would only produce
+	// an empty image, which png.Encode rejects.
+	if maxDimension <= 0 || (width <= maxDimension && height <= maxDimension) {
+		return data, detectedFormat, false, nil
+	}
+
+	// Pin the longer side to maxDimension and scale the other proportionally.
+	// Integer division can round the shorter side down to 0 for extreme aspect
+	// ratios (e.g. 3000x1), so clamp it to one pixel to keep the image encodable.
+	newWidth, newHeight := maxDimension, maxDimension
+	if width > height {
+		newHeight = max(1, height*maxDimension/width)
+	} else {
+		newWidth = max(1, width*maxDimension/height)
+	}
+
+	// Create resized image
+	resizedImg := image.NewRGBA(image.Rect(0, 0, newWidth, newHeight))
+	draw.BiLinear.Scale(resizedImg, resizedImg.Bounds(), img, bounds, draw.Over, nil)
+
+	// Re-encode in the source format where supported; everything else becomes PNG.
+	var buf bytes.Buffer
+	switch strings.ToLower(detectedFormat) {
+	case "jpeg", "jpg":
+		err = jpeg.Encode(&buf, resizedImg, &jpeg.Options{Quality: 85})
+		format = "jpeg"
+	default:
+		err = png.Encode(&buf, resizedImg)
+		format = "png"
+	}
+	if err != nil {
+		return nil, "", false, fmt.Errorf("failed to encode resized image: %w", err)
+	}
+
+	return buf.Bytes(), format, true, nil
+}
@@ -0,0 +1,70 @@
+package imageutil
+
+import (
+ "bytes"
+ "image"
+ "image/color"
+ "image/png"
+ "testing"
+)
+
+// createTestPNG returns the PNG encoding of a solid-colour image of the given
+// size. It fails the calling test if encoding fails.
+func createTestPNG(t *testing.T, width, height int) []byte {
+	t.Helper() // attribute failures to the calling test, not this helper
+	img := image.NewRGBA(image.Rect(0, 0, width, height))
+	fill := color.RGBA{R: 100, G: 150, B: 200, A: 255}
+	for y := 0; y < height; y++ {
+		for x := 0; x < width; x++ {
+			img.SetRGBA(x, y, fill)
+		}
+	}
+	var buf bytes.Buffer
+	if err := png.Encode(&buf, img); err != nil {
+		t.Fatalf("Failed to create test image: %v", err)
+	}
+	return buf.Bytes()
+}
+
+// TestResizeImage checks that ResizeImage returns images within the limit
+// unchanged and shrinks larger ones so that their longer side equals the limit.
+func TestResizeImage(t *testing.T) {
+	tests := []struct {
+		name       string
+		width      int
+		height     int
+		maxDim     int
+		wantResize bool
+		wantMaxDim int // expected longer side of the returned image
+	}{
+		{"small image", 800, 600, 2000, false, 800},
+		{"at limit", 2000, 2000, 2000, false, 2000},
+		{"width exceeds", 3000, 1000, 2000, true, 2000},
+		{"height exceeds", 1000, 3000, 2000, true, 2000},
+		{"both exceed", 3000, 3000, 2000, true, 2000},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			data := createTestPNG(t, tt.width, tt.height)
+			resized, format, didResize, err := ResizeImage(data, tt.maxDim)
+			if err != nil {
+				t.Fatalf("ResizeImage() error = %v", err)
+			}
+			if didResize != tt.wantResize {
+				t.Errorf("ResizeImage() didResize = %v, want %v", didResize, tt.wantResize)
+			}
+			if format != "png" {
+				t.Errorf("ResizeImage() format = %v, want png", format)
+			}
+			if !didResize && !bytes.Equal(resized, data) {
+				t.Error("Expected original data when no resize needed")
+			}
+
+			// Decode the result in every case so that wantMaxDim is actually
+			// asserted rather than being dead data in the table.
+			config, _, err := image.DecodeConfig(bytes.NewReader(resized))
+			if err != nil {
+				t.Fatalf("Failed to decode resized image: %v", err)
+			}
+			if got := max(config.Width, config.Height); got != tt.wantMaxDim {
+				t.Errorf("ResizeImage() longer side = %d (image %dx%d), want %d", got, config.Width, config.Height, tt.wantMaxDim)
+			}
+		})
+	}
+}
@@ -19,6 +19,10 @@ type Service interface {
Do(context.Context, *Request) (*Response, error)
// TokenContextWindow returns the maximum token context window size for this service
TokenContextWindow() int
+ // MaxImageDimension returns the maximum allowed dimension (width or height) for images.
+ // For multi-image requests, some providers enforce stricter limits.
+ // Returns 0 if there is no limit.
+ MaxImageDimension() int
}
type SimplifiedPatcher interface {
@@ -767,6 +767,12 @@ func (s *Service) TokenContextWindow() int {
}
}
+// MaxImageDimension returns the maximum allowed image dimension; it currently reports 0 (no limit).
+// TODO: determine actual OpenAI image dimension limits
+func (s *Service) MaxImageDimension() int {
+ return 0 // No known limit
+}
+
// Do sends a request to OpenAI using the go-openai package.
func (s *Service) Do(ctx context.Context, ir *llm.Request) (*llm.Response, error) {
// Configure the OpenAI client
@@ -351,6 +351,12 @@ func (s *ResponsesService) TokenContextWindow() int {
}
}
+// MaxImageDimension returns the maximum allowed image dimension; it currently reports 0 (no limit).
+// TODO: determine actual OpenAI image dimension limits
+func (s *ResponsesService) MaxImageDimension() int {
+ return 0 // No known limit
+}
+
// Do sends a request to OpenAI using the Responses API.
func (s *ResponsesService) Do(ctx context.Context, ir *llm.Request) (*llm.Response, error) {
httpc := cmp.Or(s.HTTPC, http.DefaultClient)
@@ -52,6 +52,11 @@ func (s *PredictableService) TokenContextWindow() int {
return s.tokenContextWindow
}
+// MaxImageDimension returns the maximum allowed image dimension, which is a fixed 2000 pixels for this service.
+func (s *PredictableService) MaxImageDimension() int {
+ return 2000
+}
+
// Do processes a request and returns a predictable response based on the input text
func (s *PredictableService) Do(ctx context.Context, req *llm.Request) (*llm.Response, error) {
// Store request for testing inspection
@@ -389,6 +389,11 @@ func (l *loggingService) TokenContextWindow() int {
return l.service.TokenContextWindow()
}
+// MaxImageDimension returns the underlying service's limit unchanged.
+func (l *loggingService) MaxImageDimension() int {
+ return l.service.MaxImageDimension()
+}
+
// UseSimplifiedPatch delegates to the underlying service if it supports it
func (l *loggingService) UseSimplifiedPatch() bool {
if sp, ok := l.service.(llm.SimplifiedPatcher); ok {
@@ -87,6 +87,10 @@ func (m *MockLLMService) TokenContextWindow() int {
return 8192 // Mock token limit
}
+// MaxImageDimension returns 0, meaning the mock imposes no image dimension limit.
+func (m *MockLLMService) MaxImageDimension() int {
+ return 0 // No limit for mock
+}
+
// MockLLMProvider provides a mock LLM provider for testing
type MockLLMProvider struct {
Service *MockLLMService