From 573b34ba3df63bd0c3c997296a9543a6b2872a48 Mon Sep 17 00:00:00 2001 From: Philip Zeyliger Date: Sun, 4 Jan 2026 22:46:27 -0800 Subject: [PATCH] shelley: resize large images in read_image and screenshot tools MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When conversations have many images, Anthropic enforces a 2000px max dimension limit per image. Previously, if an uploaded image exceeded this limit, the API would return a 400 error and the conversation would be stuck. This fix proactively resizes images in the browser tools: - read_image: resizes images that exceed max dimension before returning - browser_take_screenshot: resizes screenshots that exceed max dimension The resize preserves aspect ratio and uses bilinear interpolation. A note '[resized]' is appended to the description when resizing occurs. Changes: - Add MaxImageDimension() method to llm.Service interface - Implement MaxImageDimension() for all LLM service implementations - New imageutil package provides image resizing utilities - BrowseTools now accepts maxImageDimension parameter - ToolSet passes the LLM's max dimension to browser tools Prompt: In a new worktree, fix what happened in conversation add-dark-mode-to-shelley where a too big image made it impossible to continue the conversation after an anthropic error. Ideally the agent would get the error and handle it… I don't think boldsoftware/sketch had this problem. Write a test against the real anthropic api to test this. 
--- claudetool/browse/browse.go | 88 ++++++++++++++++++++---------- claudetool/browse/browse_test.go | 92 +++++++++++++++++++++++++++++--- claudetool/browse/register.go | 5 +- claudetool/toolset.go | 9 +++- go.mod | 1 + go.sum | 2 + llm/ant/ant.go | 6 +++ llm/gem/gem.go | 6 +++ llm/imageutil/resize.go | 62 +++++++++++++++++++++ llm/imageutil/resize_test.go | 70 ++++++++++++++++++++++++ llm/llm.go | 4 ++ llm/oai/oai.go | 6 +++ llm/oai/oai_responses.go | 6 +++ loop/predictable.go | 5 ++ models/models.go | 5 ++ slug/slug_test.go | 4 ++ 16 files changed, 332 insertions(+), 39 deletions(-) create mode 100644 llm/imageutil/resize.go create mode 100644 llm/imageutil/resize_test.go diff --git a/claudetool/browse/browse.go b/claudetool/browse/browse.go index 9a517d0f2625de2b8e72b5ce223a249a49df4db2..79674f29be62f47eca2fcc070489c7f80bc0e3ca 100644 --- a/claudetool/browse/browse.go +++ b/claudetool/browse/browse.go @@ -19,6 +19,7 @@ import ( "github.com/chromedp/chromedp" "github.com/google/uuid" "shelley.exe.dev/llm" + "shelley.exe.dev/llm/imageutil" ) // ScreenshotDir is the directory where screenshots are stored @@ -45,25 +46,27 @@ type BrowseTools struct { // Idle timeout management idleTimeout time.Duration idleTimer *time.Timer + // Max image dimension for resizing (0 disables resizing) + maxImageDimension int } -// NewBrowseTools creates a new set of browser automation tools -func NewBrowseTools(ctx context.Context) *BrowseTools { - return NewBrowseToolsWithIdleTimeout(ctx, DefaultIdleTimeout) -} - -// NewBrowseToolsWithIdleTimeout creates browser tools with a custom idle timeout -func NewBrowseToolsWithIdleTimeout(ctx context.Context, idleTimeout time.Duration) *BrowseTools { - // Ensure the screenshot directory exists +// NewBrowseTools creates a new set of browser automation tools. +// idleTimeout is how long to wait before shutting down an idle browser (0 uses default). +// maxImageDimension is the max pixel dimension for images (0 means unlimited). 
+func NewBrowseTools(ctx context.Context, idleTimeout time.Duration, maxImageDimension int) *BrowseTools { + if idleTimeout <= 0 { + idleTimeout = DefaultIdleTimeout + } if err := os.MkdirAll(ScreenshotDir, 0o755); err != nil { log.Printf("Failed to create screenshot directory: %v", err) } return &BrowseTools{ - ctx: ctx, - screenshots: make(map[string]time.Time), - consoleLogs: make([]*runtime.EventConsoleAPICalled, 0), - maxConsoleLogs: 100, + ctx: ctx, + screenshots: make(map[string]time.Time), + consoleLogs: make([]*runtime.EventConsoleAPICalled, 0), + maxConsoleLogs: 100, + maxImageDimension: maxImageDimension, idleTimeout: idleTimeout, } } @@ -454,10 +457,21 @@ func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) llm. // Get the full path to the screenshot screenshotPath := GetScreenshotPath(id) - // Encode the image as base64 - base64Data := base64.StdEncoding.EncodeToString(buf) + // Resize image if needed to fit within model's image dimension limits + imageData := buf + format := "png" + resized := false + if b.maxImageDimension > 0 { + var err error + imageData, format, resized, err = imageutil.ResizeImage(buf, b.maxImageDimension) + if err != nil { + return llm.ErrorToolOut(fmt.Errorf("failed to resize screenshot: %w", err)) + } + } + + base64Data := base64.StdEncoding.EncodeToString(imageData) + mediaType := "image/" + format - // Prepare display data for the UI display := map[string]any{ "type": "screenshot", "id": id, @@ -466,15 +480,19 @@ func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) llm. 
"selector": input.Selector, } - // Return the screenshot directly to the LLM and provide display metadata for the UI + description := fmt.Sprintf("Screenshot taken (saved as %s)", screenshotPath) + if resized { + description += " [resized]" + } + return llm.ToolOut{LLMContent: []llm.Content{ { Type: llm.ContentTypeText, - Text: fmt.Sprintf("Screenshot taken (saved as %s)", screenshotPath), + Text: description, }, { - Type: llm.ContentTypeText, // Will be mapped to image in content array - MediaType: "image/png", + Type: llm.ContentTypeText, + MediaType: mediaType, Data: base64Data, }, }, Display: display} @@ -570,24 +588,38 @@ func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) llm.T return llm.ErrorfToolOut("failed to read image file: %w", err) } - // Detect the image type - imageType := http.DetectContentType(imageData) - if !strings.HasPrefix(imageType, "image/") { - return llm.ErrorfToolOut("file is not an image: %s", imageType) + detectedType := http.DetectContentType(imageData) + if !strings.HasPrefix(detectedType, "image/") { + return llm.ErrorfToolOut("file is not an image: %s", detectedType) + } + + // Resize image if needed to fit within model's image dimension limits + resized := false + format := strings.TrimPrefix(detectedType, "image/") + if b.maxImageDimension > 0 { + var err error + imageData, format, resized, err = imageutil.ResizeImage(imageData, b.maxImageDimension) + if err != nil { + return llm.ErrorToolOut(fmt.Errorf("failed to resize image: %w", err)) + } } - // Encode the image as base64 base64Data := base64.StdEncoding.EncodeToString(imageData) + mediaType := "image/" + format + + description := fmt.Sprintf("Image from %s (type: %s)", input.Path, mediaType) + if resized { + description += " [resized]" + } - // Create a Content object that includes both text and the image return llm.ToolOut{LLMContent: []llm.Content{ { Type: llm.ContentTypeText, - Text: fmt.Sprintf("Image from %s (type: %s)", input.Path, imageType), + 
Text: description, }, { - Type: llm.ContentTypeText, // Will be mapped to image in content array - MediaType: imageType, + Type: llm.ContentTypeText, + MediaType: mediaType, Data: base64Data, }, }} diff --git a/claudetool/browse/browse_test.go b/claudetool/browse/browse_test.go index fc3341aa24943449fa7c4a712bd6493ca6216ca1..ae36559c15f07a4f2ded924ac9829858d7b8e37c 100644 --- a/claudetool/browse/browse_test.go +++ b/claudetool/browse/browse_test.go @@ -1,9 +1,14 @@ package browse import ( + "bytes" "context" + "encoding/base64" "encoding/json" "fmt" + "image" + "image/color" + "image/png" "os" "path/filepath" "slices" @@ -17,7 +22,7 @@ import ( func TestToolCreation(t *testing.T) { // Create browser tools instance - tools := NewBrowseTools(context.Background()) + tools := NewBrowseTools(context.Background(), 0, 0) t.Cleanup(func() { tools.Close() }) @@ -66,7 +71,7 @@ func TestToolCreation(t *testing.T) { func TestGetTools(t *testing.T) { // Create browser tools instance - tools := NewBrowseTools(context.Background()) + tools := NewBrowseTools(context.Background(), 0, 0) t.Cleanup(func() { tools.Close() }) @@ -107,7 +112,7 @@ func TestBrowserInitialization(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) defer cancel() - tools := NewBrowseTools(ctx) + tools := NewBrowseTools(ctx, 0, 0) t.Cleanup(func() { tools.Close() }) @@ -145,7 +150,7 @@ func TestNavigateTool(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) defer cancel() - tools := NewBrowseTools(ctx) + tools := NewBrowseTools(ctx, 0, 0) t.Cleanup(func() { tools.Close() }) @@ -202,7 +207,7 @@ func TestNavigateTool(t *testing.T) { func TestScreenshotTool(t *testing.T) { // Create browser tools instance ctx := context.Background() - tools := NewBrowseTools(ctx) + tools := NewBrowseTools(ctx, 0, 0) t.Cleanup(func() { tools.Close() }) @@ -239,7 +244,7 @@ func TestScreenshotTool(t *testing.T) { func TestReadImageTool(t *testing.T) { // 
Create a test BrowseTools instance ctx := context.Background() - browseTools := NewBrowseTools(ctx) + browseTools := NewBrowseTools(ctx, 0, 0) t.Cleanup(func() { browseTools.Close() }) @@ -304,7 +309,7 @@ func TestDefaultViewportSize(t *testing.T) { t.Skip("Skipping browser test in CI/headless environment") } - tools := NewBrowseTools(ctx) + tools := NewBrowseTools(ctx, 0, 0) t.Cleanup(func() { tools.Close() }) @@ -364,7 +369,7 @@ func TestBrowserIdleShutdownAndRestart(t *testing.T) { // Use a short idle timeout for testing idleTimeout := 100 * time.Millisecond - tools := NewBrowseToolsWithIdleTimeout(ctx, idleTimeout) + tools := NewBrowseTools(ctx, idleTimeout, 0) t.Cleanup(func() { tools.Close() }) @@ -406,3 +411,74 @@ func TestBrowserIdleShutdownAndRestart(t *testing.T) { t.Fatalf("Navigate failed after restart: %v", toolOut.Error) } } + +func TestReadImageToolResizesLargeImage(t *testing.T) { + // Create a test BrowseTools instance with max dimension of 2000 + ctx := context.Background() + browseTools := NewBrowseTools(ctx, 0, 2000) + t.Cleanup(func() { + browseTools.Close() + }) + + // Create a large test image (3000x2500 pixels) + testDir := t.TempDir() + testImagePath := filepath.Join(testDir, "large_image.png") + + // Create a large image using image package + img := image.NewRGBA(image.Rect(0, 0, 3000, 2500)) + for y := 0; y < 2500; y++ { + for x := 0; x < 3000; x++ { + img.Set(x, y, color.RGBA{R: 100, G: 150, B: 200, A: 255}) + } + } + + f, err := os.Create(testImagePath) + if err != nil { + t.Fatalf("Failed to create test image file: %v", err) + } + if err := png.Encode(f, img); err != nil { + f.Close() + t.Fatalf("Failed to encode test image: %v", err) + } + f.Close() + + // Create the tool + readImageTool := browseTools.NewReadImageTool() + + // Prepare input + input := fmt.Sprintf(`{"path": "%s"}`, testImagePath) + + // Run the tool + toolOut := readImageTool.Run(ctx, json.RawMessage(input)) + if toolOut.Error != nil { + t.Fatalf("Read image tool 
failed: %v", toolOut.Error) + } + result := toolOut.LLMContent + + // Check that we got at least two content objects + if len(result) < 2 { + t.Fatalf("Expected at least 2 content objects, got %d", len(result)) + } + + // Check that the description mentions resizing + if !strings.Contains(result[0].Text, "resized") { + t.Errorf("Expected description to mention resizing, got: %s", result[0].Text) + } + + // Decode the returned image and verify dimensions are within limits + imageData, err := base64.StdEncoding.DecodeString(result[1].Data) + if err != nil { + t.Fatalf("Failed to decode base64 image: %v", err) + } + + config, _, err := image.DecodeConfig(bytes.NewReader(imageData)) + if err != nil { + t.Fatalf("Failed to decode image config: %v", err) + } + + if config.Width > 2000 || config.Height > 2000 { + t.Errorf("Image dimensions still exceed 2000 pixels: %dx%d", config.Width, config.Height) + } + + t.Logf("Large image resized from 3000x2500 to %dx%d", config.Width, config.Height) +} diff --git a/claudetool/browse/register.go b/claudetool/browse/register.go index c9317850dc2604a14d84178b7f30b60ea95b4075..c3e0035fcd4910270f9295561f95f15ddbc8f1fc 100644 --- a/claudetool/browse/register.go +++ b/claudetool/browse/register.go @@ -9,8 +9,9 @@ import ( // RegisterBrowserTools returns all browser tools ready to be added to an agent. // It also returns a cleanup function that should be called when done to properly close the browser. // The browser will be initialized lazily when a browser tool is first used. -func RegisterBrowserTools(ctx context.Context, supportsScreenshots bool) ([]*llm.Tool, func()) { - browserTools := NewBrowseTools(ctx) +// maxImageDimension is the max pixel dimension for images (0 means unlimited; images are not resized). 
+func RegisterBrowserTools(ctx context.Context, supportsScreenshots bool, maxImageDimension int) ([]*llm.Tool, func()) { + browserTools := NewBrowseTools(ctx, 0, maxImageDimension) return browserTools.GetTools(supportsScreenshots), func() { browserTools.Close() diff --git a/claudetool/toolset.go b/claudetool/toolset.go index dd06779aea200e3a7956f3a7fb63f3f4ba8c1d91..6524f86a3e1378f8bfa7a2c50d8e0c4c0acddb7e 100644 --- a/claudetool/toolset.go +++ b/claudetool/toolset.go @@ -122,7 +122,14 @@ func NewToolSet(ctx context.Context, cfg ToolSetConfig) *ToolSet { var cleanup func() if cfg.EnableBrowser { - browserTools, browserCleanup := browse.RegisterBrowserTools(ctx, true) + // Get max image dimension from the LLM service + maxImageDimension := 0 + if cfg.LLMProvider != nil && cfg.ModelID != "" { + if svc, err := cfg.LLMProvider.GetService(cfg.ModelID); err == nil { + maxImageDimension = svc.MaxImageDimension() + } + } + browserTools, browserCleanup := browse.RegisterBrowserTools(ctx, true, maxImageDimension) if len(browserTools) > 0 { tools = append(tools, browserTools...) 
} diff --git a/go.mod b/go.mod index a0523e93d2aeaf46d01857d62c384460851124df..2c9d69b207c7a27ef22b58c133363808f9190fad 100644 --- a/go.mod +++ b/go.mod @@ -63,6 +63,7 @@ require ( go.uber.org/zap v1.27.0 // indirect golang.org/x/crypto v0.46.0 // indirect golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b // indirect + golang.org/x/image v0.34.0 // indirect golang.org/x/mod v0.31.0 // indirect golang.org/x/net v0.48.0 // indirect golang.org/x/text v0.32.0 // indirect diff --git a/go.sum b/go.sum index 2b38e0b549c81cf497a69db62d9f9559f2077f16..1486e25c5356d1d70c19f803a186b8cab0f34ab9 100644 --- a/go.sum +++ b/go.sum @@ -182,6 +182,8 @@ golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU= golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0= golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b h1:M2rDM6z3Fhozi9O7NWsxAkg/yqS/lQJ6PmkyIV3YP+o= golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b/go.mod h1:3//PLf8L/X+8b4vuAfHzxeRUl04Adcb341+IGKfnqS8= +golang.org/x/image v0.34.0 h1:33gCkyw9hmwbZJeZkct8XyR11yH889EQt/QH4VmXMn8= +golang.org/x/image v0.34.0/go.mod h1:2RNFBZRB+vnwwFil8GkMdRvrJOFd1AzdZI6vOY+eJVU= golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mod v0.31.0 h1:HaW9xtz0+kOcWKwli0ZXy79Ix+UW/vOfmWI5QVd2tgI= golang.org/x/mod v0.31.0/go.mod h1:43JraMp9cGx1Rx3AqioxrbrhNsLl2l/iNAvuBkrezpg= diff --git a/llm/ant/ant.go b/llm/ant/ant.go index 4e8630611feecfbeaf9e73f34e92096b1b0c1f06..a1b9acf10013a79c3a8359c181f3992d61d7e340 100644 --- a/llm/ant/ant.go +++ b/llm/ant/ant.go @@ -74,6 +74,12 @@ func (s *Service) TokenContextWindow() int { } } +// MaxImageDimension returns the maximum allowed image dimension for multi-image requests. +// Anthropic enforces a 2000 pixel limit when multiple images are in a conversation. 
+func (s *Service) MaxImageDimension() int { + return 2000 +} + // HTTPRecorder is a callback for recording HTTP request/response data for debugging type HTTPRecorder func(url string, requestBody, responseBody []byte, statusCode int, err error, duration time.Duration) diff --git a/llm/gem/gem.go b/llm/gem/gem.go index 9663ac875fb81b4a0ef332ea6a728a351f17c065..7b41614e2303dc70ead57d883677972cb9119a60 100644 --- a/llm/gem/gem.go +++ b/llm/gem/gem.go @@ -461,6 +461,12 @@ func (s *Service) TokenContextWindow() int { } } +// MaxImageDimension returns the maximum allowed image dimension. +// TODO: determine actual Gemini image dimension limits +func (s *Service) MaxImageDimension() int { + return 0 // No known limit +} + // Do sends a request to Gemini. func (s *Service) Do(ctx context.Context, ir *llm.Request) (*llm.Response, error) { // Log the incoming request for debugging diff --git a/llm/imageutil/resize.go b/llm/imageutil/resize.go new file mode 100644 index 0000000000000000000000000000000000000000..a16fe630890dcc3ab86f79e2678f1277bcaf3a61 --- /dev/null +++ b/llm/imageutil/resize.go @@ -0,0 +1,62 @@ +// Package imageutil provides image manipulation utilities. +package imageutil + +import ( + "bytes" + "fmt" + "image" + "image/jpeg" + "image/png" + "strings" + + "golang.org/x/image/draw" +) + +// ResizeImage resizes an image if any dimension exceeds maxDimension. +// Returns the resized image bytes and the format ("png" or "jpeg"). +// If no resize is needed, returns the original data unchanged. 
+func ResizeImage(data []byte, maxDimension int) (resized []byte, format string, didResize bool, err error) { + img, detectedFormat, err := image.Decode(bytes.NewReader(data)) + if err != nil { + return nil, "", false, fmt.Errorf("failed to decode image: %w", err) + } + + bounds := img.Bounds() + width := bounds.Dx() + height := bounds.Dy() + + if width <= maxDimension && height <= maxDimension { + return data, detectedFormat, false, nil + } + + // Calculate new dimensions preserving aspect ratio + newWidth, newHeight := width, height + if width > height { + newWidth = maxDimension + newHeight = height * maxDimension / width + } else { + newHeight = maxDimension + newWidth = width * maxDimension / height + } + + // Create resized image + resizedImg := image.NewRGBA(image.Rect(0, 0, newWidth, newHeight)) + draw.BiLinear.Scale(resizedImg, resizedImg.Bounds(), img, bounds, draw.Over, nil) + + // Encode to the same format + var buf bytes.Buffer + switch strings.ToLower(detectedFormat) { + case "jpeg", "jpg": + err = jpeg.Encode(&buf, resizedImg, &jpeg.Options{Quality: 85}) + format = "jpeg" + default: + err = png.Encode(&buf, resizedImg) + format = "png" + } + + if err != nil { + return nil, "", false, fmt.Errorf("failed to encode resized image: %w", err) + } + + return buf.Bytes(), format, true, nil +} diff --git a/llm/imageutil/resize_test.go b/llm/imageutil/resize_test.go new file mode 100644 index 0000000000000000000000000000000000000000..df27b9d0f430e486c3d31c3f54ba133c0c15962f --- /dev/null +++ b/llm/imageutil/resize_test.go @@ -0,0 +1,70 @@ +package imageutil + +import ( + "bytes" + "image" + "image/color" + "image/png" + "testing" +) + +func createTestPNG(t *testing.T, width, height int) []byte { + img := image.NewRGBA(image.Rect(0, 0, width, height)) + for y := 0; y < height; y++ { + for x := 0; x < width; x++ { + img.Set(x, y, color.RGBA{R: 100, G: 150, B: 200, A: 255}) + } + } + var buf bytes.Buffer + if err := png.Encode(&buf, img); err != nil { + 
t.Fatalf("Failed to create test image: %v", err) + } + return buf.Bytes() +} + +func TestResizeImage(t *testing.T) { + tests := []struct { + name string + width int + height int + maxDim int + wantResize bool + wantMaxDim int + }{ + {"small image", 800, 600, 2000, false, 800}, + {"at limit", 2000, 2000, 2000, false, 2000}, + {"width exceeds", 3000, 1000, 2000, true, 2000}, + {"height exceeds", 1000, 3000, 2000, true, 2000}, + {"both exceed", 3000, 3000, 2000, true, 2000}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + data := createTestPNG(t, tt.width, tt.height) + resized, format, didResize, err := ResizeImage(data, tt.maxDim) + if err != nil { + t.Fatalf("ResizeImage() error = %v", err) + } + if didResize != tt.wantResize { + t.Errorf("ResizeImage() didResize = %v, want %v", didResize, tt.wantResize) + } + if format != "png" { + t.Errorf("ResizeImage() format = %v, want png", format) + } + if didResize { + // Verify the resized image dimensions + config, _, err := image.DecodeConfig(bytes.NewReader(resized)) + if err != nil { + t.Fatalf("Failed to decode resized image: %v", err) + } + if config.Width > tt.maxDim || config.Height > tt.maxDim { + t.Errorf("Resized image %dx%d still exceeds max %d", config.Width, config.Height, tt.maxDim) + } + } else { + if !bytes.Equal(resized, data) { + t.Error("Expected original data when no resize needed") + } + } + }) + } +} diff --git a/llm/llm.go b/llm/llm.go index bd6ecee399755e557878f3fe0ea052a4e2e5a1ce..d6414a7405a4dfac81677db7a0ee5fd2872cedd8 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -19,6 +19,10 @@ type Service interface { Do(context.Context, *Request) (*Response, error) // TokenContextWindow returns the maximum token context window size for this service TokenContextWindow() int + // MaxImageDimension returns the maximum allowed dimension (width or height) for images. + // For multi-image requests, some providers enforce stricter limits. + // Returns 0 if there is no limit. 
+ MaxImageDimension() int } type SimplifiedPatcher interface { diff --git a/llm/oai/oai.go b/llm/oai/oai.go index 911fa84fc82fd32bce9f63cb85476c53255017b1..2cf2f0d87983b5997ecd32d72002c32b62b42204 100644 --- a/llm/oai/oai.go +++ b/llm/oai/oai.go @@ -767,6 +767,12 @@ func (s *Service) TokenContextWindow() int { } } +// MaxImageDimension returns the maximum allowed image dimension. +// TODO: determine actual OpenAI image dimension limits +func (s *Service) MaxImageDimension() int { + return 0 // No known limit +} + // Do sends a request to OpenAI using the go-openai package. func (s *Service) Do(ctx context.Context, ir *llm.Request) (*llm.Response, error) { // Configure the OpenAI client diff --git a/llm/oai/oai_responses.go b/llm/oai/oai_responses.go index b6e4ff82f328d9909ed40286d4e9c2106072c4cb..3fedd053f0fd2e3ae02b1a3002bcae2b8b89c6e3 100644 --- a/llm/oai/oai_responses.go +++ b/llm/oai/oai_responses.go @@ -351,6 +351,12 @@ func (s *ResponsesService) TokenContextWindow() int { } } +// MaxImageDimension returns the maximum allowed image dimension. +// TODO: determine actual OpenAI image dimension limits +func (s *ResponsesService) MaxImageDimension() int { + return 0 // No known limit +} + // Do sends a request to OpenAI using the Responses API. func (s *ResponsesService) Do(ctx context.Context, ir *llm.Request) (*llm.Response, error) { httpc := cmp.Or(s.HTTPC, http.DefaultClient) diff --git a/loop/predictable.go b/loop/predictable.go index c8f658d3bde621e0736100d760ff2cdcebeb0dcd..2f51331deb73fd3834f674a52cfcf34efd880a49 100644 --- a/loop/predictable.go +++ b/loop/predictable.go @@ -52,6 +52,11 @@ func (s *PredictableService) TokenContextWindow() int { return s.tokenContextWindow } +// MaxImageDimension returns the maximum allowed image dimension. 
+func (s *PredictableService) MaxImageDimension() int { + return 2000 +} + // Do processes a request and returns a predictable response based on the input text func (s *PredictableService) Do(ctx context.Context, req *llm.Request) (*llm.Response, error) { // Store request for testing inspection diff --git a/models/models.go b/models/models.go index f2c684bda9e5ae35b4db49ef6baf3c3c819a96ee..ecfe214583acacbf609fc0a3f81df10a90a3ecf3 100644 --- a/models/models.go +++ b/models/models.go @@ -389,6 +389,11 @@ func (l *loggingService) TokenContextWindow() int { return l.service.TokenContextWindow() } +// MaxImageDimension delegates to the underlying service +func (l *loggingService) MaxImageDimension() int { + return l.service.MaxImageDimension() +} + // UseSimplifiedPatch delegates to the underlying service if it supports it func (l *loggingService) UseSimplifiedPatch() bool { if sp, ok := l.service.(llm.SimplifiedPatcher); ok { diff --git a/slug/slug_test.go b/slug/slug_test.go index 8b4ef31130ba42f15b0fcfc2b4736f0604749233..c48f218fbf043f0e5d7076dbf9845ceaf39476e7 100644 --- a/slug/slug_test.go +++ b/slug/slug_test.go @@ -87,6 +87,10 @@ func (m *MockLLMService) TokenContextWindow() int { return 8192 // Mock token limit } +func (m *MockLLMService) MaxImageDimension() int { + return 0 // No limit for mock +} + // MockLLMProvider provides a mock LLM provider for testing type MockLLMProvider struct { Service *MockLLMService