Enforce 5MB per-image limit when converting images for language models (#45313)

## Problem

When users paste or drag large images into the agent panel, the encoded
payload can exceed upstream provider limits (e.g., Anthropic's 5MB
per-image limit), causing API errors.

## Solution

Enforce a default 5MB limit on encoded PNG bytes in
`LanguageModelImage::from_image`:

1. Apply existing Anthropic dimension limits first (1568px max in either
dimension)
2. Iteratively downscale by ~15% per pass, re-encoding each time, until the
encoded PNG is at or below 5MB
3. Return `None` if the encoded image still exceeds 5MB after 8 passes
(fail-safe: drop the image rather than hit a provider error)

The limit is enforced at the `LanguageModelImage` conversion layer,
which is the choke point for all image ingestion paths (agent panel
paste/drag, file mentions, text threads, etc.).
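
For orientation, here's a condensed sketch of the enforcement loop; `shrink_to_fit` and `encode_png` are illustrative helper names for this description, not the actual structure of `from_image` (the real code is in the diff below):

```rust
// Sketch only: MAX_BYTES and MAX_PASSES mirror DEFAULT_IMAGE_MAX_BYTES and
// MAX_IMAGE_DOWNSCALE_PASSES from the diff below.
use image::{DynamicImage, GenericImageView as _, codecs::png::PngEncoder, imageops::FilterType};

const MAX_BYTES: usize = 5 * 1024 * 1024;
const MAX_PASSES: usize = 8;

fn encode_png(image: &DynamicImage) -> Option<Vec<u8>> {
    let mut out = Vec::new();
    image.write_with_encoder(PngEncoder::new(&mut out)).ok()?;
    Some(out)
}

/// Returns encoded PNG bytes at or below MAX_BYTES, or None if the image
/// still doesn't fit after MAX_PASSES downscale/re-encode rounds.
fn shrink_to_fit(mut image: DynamicImage) -> Option<Vec<u8>> {
    let mut png = encode_png(&image)?;
    for _ in 0..MAX_PASSES {
        if png.len() <= MAX_BYTES {
            return Some(png);
        }
        let (w, h) = image.dimensions();
        if w <= 1 || h <= 1 {
            break;
        }
        // PNG size isn't a clean function of pixel count, so shrink ~15% per
        // pass and re-check the encoded size after each re-encode.
        let new_w = ((w as f32) * 0.85).round().max(1.0) as u32;
        let new_h = ((h as f32) * 0.85).round().max(1.0) as u32;
        image = image.resize(new_w, new_h, FilterType::Triangle);
        png = encode_png(&image)?;
    }
    (png.len() <= MAX_BYTES).then_some(png)
}
```

Geometric shrinking converges quickly: each pass cuts the pixel count to ~72% (0.85²), so 8 passes can take an image down to roughly 7% of its original pixel count.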

## Future Work

The 5MB limit is a conservative default. Provider-specific limits can be
introduced later by adding a `from_image_with_constraints` API.
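
As a sketch of one possible shape (hypothetical; `ImageConstraints` and this signature do not exist in this PR, and only the method name comes from the plan above):

```rust
// Hypothetical future API, assumed to mirror `from_image`'s signature.
pub struct ImageConstraints {
    /// Maximum width/height in pixels (1568 for Anthropic today).
    pub max_dimension: u32,
    /// Maximum encoded payload in bytes, measured before base64 encoding.
    pub max_bytes: usize,
}

impl LanguageModelImage {
    pub fn from_image_with_constraints(
        data: Arc<Image>,
        constraints: ImageConstraints,
        cx: &mut App,
    ) -> Task<Option<Self>> {
        // Same pipeline as `from_image`, with `constraints` substituted for
        // ANTHROPIC_SIZE_LIMIT and DEFAULT_IMAGE_MAX_BYTES.
        todo!()
    }
}
```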

## Testing

Added a regression test that:
1. Generates a noisy 4096x4096 PNG (guaranteed >5MB)
2. Converts it via `LanguageModelImage::from_image`
3. Asserts the result is ≤5MB and was actually downscaled
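
For scale: a 4096x4096 RGBA image is 64 MiB uncompressed, and the per-pixel noise exists precisely to keep PNG's lossless compression from landing anywhere near 5MB. The test doesn't just trust that: it asserts the >5MB precondition on the generated bytes before converting. Assuming the standard workspace layout, `cargo test -p language_model` should run it.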

---

**Note:** This PR builds on #45312 (prompt store fail-open fix). Please
merge that first.

cc @rtfeldman

---------

Co-authored-by: Zed Zippy <234243425+zed-zippy[bot]@users.noreply.github.com>

Change summary

crates/language_model/src/request.rs | 168 ++++++++++++++++++++++++-----
1 file changed, 138 insertions(+), 30 deletions(-)

Detailed changes

crates/language_model/src/request.rs

@@ -8,6 +8,7 @@ use gpui::{
     App, AppContext as _, DevicePixels, Image, ImageFormat, ObjectFit, SharedString, Size, Task,
     point, px, size,
 };
+use image::GenericImageView as _;
 use image::codecs::png::PngEncoder;
 use serde::{Deserialize, Serialize};
 use util::ResultExt;
@@ -80,6 +81,16 @@ impl std::fmt::Debug for LanguageModelImage {
 /// Anthropic wants uploaded images to be smaller than this in both dimensions.
 const ANTHROPIC_SIZE_LIMIT: f32 = 1568.;
 
+/// Default per-image hard limit (in bytes) for the encoded image payload we send upstream.
+///
+/// NOTE: `LanguageModelImage.source` is base64-encoded PNG bytes (without the `data:` prefix).
+/// This limit is enforced on the encoded PNG bytes *before* base64 encoding.
+const DEFAULT_IMAGE_MAX_BYTES: usize = 5 * 1024 * 1024;
+
+/// Conservative cap on how many times we'll attempt to shrink/re-encode an image to fit
+/// `DEFAULT_IMAGE_MAX_BYTES`.
+const MAX_IMAGE_DOWNSCALE_PASSES: usize = 8;
+
 impl LanguageModelImage {
     pub fn empty() -> Self {
         Self {
@@ -112,29 +123,62 @@ impl LanguageModelImage {
             let height = dynamic_image.height();
             let image_size = size(DevicePixels(width as i32), DevicePixels(height as i32));
 
-            let base64_image = {
-                if image_size.width.0 > ANTHROPIC_SIZE_LIMIT as i32
-                    || image_size.height.0 > ANTHROPIC_SIZE_LIMIT as i32
-                {
-                    let new_bounds = ObjectFit::ScaleDown.get_bounds(
-                        gpui::Bounds {
-                            origin: point(px(0.0), px(0.0)),
-                            size: size(px(ANTHROPIC_SIZE_LIMIT), px(ANTHROPIC_SIZE_LIMIT)),
-                        },
-                        image_size,
-                    );
-                    let resized_image = dynamic_image.resize(
-                        new_bounds.size.width.into(),
-                        new_bounds.size.height.into(),
-                        image::imageops::FilterType::Triangle,
-                    );
-
-                    encode_as_base64(data, resized_image)
-                } else {
-                    encode_as_base64(data, dynamic_image)
+            // First apply any provider-specific dimension constraints we know about (Anthropic).
+            let mut processed_image = if image_size.width.0 > ANTHROPIC_SIZE_LIMIT as i32
+                || image_size.height.0 > ANTHROPIC_SIZE_LIMIT as i32
+            {
+                let new_bounds = ObjectFit::ScaleDown.get_bounds(
+                    gpui::Bounds {
+                        origin: point(px(0.0), px(0.0)),
+                        size: size(px(ANTHROPIC_SIZE_LIMIT), px(ANTHROPIC_SIZE_LIMIT)),
+                    },
+                    image_size,
+                );
+                dynamic_image.resize(
+                    new_bounds.size.width.into(),
+                    new_bounds.size.height.into(),
+                    image::imageops::FilterType::Triangle,
+                )
+            } else {
+                dynamic_image
+            };
+
+            // Then enforce a default per-image size cap on the encoded PNG bytes.
+            //
+            // We always re-encode the (possibly resized) image to PNG and send the bytes
+            // base64'd. The upstream provider limit is effectively on the binary image
+            // payload size, so we enforce it against the encoded PNG bytes before base64 encoding.
+            let mut encoded_png = encode_png_bytes(&processed_image).log_err()?;
+            for _pass in 0..MAX_IMAGE_DOWNSCALE_PASSES {
+                if encoded_png.len() <= DEFAULT_IMAGE_MAX_BYTES {
+                    break;
                 }
+
+                // Scale down geometrically to converge quickly. We don't know the final PNG size
+                // as a function of pixels, so we iteratively shrink.
+                let (w, h) = processed_image.dimensions();
+                if w <= 1 || h <= 1 {
+                    break;
+                }
+
+                // Shrink by ~15% each pass (0.85). This is a compromise between speed and
+                // preserving image detail.
+                let new_w = ((w as f32) * 0.85).round().max(1.0) as u32;
+                let new_h = ((h as f32) * 0.85).round().max(1.0) as u32;
+
+                processed_image =
+                    processed_image.resize(new_w, new_h, image::imageops::FilterType::Triangle);
+                encoded_png = encode_png_bytes(&processed_image).log_err()?;
             }
-            .log_err()?;
+
+            if encoded_png.len() > DEFAULT_IMAGE_MAX_BYTES {
+                // Still too large after multiple passes; treat as non-convertible for now.
+                // (Provider-specific handling can be introduced later.)
+                return None;
+            }
+
+            // Now base64 encode the PNG bytes.
+            let base64_image = encode_bytes_as_base64(encoded_png.as_slice()).log_err()?;
 
             // SAFETY: The base64 encoder should not produce non-UTF8.
             let source = unsafe { String::from_utf8_unchecked(base64_image) };
@@ -164,21 +208,20 @@ impl LanguageModelImage {
     }
 }
 
-fn encode_as_base64(data: Arc<Image>, image: image::DynamicImage) -> Result<Vec<u8>> {
+fn encode_png_bytes(image: &image::DynamicImage) -> Result<Vec<u8>> {
+    let mut png = Vec::new();
+    image.write_with_encoder(PngEncoder::new(&mut png))?;
+    Ok(png)
+}
+
+fn encode_bytes_as_base64(bytes: &[u8]) -> Result<Vec<u8>> {
     let mut base64_image = Vec::new();
     {
         let mut base64_encoder = EncoderWriter::new(
             Cursor::new(&mut base64_image),
             &base64::engine::general_purpose::STANDARD,
         );
-        if data.format() == ImageFormat::Png {
-            base64_encoder.write_all(data.bytes())?;
-        } else {
-            let mut png = Vec::new();
-            image.write_with_encoder(PngEncoder::new(&mut png))?;
-
-            base64_encoder.write_all(png.as_slice())?;
-        }
+        base64_encoder.write_all(bytes)?;
     }
     Ok(base64_image)
 }
@@ -417,6 +460,71 @@ pub struct LanguageModelResponseMessage {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use base64::Engine as _;
+    use gpui::TestAppContext;
+    use image::ImageDecoder as _;
+
+    fn base64_to_png_bytes(base64_png: &str) -> Vec<u8> {
+        base64::engine::general_purpose::STANDARD
+            .decode(base64_png.as_bytes())
+            .expect("base64 should decode")
+    }
+
+    fn png_dimensions(png_bytes: &[u8]) -> (u32, u32) {
+        let decoder =
+            image::codecs::png::PngDecoder::new(Cursor::new(png_bytes)).expect("png should decode");
+        decoder.dimensions()
+    }
+
+    fn make_noisy_png_bytes(width: u32, height: u32) -> Vec<u8> {
+        // Create an RGBA image with per-pixel variance to avoid PNG compressing too well.
+        let mut img = image::RgbaImage::new(width, height);
+        for y in 0..height {
+            for x in 0..width {
+                let r = ((x ^ y) & 0xFF) as u8;
+                let g = ((x.wrapping_mul(31) ^ y.wrapping_mul(17)) & 0xFF) as u8;
+                let b = ((x.wrapping_mul(131) ^ y.wrapping_mul(7)) & 0xFF) as u8;
+                img.put_pixel(x, y, image::Rgba([r, g, b, 0xFF]));
+            }
+        }
+
+        let mut out = Vec::new();
+        image::DynamicImage::ImageRgba8(img)
+            .write_with_encoder(PngEncoder::new(&mut out))
+            .expect("png encoding should succeed");
+        out
+    }
+
+    #[gpui::test]
+    async fn test_from_image_downscales_to_default_5mb_limit(cx: &mut TestAppContext) {
+        // Pick a size that reliably produces a PNG > 5MB when filled with noise.
+        // If this fails (image is too small), bump dimensions.
+        let original_png = make_noisy_png_bytes(4096, 4096);
+        assert!(
+            original_png.len() > DEFAULT_IMAGE_MAX_BYTES,
+            "precondition failed: noisy PNG must exceed DEFAULT_IMAGE_MAX_BYTES"
+        );
+
+        let image = gpui::Image::from_bytes(ImageFormat::Png, original_png);
+        let lm_image = cx
+            .update(|cx| LanguageModelImage::from_image(Arc::new(image), cx))
+            .await
+            .expect("image conversion should succeed");
+
+        let encoded_png = base64_to_png_bytes(lm_image.source.as_ref());
+        assert!(
+            encoded_png.len() <= DEFAULT_IMAGE_MAX_BYTES,
+            "expected encoded PNG <= DEFAULT_IMAGE_MAX_BYTES, got {} bytes",
+            encoded_png.len()
+        );
+
+        // Ensure we actually downscaled in pixels (not just re-encoded).
+        let (w, h) = png_dimensions(&encoded_png);
+        assert!(
+            w < 4096 || h < 4096,
+            "expected image to be downscaled in at least one dimension; got {w}x{h}"
+        );
+    }
 
     #[test]
     fn test_language_model_tool_result_content_deserialization() {