// request.rs

  1use std::io::{Cursor, Write};
  2use std::sync::Arc;
  3
  4use anyhow::Result;
  5use base64::write::EncoderWriter;
  6use gpui::{
  7    App, AppContext as _, DevicePixels, Image, ImageFormat, ObjectFit, Size, Task, point, px, size,
  8};
  9use image::GenericImageView as _;
 10use image::codecs::png::PngEncoder;
 11use util::ResultExt;
 12
 13use language_model_core::{ImageSize, LanguageModelImage};
 14
/// Anthropic wants uploaded images to be smaller than this in both dimensions.
/// Applied as a bounding box: the image is scaled down (aspect ratio preserved)
/// until both width and height fit under this value.
const ANTHROPIC_SIZE_LIMIT: f32 = 1568.;

/// Default per-image hard limit (in bytes) for the encoded image payload we send upstream.
///
/// NOTE: `LanguageModelImage.source` is base64-encoded PNG bytes (without the `data:` prefix).
/// This limit is enforced on the encoded PNG bytes *before* base64 encoding, so the
/// transmitted base64 string is ~33% larger than this cap.
const DEFAULT_IMAGE_MAX_BYTES: usize = 5 * 1024 * 1024;

/// Conservative cap on how many times we'll attempt to shrink/re-encode an image to fit
/// `DEFAULT_IMAGE_MAX_BYTES`. Each pass shrinks both dimensions by ~15%, so eight
/// passes reduce the pixel count by roughly 13x in the worst case.
const MAX_IMAGE_DOWNSCALE_PASSES: usize = 8;
 27
/// Extension trait for `LanguageModelImage` that provides GPUI-dependent functionality.
pub trait LanguageModelImageExt {
    /// The wire format the image payload is encoded in before base64 encoding.
    const FORMAT: ImageFormat;
    /// Decode, downscale, and base64-encode `data` on a background task.
    /// Resolves to `None` if the format is unsupported or conversion fails.
    fn from_image(data: Arc<Image>, cx: &mut App) -> Task<Option<LanguageModelImage>>;
}
 33
 34impl LanguageModelImageExt for LanguageModelImage {
 35    const FORMAT: ImageFormat = ImageFormat::Png;
 36
 37    fn from_image(data: Arc<Image>, cx: &mut App) -> Task<Option<LanguageModelImage>> {
 38        cx.background_spawn(async move {
 39            let image_bytes = Cursor::new(data.bytes());
 40            let dynamic_image = match data.format() {
 41                ImageFormat::Png => image::codecs::png::PngDecoder::new(image_bytes)
 42                    .and_then(image::DynamicImage::from_decoder),
 43                ImageFormat::Jpeg => image::codecs::jpeg::JpegDecoder::new(image_bytes)
 44                    .and_then(image::DynamicImage::from_decoder),
 45                ImageFormat::Webp => image::codecs::webp::WebPDecoder::new(image_bytes)
 46                    .and_then(image::DynamicImage::from_decoder),
 47                ImageFormat::Gif => image::codecs::gif::GifDecoder::new(image_bytes)
 48                    .and_then(image::DynamicImage::from_decoder),
 49                ImageFormat::Bmp => image::codecs::bmp::BmpDecoder::new(image_bytes)
 50                    .and_then(image::DynamicImage::from_decoder),
 51                ImageFormat::Tiff => image::codecs::tiff::TiffDecoder::new(image_bytes)
 52                    .and_then(image::DynamicImage::from_decoder),
 53                _ => return None,
 54            }
 55            .log_err()?;
 56
 57            let width = dynamic_image.width();
 58            let height = dynamic_image.height();
 59            let image_size = size(DevicePixels(width as i32), DevicePixels(height as i32));
 60
 61            // First apply any provider-specific dimension constraints we know about (Anthropic).
 62            let mut processed_image = if image_size.width.0 > ANTHROPIC_SIZE_LIMIT as i32
 63                || image_size.height.0 > ANTHROPIC_SIZE_LIMIT as i32
 64            {
 65                let new_bounds = ObjectFit::ScaleDown.get_bounds(
 66                    gpui::Bounds {
 67                        origin: point(px(0.0), px(0.0)),
 68                        size: size(px(ANTHROPIC_SIZE_LIMIT), px(ANTHROPIC_SIZE_LIMIT)),
 69                    },
 70                    image_size,
 71                );
 72                dynamic_image.resize(
 73                    new_bounds.size.width.into(),
 74                    new_bounds.size.height.into(),
 75                    image::imageops::FilterType::Triangle,
 76                )
 77            } else {
 78                dynamic_image
 79            };
 80
 81            // Then enforce a default per-image size cap on the encoded PNG bytes.
 82            //
 83            // We always send PNG bytes (either original PNG bytes, or re-encoded PNG) base64'd.
 84            // The upstream provider limit we want to respect is effectively on the binary image
 85            // payload size, so we enforce against the encoded PNG bytes before base64 encoding.
 86            let mut encoded_png = encode_png_bytes(&processed_image).log_err()?;
 87            for _pass in 0..MAX_IMAGE_DOWNSCALE_PASSES {
 88                if encoded_png.len() <= DEFAULT_IMAGE_MAX_BYTES {
 89                    break;
 90                }
 91
 92                // Scale down geometrically to converge quickly. We don't know the final PNG size
 93                // as a function of pixels, so we iteratively shrink.
 94                let (w, h) = processed_image.dimensions();
 95                if w <= 1 || h <= 1 {
 96                    break;
 97                }
 98
 99                // Shrink by ~15% each pass (0.85). This is a compromise between speed and
100                // preserving image detail.
101                let new_w = ((w as f32) * 0.85).round().max(1.0) as u32;
102                let new_h = ((h as f32) * 0.85).round().max(1.0) as u32;
103
104                processed_image =
105                    processed_image.resize(new_w, new_h, image::imageops::FilterType::Triangle);
106                encoded_png = encode_png_bytes(&processed_image).log_err()?;
107            }
108
109            if encoded_png.len() > DEFAULT_IMAGE_MAX_BYTES {
110                // Still too large after multiple passes; treat as non-convertible for now.
111                // (Provider-specific handling can be introduced later.)
112                return None;
113            }
114
115            // Now base64 encode the PNG bytes.
116            let base64_image = encode_bytes_as_base64(encoded_png.as_slice()).log_err()?;
117
118            // SAFETY: The base64 encoder should not produce non-UTF8.
119            let source = unsafe { String::from_utf8_unchecked(base64_image) };
120
121            let (final_width, final_height) = processed_image.dimensions();
122
123            Some(LanguageModelImage {
124                size: Some(ImageSize {
125                    width: final_width as i32,
126                    height: final_height as i32,
127                }),
128                source: source.into(),
129            })
130        })
131    }
132}
133
134fn encode_png_bytes(image: &image::DynamicImage) -> Result<Vec<u8>> {
135    let mut png = Vec::new();
136    image.write_with_encoder(PngEncoder::new(&mut png))?;
137    Ok(png)
138}
139
140fn encode_bytes_as_base64(bytes: &[u8]) -> Result<Vec<u8>> {
141    let mut base64_image = Vec::new();
142    {
143        let mut base64_encoder = EncoderWriter::new(
144            Cursor::new(&mut base64_image),
145            &base64::engine::general_purpose::STANDARD,
146        );
147        base64_encoder.write_all(bytes)?;
148    }
149    Ok(base64_image)
150}
151
152/// Convert a core `ImageSize` to a gpui `Size<DevicePixels>`.
153pub fn image_size_to_gpui(size: ImageSize) -> Size<DevicePixels> {
154    Size {
155        width: DevicePixels(size.width),
156        height: DevicePixels(size.height),
157    }
158}
159
160/// Convert a gpui `Size<DevicePixels>` to a core `ImageSize`.
161pub fn gpui_size_to_image_size(size: Size<DevicePixels>) -> ImageSize {
162    ImageSize {
163        width: size.width.0,
164        height: size.height.0,
165    }
166}
167
#[cfg(test)]
mod tests {
    use super::*;
    use base64::Engine as _;
    use gpui::TestAppContext;

    /// Decode a base64 payload (as produced by `from_image`) back into raw PNG bytes.
    fn base64_to_png_bytes(base64: &str) -> Vec<u8> {
        base64::engine::general_purpose::STANDARD
            .decode(base64)
            .expect("valid base64")
    }

    /// Return the `(width, height)` of an encoded PNG.
    fn png_dimensions(png_bytes: &[u8]) -> (u32, u32) {
        let decoded = image::load_from_memory(png_bytes).expect("valid png");
        (decoded.width(), decoded.height())
    }

    /// Build a PNG filled with deterministic per-pixel hash noise so it compresses
    /// poorly and reliably exceeds the size limit at large dimensions.
    fn make_noisy_png_bytes(width: u32, height: u32) -> Vec<u8> {
        use image::{ImageBuffer, Rgba};
        use std::hash::{Hash, Hasher};

        let noisy = ImageBuffer::from_fn(width, height, |x, y| {
            let mut hasher = std::hash::DefaultHasher::new();
            (x, y, width, height).hash(&mut hasher);
            let h = hasher.finish();
            Rgba([h as u8, (h >> 8) as u8, (h >> 16) as u8, 255])
        });

        let mut out = Cursor::new(Vec::new());
        noisy
            .write_with_encoder(PngEncoder::new(&mut out))
            .expect("encode");
        out.into_inner()
    }

    #[gpui::test]
    async fn test_from_image_downscales_to_default_5mb_limit(cx: &mut TestAppContext) {
        // A 4096x4096 noise image is far larger than the 5 MB cap once PNG-encoded.
        let raw_png = make_noisy_png_bytes(4096, 4096);
        assert!(
            raw_png.len() > DEFAULT_IMAGE_MAX_BYTES,
            "Test image should exceed the 5 MB limit (actual: {} bytes)",
            raw_png.len()
        );

        let gpui_image = Arc::new(gpui::Image::from_bytes(ImageFormat::Png, raw_png));
        let converted = cx
            .update(|cx| LanguageModelImage::from_image(Arc::clone(&gpui_image), cx))
            .await
            .expect("from_image should succeed");

        // The base64 payload must decode to a PNG that fits under the cap.
        let decoded_png = base64_to_png_bytes(converted.source.as_ref());
        assert!(
            decoded_png.len() <= DEFAULT_IMAGE_MAX_BYTES,
            "Encoded PNG should be ≤ {} bytes after downscale, but was {} bytes",
            DEFAULT_IMAGE_MAX_BYTES,
            decoded_png.len()
        );

        // The downscale passes must have shrunk the image from its original dimensions.
        let (w, h) = png_dimensions(&decoded_png);
        assert!(
            w < 4096 && h < 4096,
            "Dimensions should have shrunk: got {}×{}",
            w,
            h
        );

        // The reported `ImageSize` must agree with the actual encoded PNG dimensions.
        let reported = converted.size.expect("ImageSize should be present");
        assert_eq!(
            reported.width, w as i32,
            "ImageSize.width should match the encoded PNG width after downscaling"
        );
        assert_eq!(
            reported.height, h as i32,
            "ImageSize.height should match the encoded PNG height after downscaling"
        );
    }
}