1use std::io::{Cursor, Write};
2use std::sync::Arc;
3
4use anyhow::Result;
5use base64::write::EncoderWriter;
6use gpui::{
7 App, AppContext as _, DevicePixels, Image, ImageFormat, ObjectFit, Size, Task, point, px, size,
8};
9use image::GenericImageView as _;
10use image::codecs::png::PngEncoder;
11use util::ResultExt;
12
13use language_model_core::{ImageSize, LanguageModelImage};
14
/// Anthropic wants uploaded images to be smaller than this in both dimensions.
const ANTHROPIC_SIZE_LIMIT: f32 = 1568.;

/// Default per-image hard limit (in bytes) for the encoded image payload we send upstream.
///
/// NOTE: `LanguageModelImage.source` is base64-encoded PNG bytes (without the `data:` prefix).
/// This limit is enforced on the encoded PNG bytes *before* base64 encoding,
/// so the base64 payload actually sent is ~4/3 of this size.
const DEFAULT_IMAGE_MAX_BYTES: usize = 5 * 1024 * 1024;

/// Conservative cap on how many times we'll attempt to shrink/re-encode an image to fit
/// `DEFAULT_IMAGE_MAX_BYTES`. Each pass shrinks dimensions by ~15%, so eight passes
/// reduce pixel count by roughly 10x in the worst case.
const MAX_IMAGE_DOWNSCALE_PASSES: usize = 8;
27
/// Extension trait for `LanguageModelImage` that provides GPUI-dependent functionality.
pub trait LanguageModelImageExt {
    /// Wire format of the encoded payload stored in `LanguageModelImage.source`
    /// (PNG for the current implementation).
    const FORMAT: ImageFormat;

    /// Decode `data` on a background task, downscale it to fit provider limits,
    /// and return it as a base64-encoded image. Resolves to `None` when the
    /// image cannot be decoded or cannot be shrunk under the size cap.
    fn from_image(data: Arc<Image>, cx: &mut App) -> Task<Option<LanguageModelImage>>;
}
33
34impl LanguageModelImageExt for LanguageModelImage {
35 const FORMAT: ImageFormat = ImageFormat::Png;
36
37 fn from_image(data: Arc<Image>, cx: &mut App) -> Task<Option<LanguageModelImage>> {
38 cx.background_spawn(async move {
39 let image_bytes = Cursor::new(data.bytes());
40 let dynamic_image = match data.format() {
41 ImageFormat::Png => image::codecs::png::PngDecoder::new(image_bytes)
42 .and_then(image::DynamicImage::from_decoder),
43 ImageFormat::Jpeg => image::codecs::jpeg::JpegDecoder::new(image_bytes)
44 .and_then(image::DynamicImage::from_decoder),
45 ImageFormat::Webp => image::codecs::webp::WebPDecoder::new(image_bytes)
46 .and_then(image::DynamicImage::from_decoder),
47 ImageFormat::Gif => image::codecs::gif::GifDecoder::new(image_bytes)
48 .and_then(image::DynamicImage::from_decoder),
49 ImageFormat::Bmp => image::codecs::bmp::BmpDecoder::new(image_bytes)
50 .and_then(image::DynamicImage::from_decoder),
51 ImageFormat::Tiff => image::codecs::tiff::TiffDecoder::new(image_bytes)
52 .and_then(image::DynamicImage::from_decoder),
53 _ => return None,
54 }
55 .log_err()?;
56
57 let width = dynamic_image.width();
58 let height = dynamic_image.height();
59 let image_size = size(DevicePixels(width as i32), DevicePixels(height as i32));
60
61 // First apply any provider-specific dimension constraints we know about (Anthropic).
62 let mut processed_image = if image_size.width.0 > ANTHROPIC_SIZE_LIMIT as i32
63 || image_size.height.0 > ANTHROPIC_SIZE_LIMIT as i32
64 {
65 let new_bounds = ObjectFit::ScaleDown.get_bounds(
66 gpui::Bounds {
67 origin: point(px(0.0), px(0.0)),
68 size: size(px(ANTHROPIC_SIZE_LIMIT), px(ANTHROPIC_SIZE_LIMIT)),
69 },
70 image_size,
71 );
72 dynamic_image.resize(
73 new_bounds.size.width.into(),
74 new_bounds.size.height.into(),
75 image::imageops::FilterType::Triangle,
76 )
77 } else {
78 dynamic_image
79 };
80
81 // Then enforce a default per-image size cap on the encoded PNG bytes.
82 //
83 // We always send PNG bytes (either original PNG bytes, or re-encoded PNG) base64'd.
84 // The upstream provider limit we want to respect is effectively on the binary image
85 // payload size, so we enforce against the encoded PNG bytes before base64 encoding.
86 let mut encoded_png = encode_png_bytes(&processed_image).log_err()?;
87 for _pass in 0..MAX_IMAGE_DOWNSCALE_PASSES {
88 if encoded_png.len() <= DEFAULT_IMAGE_MAX_BYTES {
89 break;
90 }
91
92 // Scale down geometrically to converge quickly. We don't know the final PNG size
93 // as a function of pixels, so we iteratively shrink.
94 let (w, h) = processed_image.dimensions();
95 if w <= 1 || h <= 1 {
96 break;
97 }
98
99 // Shrink by ~15% each pass (0.85). This is a compromise between speed and
100 // preserving image detail.
101 let new_w = ((w as f32) * 0.85).round().max(1.0) as u32;
102 let new_h = ((h as f32) * 0.85).round().max(1.0) as u32;
103
104 processed_image =
105 processed_image.resize(new_w, new_h, image::imageops::FilterType::Triangle);
106 encoded_png = encode_png_bytes(&processed_image).log_err()?;
107 }
108
109 if encoded_png.len() > DEFAULT_IMAGE_MAX_BYTES {
110 // Still too large after multiple passes; treat as non-convertible for now.
111 // (Provider-specific handling can be introduced later.)
112 return None;
113 }
114
115 // Now base64 encode the PNG bytes.
116 let base64_image = encode_bytes_as_base64(encoded_png.as_slice()).log_err()?;
117
118 // SAFETY: The base64 encoder should not produce non-UTF8.
119 let source = unsafe { String::from_utf8_unchecked(base64_image) };
120
121 let (final_width, final_height) = processed_image.dimensions();
122
123 Some(LanguageModelImage {
124 size: Some(ImageSize {
125 width: final_width as i32,
126 height: final_height as i32,
127 }),
128 source: source.into(),
129 })
130 })
131 }
132}
133
134fn encode_png_bytes(image: &image::DynamicImage) -> Result<Vec<u8>> {
135 let mut png = Vec::new();
136 image.write_with_encoder(PngEncoder::new(&mut png))?;
137 Ok(png)
138}
139
140fn encode_bytes_as_base64(bytes: &[u8]) -> Result<Vec<u8>> {
141 let mut base64_image = Vec::new();
142 {
143 let mut base64_encoder = EncoderWriter::new(
144 Cursor::new(&mut base64_image),
145 &base64::engine::general_purpose::STANDARD,
146 );
147 base64_encoder.write_all(bytes)?;
148 }
149 Ok(base64_image)
150}
151
152/// Convert a core `ImageSize` to a gpui `Size<DevicePixels>`.
153pub fn image_size_to_gpui(size: ImageSize) -> Size<DevicePixels> {
154 Size {
155 width: DevicePixels(size.width),
156 height: DevicePixels(size.height),
157 }
158}
159
160/// Convert a gpui `Size<DevicePixels>` to a core `ImageSize`.
161pub fn gpui_size_to_image_size(size: Size<DevicePixels>) -> ImageSize {
162 ImageSize {
163 width: size.width.0,
164 height: size.height.0,
165 }
166}
167
#[cfg(test)]
mod tests {
    use super::*;
    use base64::Engine as _;
    use gpui::TestAppContext;

    /// Decode the base64 payload produced by `from_image` back into raw PNG bytes.
    fn base64_to_png_bytes(base64: &str) -> Vec<u8> {
        base64::engine::general_purpose::STANDARD
            .decode(base64)
            .expect("valid base64")
    }

    /// Decode PNG bytes and return their (width, height) in pixels.
    fn png_dimensions(png_bytes: &[u8]) -> (u32, u32) {
        let img = image::load_from_memory(png_bytes).expect("valid png");
        (img.width(), img.height())
    }

    /// Build a deterministic "noisy" RGBA PNG of the given dimensions.
    /// Per-pixel pseudo-random colors (derived by hashing the coordinates)
    /// defeat PNG compression, so the encoded file is large enough to exceed
    /// the byte cap without any real randomness (the test stays reproducible).
    fn make_noisy_png_bytes(width: u32, height: u32) -> Vec<u8> {
        use image::{ImageBuffer, Rgba};
        use std::hash::{Hash, Hasher};

        let img = ImageBuffer::from_fn(width, height, |x, y| {
            let mut hasher = std::hash::DefaultHasher::new();
            (x, y, width, height).hash(&mut hasher);
            let h = hasher.finish();
            Rgba([h as u8, (h >> 8) as u8, (h >> 16) as u8, 255])
        });

        let mut buf = Cursor::new(Vec::new());
        img.write_with_encoder(PngEncoder::new(&mut buf))
            .expect("encode");
        buf.into_inner()
    }

    #[gpui::test]
    async fn test_from_image_downscales_to_default_5mb_limit(cx: &mut TestAppContext) {
        // A 4096x4096 noise image compresses poorly, so its PNG encoding
        // should comfortably exceed DEFAULT_IMAGE_MAX_BYTES going in.
        let raw_png = make_noisy_png_bytes(4096, 4096);
        assert!(
            raw_png.len() > DEFAULT_IMAGE_MAX_BYTES,
            "Test image should exceed the 5 MB limit (actual: {} bytes)",
            raw_png.len()
        );

        let image = Arc::new(gpui::Image::from_bytes(ImageFormat::Png, raw_png));
        let lm_image = cx
            .update(|cx| LanguageModelImage::from_image(Arc::clone(&image), cx))
            .await
            .expect("from_image should succeed");

        // The cap applies to the encoded PNG bytes (pre-base64), so decode the
        // payload before measuring it.
        let decoded_png = base64_to_png_bytes(lm_image.source.as_ref());
        assert!(
            decoded_png.len() <= DEFAULT_IMAGE_MAX_BYTES,
            "Encoded PNG should be ≤ {} bytes after downscale, but was {} bytes",
            DEFAULT_IMAGE_MAX_BYTES,
            decoded_png.len()
        );

        // Fitting under the cap must have required at least one shrink pass.
        let (w, h) = png_dimensions(&decoded_png);
        assert!(
            w < 4096 && h < 4096,
            "Dimensions should have shrunk: got {}×{}",
            w,
            h
        );

        // The reported ImageSize must describe the *final* (downscaled) image,
        // not the original input dimensions.
        let size = lm_image.size.expect("ImageSize should be present");
        assert_eq!(
            size.width, w as i32,
            "ImageSize.width should match the encoded PNG width after downscaling"
        );
        assert_eq!(
            size.height, h as i32,
            "ImageSize.height should match the encoded PNG height after downscaling"
        );
    }
}