1use std::io::{Cursor, Write};
2use std::sync::Arc;
3
4use anyhow::Result;
5use base64::write::EncoderWriter;
6use gpui::{
7 App, AppContext as _, DevicePixels, Image, ImageFormat, ObjectFit, Size, Task, point, px, size,
8};
9use image::GenericImageView as _;
10use image::codecs::png::PngEncoder;
11use util::ResultExt;
12
13use language_model_core::{ImageSize, LanguageModelImage};
14
/// Anthropic wants uploaded images to be smaller than this in both dimensions.
const ANTHROPIC_SIZE_LIMIT: f32 = 1568.;

/// Default per-image hard limit (in bytes) for the encoded image payload we send upstream.
///
/// NOTE: `LanguageModelImage.source` is base64-encoded PNG bytes (without the `data:` prefix).
/// This limit is enforced on the encoded PNG bytes *before* base64 encoding.
const DEFAULT_IMAGE_MAX_BYTES: usize = 5 * 1024 * 1024;

/// Conservative cap on how many times we'll attempt to shrink/re-encode an image to fit
/// `DEFAULT_IMAGE_MAX_BYTES`.
///
/// Each pass scales both dimensions by 0.85, so 8 passes reduce the pixel count to
/// roughly 0.85^16 ≈ 7% of the original — enough headroom for most oversized inputs.
const MAX_IMAGE_DOWNSCALE_PASSES: usize = 8;
27
/// Extension trait for `LanguageModelImage` that provides GPUI-dependent functionality.
pub trait LanguageModelImageExt {
    /// The wire format the encoded payload is produced in (PNG for the
    /// provided implementation).
    const FORMAT: ImageFormat;

    /// Decode `data` on a background task, scale it down to fit provider
    /// limits, and return it as a base64-encoded `LanguageModelImage`, or
    /// `None` if decoding fails or the image cannot be shrunk enough.
    fn from_image(data: Arc<Image>, cx: &mut App) -> Task<Option<LanguageModelImage>>;
}
33
34impl LanguageModelImageExt for LanguageModelImage {
35 const FORMAT: ImageFormat = ImageFormat::Png;
36
37 fn from_image(data: Arc<Image>, cx: &mut App) -> Task<Option<LanguageModelImage>> {
38 cx.background_spawn(async move {
39 let image_bytes = Cursor::new(data.bytes());
40 let dynamic_image = match data.format() {
41 ImageFormat::Png => image::codecs::png::PngDecoder::new(image_bytes)
42 .and_then(image::DynamicImage::from_decoder),
43 ImageFormat::Jpeg => image::codecs::jpeg::JpegDecoder::new(image_bytes)
44 .and_then(image::DynamicImage::from_decoder),
45 ImageFormat::Webp => image::codecs::webp::WebPDecoder::new(image_bytes)
46 .and_then(image::DynamicImage::from_decoder),
47 ImageFormat::Gif => image::codecs::gif::GifDecoder::new(image_bytes)
48 .and_then(image::DynamicImage::from_decoder),
49 ImageFormat::Bmp => image::codecs::bmp::BmpDecoder::new(image_bytes)
50 .and_then(image::DynamicImage::from_decoder),
51 ImageFormat::Tiff => image::codecs::tiff::TiffDecoder::new(image_bytes)
52 .and_then(image::DynamicImage::from_decoder),
53 _ => return None,
54 }
55 .log_err()?;
56
57 let width = dynamic_image.width();
58 let height = dynamic_image.height();
59 let image_size = size(DevicePixels(width as i32), DevicePixels(height as i32));
60
61 // First apply any provider-specific dimension constraints we know about (Anthropic).
62 let mut processed_image = if image_size.width.0 > ANTHROPIC_SIZE_LIMIT as i32
63 || image_size.height.0 > ANTHROPIC_SIZE_LIMIT as i32
64 {
65 let new_bounds = ObjectFit::ScaleDown.get_bounds(
66 gpui::Bounds {
67 origin: point(px(0.0), px(0.0)),
68 size: size(px(ANTHROPIC_SIZE_LIMIT), px(ANTHROPIC_SIZE_LIMIT)),
69 },
70 image_size,
71 );
72 dynamic_image.resize(
73 new_bounds.size.width.into(),
74 new_bounds.size.height.into(),
75 image::imageops::FilterType::Triangle,
76 )
77 } else {
78 dynamic_image
79 };
80
81 // Then enforce a default per-image size cap on the encoded PNG bytes.
82 //
83 // We always send PNG bytes (either original PNG bytes, or re-encoded PNG) base64'd.
84 // The upstream provider limit we want to respect is effectively on the binary image
85 // payload size, so we enforce against the encoded PNG bytes before base64 encoding.
86 let mut encoded_png = encode_png_bytes(&processed_image).log_err()?;
87 for _pass in 0..MAX_IMAGE_DOWNSCALE_PASSES {
88 if encoded_png.len() <= DEFAULT_IMAGE_MAX_BYTES {
89 break;
90 }
91
92 // Scale down geometrically to converge quickly. We don't know the final PNG size
93 // as a function of pixels, so we iteratively shrink.
94 let (w, h) = processed_image.dimensions();
95 if w <= 1 || h <= 1 {
96 break;
97 }
98
99 // Shrink by ~15% each pass (0.85). This is a compromise between speed and
100 // preserving image detail.
101 let new_w = ((w as f32) * 0.85).round().max(1.0) as u32;
102 let new_h = ((h as f32) * 0.85).round().max(1.0) as u32;
103
104 processed_image =
105 processed_image.resize(new_w, new_h, image::imageops::FilterType::Triangle);
106 encoded_png = encode_png_bytes(&processed_image).log_err()?;
107 }
108
109 if encoded_png.len() > DEFAULT_IMAGE_MAX_BYTES {
110 // Still too large after multiple passes; treat as non-convertible for now.
111 // (Provider-specific handling can be introduced later.)
112 return None;
113 }
114
115 // Now base64 encode the PNG bytes.
116 let base64_image = encode_bytes_as_base64(encoded_png.as_slice()).log_err()?;
117
118 // SAFETY: The base64 encoder should not produce non-UTF8.
119 let source = unsafe { String::from_utf8_unchecked(base64_image) };
120
121 Some(LanguageModelImage {
122 size: Some(ImageSize {
123 width: width as i32,
124 height: height as i32,
125 }),
126 source: source.into(),
127 })
128 })
129 }
130}
131
132fn encode_png_bytes(image: &image::DynamicImage) -> Result<Vec<u8>> {
133 let mut png = Vec::new();
134 image.write_with_encoder(PngEncoder::new(&mut png))?;
135 Ok(png)
136}
137
138fn encode_bytes_as_base64(bytes: &[u8]) -> Result<Vec<u8>> {
139 let mut base64_image = Vec::new();
140 {
141 let mut base64_encoder = EncoderWriter::new(
142 Cursor::new(&mut base64_image),
143 &base64::engine::general_purpose::STANDARD,
144 );
145 base64_encoder.write_all(bytes)?;
146 }
147 Ok(base64_image)
148}
149
150/// Convert a core `ImageSize` to a gpui `Size<DevicePixels>`.
151pub fn image_size_to_gpui(size: ImageSize) -> Size<DevicePixels> {
152 Size {
153 width: DevicePixels(size.width),
154 height: DevicePixels(size.height),
155 }
156}
157
158/// Convert a gpui `Size<DevicePixels>` to a core `ImageSize`.
159pub fn gpui_size_to_image_size(size: Size<DevicePixels>) -> ImageSize {
160 ImageSize {
161 width: size.width.0,
162 height: size.height.0,
163 }
164}
165
#[cfg(test)]
mod tests {
    use super::*;
    use base64::Engine as _;
    use gpui::TestAppContext;

    /// Decode the base64 `source` produced by `from_image` back into raw PNG bytes.
    fn base64_to_png_bytes(base64: &str) -> Vec<u8> {
        base64::engine::general_purpose::STANDARD
            .decode(base64)
            .expect("valid base64")
    }

    /// Decode `png_bytes` and return its (width, height) in pixels.
    fn png_dimensions(png_bytes: &[u8]) -> (u32, u32) {
        let img = image::load_from_memory(png_bytes).expect("valid png");
        (img.width(), img.height())
    }

    /// Build a PNG whose pixels are pseudo-random (hash-derived) noise.
    /// Noise compresses poorly, so a large enough image reliably exceeds
    /// `DEFAULT_IMAGE_MAX_BYTES` — which is what the test below needs.
    /// Hashing (x, y, width, height) keeps the fixture deterministic.
    fn make_noisy_png_bytes(width: u32, height: u32) -> Vec<u8> {
        use image::{ImageBuffer, Rgba};
        use std::hash::{Hash, Hasher};

        let img = ImageBuffer::from_fn(width, height, |x, y| {
            let mut hasher = std::hash::DefaultHasher::new();
            (x, y, width, height).hash(&mut hasher);
            let h = hasher.finish();
            Rgba([h as u8, (h >> 8) as u8, (h >> 16) as u8, 255])
        });

        let mut buf = Cursor::new(Vec::new());
        img.write_with_encoder(PngEncoder::new(&mut buf))
            .expect("encode");
        buf.into_inner()
    }

    // End-to-end: an incompressible 4096×4096 PNG must be iteratively
    // downscaled until its encoded size fits under the 5 MB default cap,
    // and the emitted dimensions must be smaller than the original's.
    #[gpui::test]
    async fn test_from_image_downscales_to_default_5mb_limit(cx: &mut TestAppContext) {
        let raw_png = make_noisy_png_bytes(4096, 4096);
        // Sanity-check the fixture actually exercises the downscale path.
        assert!(
            raw_png.len() > DEFAULT_IMAGE_MAX_BYTES,
            "Test image should exceed the 5 MB limit (actual: {} bytes)",
            raw_png.len()
        );

        let image = Arc::new(gpui::Image::from_bytes(ImageFormat::Png, raw_png));
        let lm_image = cx
            .update(|cx| LanguageModelImage::from_image(Arc::clone(&image), cx))
            .await
            .expect("from_image should succeed");

        // The limit is enforced on the PNG bytes (pre-base64), so decode first.
        let decoded_png = base64_to_png_bytes(lm_image.source.as_ref());
        assert!(
            decoded_png.len() <= DEFAULT_IMAGE_MAX_BYTES,
            "Encoded PNG should be ≤ {} bytes after downscale, but was {} bytes",
            DEFAULT_IMAGE_MAX_BYTES,
            decoded_png.len()
        );

        let (w, h) = png_dimensions(&decoded_png);
        assert!(
            w < 4096 && h < 4096,
            "Dimensions should have shrunk: got {}×{}",
            w,
            h
        );
    }
}