When images are resized to meet provider size constraints (Anthropic's
1568px limit or the 5MB encoded-PNG cap), the stored ImageSize was still
recording the original width/height rather than the final post-downscale
dimensions. This caused incorrect token estimation via estimate_tokens()
since it uses width * height / 750.
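
To illustrate the magnitude of the error, here is a minimal sketch of the
formula above; the function name and truncating integer division are
assumptions for illustration, not the actual implementation:

```rust
/// Sketch of the estimation formula described above (assumed rounding).
fn estimate_tokens(width: u32, height: u32) -> u32 {
    width * height / 750
}

fn main() {
    // A hypothetical 3136x3136 source gets downscaled to the 1568px limit.
    let stale = estimate_tokens(3136, 3136); // 13112, from the original size
    let actual = estimate_tokens(1568, 1568); // 3278, from the size actually sent
    println!("stale: {stale}, actual: {actual}"); // roughly a 4x overestimate
}
```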
Use processed_image.dimensions() after all downscale passes so that
ImageSize reflects the actual image sent to the provider.
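
For context, a minimal sketch of where the final read belongs, using the
image crate; the pass structure and function name are assumptions based on
the description above, not the actual code:

```rust
use image::{imageops::FilterType, DynamicImage, GenericImageView};

/// Sketch under assumed names: downscale passes may run for both the pixel
/// limit and the encoded-size cap, so dimensions must be read from the
/// final image rather than the input.
fn process_for_provider(image: DynamicImage) -> (DynamicImage, u32, u32) {
    let mut processed_image = image;

    // Pass 1 (assumed): cap the longest edge at Anthropic's 1568px limit.
    let (w, h) = processed_image.dimensions();
    if w.max(h) > 1568 {
        processed_image = processed_image.resize(1568, 1568, FilterType::Triangle);
    }

    // Pass 2 (assumed): downscale further until the encoded PNG fits under
    // 5MB. (Encoding elided; the point is dimensions can change again here.)

    // The fix: read the size *after* all passes, so ImageSize matches the
    // bytes actually sent to the provider.
    let (final_width, final_height) = processed_image.dimensions();
    (processed_image, final_width, final_height)
}
```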
Release Notes:
- Fixed an issue where token estimation was incorrect for threads
containing downscaled images.
@@ -118,10 +118,12 @@ impl LanguageModelImageExt for LanguageModelImage {
         // SAFETY: The base64 encoder should not produce non-UTF8.
         let source = unsafe { String::from_utf8_unchecked(base64_image) };
 
+        let (final_width, final_height) = processed_image.dimensions();
+
         Some(LanguageModelImage {
             size: Some(ImageSize {
-                width: width as i32,
-                height: height as i32,
+                width: final_width as i32,
+                height: final_height as i32,
             }),
             source: source.into(),
         })
@@ -227,5 +229,15 @@ mod tests {
             w,
             h
         );
+
+        let size = lm_image.size.expect("ImageSize should be present");
+        assert_eq!(
+            size.width, w as i32,
+            "ImageSize.width should match the encoded PNG width after downscaling"
+        );
+        assert_eq!(
+            size.height, h as i32,
+            "ImageSize.height should match the encoded PNG height after downscaling"
+        );
     }
 }