agent: Allow LanguageModelImage size to be optional (#44956)

Xiaobo Liu created

Release Notes:

- Made the `LanguageModelImage` size optional

Signed-off-by: Xiaobo Liu <cppcoffee@gmail.com>

Change summary

crates/agent/src/thread.rs                     |  3 -
crates/language_model/src/request.rs           | 36 ++++++++++++-------
crates/language_models/src/provider/mistral.rs |  2 
3 files changed, 24 insertions(+), 17 deletions(-)

Detailed changes

crates/agent/src/thread.rs 🔗

@@ -2662,7 +2662,6 @@ impl From<UserMessageContent> for acp::ContentBlock {
 fn convert_image(image_content: acp::ImageContent) -> LanguageModelImage {
     LanguageModelImage {
         source: image_content.data.into(),
-        // TODO: make this optional?
-        size: gpui::Size::new(0.into(), 0.into()),
+        size: None,
     }
 }

crates/language_model/src/request.rs 🔗

@@ -19,7 +19,8 @@ use crate::{LanguageModelToolUse, LanguageModelToolUseId};
 pub struct LanguageModelImage {
     /// A base64-encoded PNG image.
     pub source: SharedString,
-    pub size: Size<DevicePixels>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub size: Option<Size<DevicePixels>>,
 }
 
 impl LanguageModelImage {
@@ -61,7 +62,7 @@ impl LanguageModelImage {
         }
 
         Some(Self {
-            size: size(DevicePixels(width?), DevicePixels(height?)),
+            size: Some(size(DevicePixels(width?), DevicePixels(height?))),
             source: SharedString::from(source.to_string()),
         })
     }
@@ -83,7 +84,7 @@ impl LanguageModelImage {
     pub fn empty() -> Self {
         Self {
             source: "".into(),
-            size: size(DevicePixels(0), DevicePixels(0)),
+            size: None,
         }
     }
 
@@ -139,15 +140,18 @@ impl LanguageModelImage {
             let source = unsafe { String::from_utf8_unchecked(base64_image) };
 
             Some(LanguageModelImage {
-                size: image_size,
+                size: Some(image_size),
                 source: source.into(),
             })
         })
     }
 
     pub fn estimate_tokens(&self) -> usize {
-        let width = self.size.width.0.unsigned_abs() as usize;
-        let height = self.size.height.0.unsigned_abs() as usize;
+        let Some(size) = self.size.as_ref() else {
+            return 0;
+        };
+        let width = size.width.0.unsigned_abs() as usize;
+        let height = size.height.0.unsigned_abs() as usize;
 
         // From: https://docs.anthropic.com/en/docs/build-with-claude/vision#calculate-image-costs
         // Note that are a lot of conditions on Anthropic's API, and OpenAI doesn't use this,
@@ -463,8 +467,9 @@ mod tests {
         match result {
             LanguageModelToolResultContent::Image(image) => {
                 assert_eq!(image.source.as_ref(), "base64encodedimagedata");
-                assert_eq!(image.size.width.0, 100);
-                assert_eq!(image.size.height.0, 200);
+                let size = image.size.expect("size");
+                assert_eq!(size.width.0, 100);
+                assert_eq!(size.height.0, 200);
             }
             _ => panic!("Expected Image variant"),
         }
@@ -483,8 +488,9 @@ mod tests {
         match result {
             LanguageModelToolResultContent::Image(image) => {
                 assert_eq!(image.source.as_ref(), "wrappedimagedata");
-                assert_eq!(image.size.width.0, 50);
-                assert_eq!(image.size.height.0, 75);
+                let size = image.size.expect("size");
+                assert_eq!(size.width.0, 50);
+                assert_eq!(size.height.0, 75);
             }
             _ => panic!("Expected Image variant"),
         }
@@ -503,8 +509,9 @@ mod tests {
         match result {
             LanguageModelToolResultContent::Image(image) => {
                 assert_eq!(image.source.as_ref(), "caseinsensitive");
-                assert_eq!(image.size.width.0, 30);
-                assert_eq!(image.size.height.0, 40);
+                let size = image.size.expect("size");
+                assert_eq!(size.width.0, 30);
+                assert_eq!(size.height.0, 40);
             }
             _ => panic!("Expected Image variant"),
         }
@@ -541,8 +548,9 @@ mod tests {
         match result {
             LanguageModelToolResultContent::Image(image) => {
                 assert_eq!(image.source.as_ref(), "directimage");
-                assert_eq!(image.size.width.0, 200);
-                assert_eq!(image.size.height.0, 300);
+                let size = image.size.expect("size");
+                assert_eq!(size.width.0, 200);
+                assert_eq!(size.height.0, 300);
             }
             _ => panic!("Expected Image variant"),
         }

crates/language_models/src/provider/mistral.rs 🔗

@@ -927,7 +927,7 @@ mod tests {
                     MessageContent::Text("What's in this image?".into()),
                     MessageContent::Image(LanguageModelImage {
                         source: "base64data".into(),
-                        size: Default::default(),
+                        size: None,
                     }),
                 ],
                 cache: false,