1use std::io::{Cursor, Write};
2
3use crate::role::Role;
4use crate::{LanguageModelToolUse, LanguageModelToolUseId};
5use base64::write::EncoderWriter;
6use gpui::{
7 point, size, App, AppContext as _, DevicePixels, Image, ObjectFit, RenderImage, Size, Task,
8};
9use image::{codecs::png::PngEncoder, imageops::resize, DynamicImage, ImageDecoder};
10use serde::{Deserialize, Serialize};
11use ui::{px, SharedString};
12use util::ResultExt;
13
/// An image prepared for inclusion in a language model request.
///
/// Values are produced by the `from_image` and `from_render_image` constructors. `Debug` is
/// implemented by hand so that the (potentially very large) payload is not printed.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
pub struct LanguageModelImage {
    /// A base64-encoded PNG image.
    pub source: SharedString,
    /// Pixel dimensions of the image as it was handed to the constructor. This is recorded
    /// before any downscaling is applied to `source`, and is what `estimate_tokens` uses.
    size: Size<DevicePixels>,
}
20
21impl std::fmt::Debug for LanguageModelImage {
22 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
23 f.debug_struct("LanguageModelImage")
24 .field("source", &format!("<{} bytes>", self.source.len()))
25 .field("size", &self.size)
26 .finish()
27 }
28}
29
/// Anthropic wants uploaded images to be no larger than this many pixels in either dimension.
/// Images exceeding it in width or height are scaled down to fit before being encoded.
///
/// See <https://docs.anthropic.com/en/docs/build-with-claude/vision>.
// NOTE(review): the identifier is misspelled (`LIMT` instead of `LIMIT`); renaming it means
// updating every use site at the same time.
const ANTHROPIC_SIZE_LIMT: f32 = 1568.;
32
33impl LanguageModelImage {
34 pub fn from_image(data: Image, cx: &mut App) -> Task<Option<Self>> {
35 cx.background_spawn(async move {
36 match data.format() {
37 gpui::ImageFormat::Png
38 | gpui::ImageFormat::Jpeg
39 | gpui::ImageFormat::Webp
40 | gpui::ImageFormat::Gif => {}
41 _ => return None,
42 };
43
44 let image = image::codecs::png::PngDecoder::new(Cursor::new(data.bytes())).log_err()?;
45 let (width, height) = image.dimensions();
46 let image_size = size(DevicePixels(width as i32), DevicePixels(height as i32));
47
48 let mut base64_image = Vec::new();
49
50 {
51 let mut base64_encoder = EncoderWriter::new(
52 Cursor::new(&mut base64_image),
53 &base64::engine::general_purpose::STANDARD,
54 );
55
56 if image_size.width.0 > ANTHROPIC_SIZE_LIMT as i32
57 || image_size.height.0 > ANTHROPIC_SIZE_LIMT as i32
58 {
59 let new_bounds = ObjectFit::ScaleDown.get_bounds(
60 gpui::Bounds {
61 origin: point(px(0.0), px(0.0)),
62 size: size(px(ANTHROPIC_SIZE_LIMT), px(ANTHROPIC_SIZE_LIMT)),
63 },
64 image_size,
65 );
66 let image = DynamicImage::from_decoder(image).log_err()?.resize(
67 new_bounds.size.width.0 as u32,
68 new_bounds.size.height.0 as u32,
69 image::imageops::FilterType::Triangle,
70 );
71
72 let mut png = Vec::new();
73 image
74 .write_with_encoder(PngEncoder::new(&mut png))
75 .log_err()?;
76
77 base64_encoder.write_all(png.as_slice()).log_err()?;
78 } else {
79 base64_encoder.write_all(data.bytes()).log_err()?;
80 }
81 }
82
83 // SAFETY: The base64 encoder should not produce non-UTF8.
84 let source = unsafe { String::from_utf8_unchecked(base64_image) };
85
86 Some(LanguageModelImage {
87 size: image_size,
88 source: source.into(),
89 })
90 })
91 }
92
93 /// Resolves image into an LLM-ready format (base64).
94 pub fn from_render_image(data: &RenderImage) -> Option<Self> {
95 let image_size = data.size(0);
96
97 let mut bytes = data.as_bytes(0).unwrap_or(&[]).to_vec();
98 // Convert from BGRA to RGBA.
99 for pixel in bytes.chunks_exact_mut(4) {
100 pixel.swap(2, 0);
101 }
102 let mut image = image::RgbaImage::from_vec(
103 image_size.width.0 as u32,
104 image_size.height.0 as u32,
105 bytes,
106 )
107 .expect("We already know this works");
108
109 // https://docs.anthropic.com/en/docs/build-with-claude/vision
110 if image_size.width.0 > ANTHROPIC_SIZE_LIMT as i32
111 || image_size.height.0 > ANTHROPIC_SIZE_LIMT as i32
112 {
113 let new_bounds = ObjectFit::ScaleDown.get_bounds(
114 gpui::Bounds {
115 origin: point(px(0.0), px(0.0)),
116 size: size(px(ANTHROPIC_SIZE_LIMT), px(ANTHROPIC_SIZE_LIMT)),
117 },
118 image_size,
119 );
120
121 image = resize(
122 &image,
123 new_bounds.size.width.0 as u32,
124 new_bounds.size.height.0 as u32,
125 image::imageops::FilterType::Triangle,
126 );
127 }
128
129 let mut png = Vec::new();
130
131 image
132 .write_with_encoder(PngEncoder::new(&mut png))
133 .log_err()?;
134
135 let mut base64_image = Vec::new();
136
137 {
138 let mut base64_encoder = EncoderWriter::new(
139 Cursor::new(&mut base64_image),
140 &base64::engine::general_purpose::STANDARD,
141 );
142
143 base64_encoder.write_all(png.as_slice()).log_err()?;
144 }
145
146 // SAFETY: The base64 encoder should not produce non-UTF8.
147 let source = unsafe { String::from_utf8_unchecked(base64_image) };
148
149 Some(LanguageModelImage {
150 size: image_size,
151 source: source.into(),
152 })
153 }
154
155 pub fn estimate_tokens(&self) -> usize {
156 let width = self.size.width.0.unsigned_abs() as usize;
157 let height = self.size.height.0.unsigned_abs() as usize;
158
159 // From: https://docs.anthropic.com/en/docs/build-with-claude/vision#calculate-image-costs
160 // Note that are a lot of conditions on Anthropic's API, and OpenAI doesn't use this,
161 // so this method is more of a rough guess.
162 (width * height) / 750
163 }
164}
165
/// The result of a tool invocation, carried as one part of a message's content
/// (see `MessageContent::ToolResult`).
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
pub struct LanguageModelToolResult {
    /// Identifier tying this result to a tool use.
    // NOTE(review): presumably the id of the `LanguageModelToolUse` being answered — confirm.
    pub tool_use_id: LanguageModelToolUseId,
    /// Flag marking this result as an error.
    // NOTE(review): presumably set when the tool invocation failed — confirm against callers.
    pub is_error: bool,
    /// Textual payload of the result; included by
    /// `LanguageModelRequestMessage::string_contents`.
    pub content: String,
}
172
/// One part of a request message's content.
///
/// `From<String>` and `From<&str>` are implemented and produce the `Text` variant.
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
pub enum MessageContent {
    /// Plain text.
    Text(String),
    /// An image, stored as a base64-encoded PNG.
    Image(LanguageModelImage),
    /// A tool use.
    // NOTE(review): presumably a tool call requested by the model — confirm.
    ToolUse(LanguageModelToolUse),
    /// The result of a tool invocation.
    ToolResult(LanguageModelToolResult),
}
180
181impl From<String> for MessageContent {
182 fn from(value: String) -> Self {
183 MessageContent::Text(value)
184 }
185}
186
187impl From<&str> for MessageContent {
188 fn from(value: &str) -> Self {
189 MessageContent::Text(value.to_string())
190 }
191}
192
/// A single message within a `LanguageModelRequest`.
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Hash)]
pub struct LanguageModelRequestMessage {
    /// The role this message is attributed to.
    pub role: Role,
    /// The ordered parts (text, images, tool uses, tool results) that make up the message.
    pub content: Vec<MessageContent>,
    /// Caching flag for this message.
    // NOTE(review): presumably asks the provider to cache the prompt up to this message —
    // confirm against the provider integrations.
    pub cache: bool,
}
199
200impl LanguageModelRequestMessage {
201 pub fn string_contents(&self) -> String {
202 let mut string_buffer = String::new();
203 for string in self.content.iter().filter_map(|content| match content {
204 MessageContent::Text(text) => Some(text),
205 MessageContent::ToolResult(tool_result) => Some(&tool_result.content),
206 MessageContent::ToolUse(_) | MessageContent::Image(_) => None,
207 }) {
208 string_buffer.push_str(string.as_str())
209 }
210 string_buffer
211 }
212
213 pub fn contents_empty(&self) -> bool {
214 self.content.is_empty()
215 || self
216 .content
217 .first()
218 .map(|content| match content {
219 MessageContent::Text(text) => text.chars().all(|c| c.is_whitespace()),
220 MessageContent::ToolResult(tool_result) => {
221 tool_result.content.chars().all(|c| c.is_whitespace())
222 }
223 MessageContent::ToolUse(_) | MessageContent::Image(_) => true,
224 })
225 .unwrap_or(false)
226 }
227}
228
/// Describes a tool that is offered to the model as part of a `LanguageModelRequest`.
#[derive(Debug, PartialEq, Hash, Clone, Serialize, Deserialize)]
pub struct LanguageModelRequestTool {
    /// The tool's name.
    pub name: String,
    /// A description of the tool.
    pub description: String,
    /// Schema of the tool's input, as an arbitrary JSON value.
    // NOTE(review): presumably a JSON Schema document — confirm against the providers.
    pub input_schema: serde_json::Value,
}
235
/// A request to a language model: the conversation plus the tools and settings that apply.
///
/// `Default` yields a request with no messages, tools or stop strings and no temperature.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
pub struct LanguageModelRequest {
    /// The messages of the conversation, in order.
    pub messages: Vec<LanguageModelRequestMessage>,
    /// The tools made available for this request.
    pub tools: Vec<LanguageModelRequestTool>,
    /// Stop strings for the request.
    // NOTE(review): presumably sequences at which generation should halt — confirm.
    pub stop: Vec<String>,
    /// Optional temperature setting.
    // NOTE(review): presumably `None` leaves the choice to the provider's default — confirm.
    pub temperature: Option<f32>,
}
243
/// A message received from a language model; both fields are optional.
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
pub struct LanguageModelResponseMessage {
    /// The role attributed to the message, if any.
    pub role: Option<Role>,
    /// The text content of the message, if any.
    pub content: Option<String>,
}