use std::io::{Cursor, Write};
use std::sync::Arc;

use crate::role::Role;
use crate::{LanguageModelToolUse, LanguageModelToolUseId};
use base64::write::EncoderWriter;
use gpui::{
    App, AppContext as _, DevicePixels, Image, ObjectFit, RenderImage, SharedString, Size, Task,
    point, px, size,
};
use image::{GenericImageView, codecs::png::PngEncoder, imageops::resize};
use serde::{Deserialize, Serialize};
use util::ResultExt;

#[derive(Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
pub struct LanguageModelImage {
    /// A base64-encoded PNG image.
    pub source: SharedString,
    size: Size<DevicePixels>,
}

impl std::fmt::Debug for LanguageModelImage {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("LanguageModelImage")
            .field("source", &format!("<{} bytes>", self.source.len()))
            .field("size", &self.size)
            .finish()
    }
}

/// Anthropic wants uploaded images to be at most this large in either
/// dimension, so we scale down anything bigger before uploading.
const ANTHROPIC_SIZE_LIMIT: f32 = 1568.;

impl LanguageModelImage {
    pub fn empty() -> Self {
        Self {
            source: "".into(),
            size: size(DevicePixels(0), DevicePixels(0)),
        }
    }

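    /// Converts a gpui [`Image`] into an LLM-ready, base64-encoded PNG on a
    /// background thread, returning `None` if the format is unsupported or
    /// decoding fails.
    ///
    /// A minimal call-site sketch (the `image` value and the surrounding
    /// `App` context are assumed to exist, hence `ignore`):
    ///
    /// ```ignore
    /// let task = LanguageModelImage::from_image(image, cx);
    /// cx.spawn(async move |_cx| {
    ///     if let Some(llm_image) = task.await {
    ///         println!("estimated tokens: {}", llm_image.estimate_tokens());
    ///     }
    /// })
    /// .detach();
    /// ```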
    pub fn from_image(data: Arc<Image>, cx: &mut App) -> Task<Option<Self>> {
        cx.background_spawn(async move {
            // Decode with the decoder that matches the declared format;
            // everything else is unsupported.
            let format = match data.format() {
                gpui::ImageFormat::Png => image::ImageFormat::Png,
                gpui::ImageFormat::Jpeg => image::ImageFormat::Jpeg,
                gpui::ImageFormat::Webp => image::ImageFormat::WebP,
                gpui::ImageFormat::Gif => image::ImageFormat::Gif,
                _ => return None,
            };
            let image = image::load_from_memory_with_format(data.bytes(), format).log_err()?;
            let (width, height) = image.dimensions();
            let image_size = size(DevicePixels(width as i32), DevicePixels(height as i32));

            let mut base64_image = Vec::new();

            {
                let mut base64_encoder = EncoderWriter::new(
                    Cursor::new(&mut base64_image),
                    &base64::engine::general_purpose::STANDARD,
                );

                if image_size.width.0 > ANTHROPIC_SIZE_LIMIT as i32
                    || image_size.height.0 > ANTHROPIC_SIZE_LIMIT as i32
                {
                    // Scale down, preserving aspect ratio, until both edges
                    // fit within Anthropic's limit, then re-encode as PNG.
                    let new_bounds = ObjectFit::ScaleDown.get_bounds(
                        gpui::Bounds {
                            origin: point(px(0.0), px(0.0)),
                            size: size(px(ANTHROPIC_SIZE_LIMIT), px(ANTHROPIC_SIZE_LIMIT)),
                        },
                        image_size,
                    );
                    let image = image.resize(
                        new_bounds.size.width.0 as u32,
                        new_bounds.size.height.0 as u32,
                        image::imageops::FilterType::Triangle,
                    );

                    let mut png = Vec::new();
                    image
                        .write_with_encoder(PngEncoder::new(&mut png))
                        .log_err()?;

                    base64_encoder.write_all(png.as_slice()).log_err()?;
                } else if matches!(data.format(), gpui::ImageFormat::Png) {
                    // Already a PNG that fits within the limit: pass the
                    // original bytes through untouched.
                    base64_encoder.write_all(data.bytes()).log_err()?;
                } else {
                    // Fits within the limit but is not a PNG: re-encode so
                    // `source` always holds PNG data, as documented on the
                    // struct.
                    let mut png = Vec::new();
                    image
                        .write_with_encoder(PngEncoder::new(&mut png))
                        .log_err()?;

                    base64_encoder.write_all(png.as_slice()).log_err()?;
                }
            }

            // SAFETY: base64 output is always ASCII, and ASCII is valid UTF-8.
            let source = unsafe { String::from_utf8_unchecked(base64_image) };

            Some(LanguageModelImage {
                size: image_size,
                source: source.into(),
            })
        })
    }

    /// Converts a [`RenderImage`] into an LLM-ready, base64-encoded PNG.
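    ///
    /// A minimal call-site sketch (the `render_image` value is hypothetical,
    /// and constructing a real `RenderImage` needs gpui, hence `ignore`):
    ///
    /// ```ignore
    /// if let Some(llm_image) = LanguageModelImage::from_render_image(&render_image) {
    ///     println!("{llm_image:?}");
    /// }
    /// ```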
    pub fn from_render_image(data: &RenderImage) -> Option<Self> {
        let image_size = data.size(0);

        let mut bytes = data.as_bytes(0).unwrap_or(&[]).to_vec();
        // gpui stores pixel data as BGRA; the image crate expects RGBA, so
        // swap the blue and red channels in place.
        for pixel in bytes.chunks_exact_mut(4) {
            pixel.swap(2, 0);
        }
        // Bail out (rather than panic) if the byte buffer does not match the
        // reported dimensions.
        let mut image = image::RgbaImage::from_vec(
            image_size.width.0 as u32,
            image_size.height.0 as u32,
            bytes,
        )?;

        // https://docs.anthropic.com/en/docs/build-with-claude/vision
        if image_size.width.0 > ANTHROPIC_SIZE_LIMIT as i32
            || image_size.height.0 > ANTHROPIC_SIZE_LIMIT as i32
        {
            let new_bounds = ObjectFit::ScaleDown.get_bounds(
                gpui::Bounds {
                    origin: point(px(0.0), px(0.0)),
                    size: size(px(ANTHROPIC_SIZE_LIMIT), px(ANTHROPIC_SIZE_LIMIT)),
                },
                image_size,
            );

            image = resize(
                &image,
                new_bounds.size.width.0 as u32,
                new_bounds.size.height.0 as u32,
                image::imageops::FilterType::Triangle,
            );
        }

        let mut png = Vec::new();

        image
            .write_with_encoder(PngEncoder::new(&mut png))
            .log_err()?;

        let mut base64_image = Vec::new();

        {
            let mut base64_encoder = EncoderWriter::new(
                Cursor::new(&mut base64_image),
                &base64::engine::general_purpose::STANDARD,
            );

            base64_encoder.write_all(png.as_slice()).log_err()?;
        }

        // SAFETY: base64 output is always ASCII, and ASCII is valid UTF-8.
        let source = unsafe { String::from_utf8_unchecked(base64_image) };

        Some(LanguageModelImage {
            size: image_size,
            source: source.into(),
        })
    }

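    /// Estimates this image's token cost from its dimensions using
    /// Anthropic's published formula, `tokens ≈ (width * height) / 750`.
    /// For example, a 1092 x 1092 image comes out to 1589 tokens under
    /// integer division, close to the ~1590 tokens Anthropic's docs quote
    /// for that size.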
    pub fn estimate_tokens(&self) -> usize {
        let width = self.size.width.0.unsigned_abs() as usize;
        let height = self.size.height.0.unsigned_abs() as usize;

        // From: https://docs.anthropic.com/en/docs/build-with-claude/vision#calculate-image-costs
        // Anthropic's API attaches a number of extra conditions to this
        // formula, and OpenAI prices images differently, so treat the result
        // as a rough estimate.
        (width * height) / 750
    }
}

#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
pub struct LanguageModelToolResult {
    pub tool_use_id: LanguageModelToolUseId,
    pub tool_name: Arc<str>,
    pub is_error: bool,
    pub content: Arc<str>,
}

#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
pub enum MessageContent {
    Text(String),
    Thinking {
        text: String,
        signature: Option<String>,
    },
    RedactedThinking(Vec<u8>),
    Image(LanguageModelImage),
    ToolUse(LanguageModelToolUse),
    ToolResult(LanguageModelToolResult),
}

impl From<String> for MessageContent {
    fn from(value: String) -> Self {
        MessageContent::Text(value)
    }
}

impl From<&str> for MessageContent {
    fn from(value: &str) -> Self {
        MessageContent::Text(value.to_string())
    }
}

#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Hash)]
pub struct LanguageModelRequestMessage {
    pub role: Role,
    pub content: Vec<MessageContent>,
    pub cache: bool,
}

impl LanguageModelRequestMessage {
    pub fn string_contents(&self) -> String {
        let mut buffer = String::new();
        for string in self.content.iter().filter_map(|content| match content {
            MessageContent::Text(text) => Some(text.as_str()),
            MessageContent::Thinking { text, .. } => Some(text.as_str()),
            MessageContent::RedactedThinking(_) => None,
            MessageContent::ToolResult(tool_result) => Some(tool_result.content.as_ref()),
            MessageContent::ToolUse(_) | MessageContent::Image(_) => None,
        }) {
            buffer.push_str(string);
        }

        buffer
    }

    pub fn contents_empty(&self) -> bool {
        self.content.iter().all(|content| match content {
            MessageContent::Text(text) => text.chars().all(|c| c.is_whitespace()),
            MessageContent::Thinking { text, .. } => text.chars().all(|c| c.is_whitespace()),
            MessageContent::ToolResult(tool_result) => {
                tool_result.content.chars().all(|c| c.is_whitespace())
            }
            MessageContent::RedactedThinking(_)
            | MessageContent::ToolUse(_)
            | MessageContent::Image(_) => false,
        })
    }
}

#[derive(Debug, PartialEq, Hash, Clone, Serialize, Deserialize)]
pub struct LanguageModelRequestTool {
    pub name: String,
    pub description: String,
    pub input_schema: serde_json::Value,
}

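/// A provider-agnostic chat request: conversation messages plus optional
/// tools, stop sequences, and sampling temperature.
///
/// A minimal construction sketch (all field values are illustrative):
///
/// ```ignore
/// let request = LanguageModelRequest {
///     thread_id: None,
///     prompt_id: None,
///     messages: vec![LanguageModelRequestMessage {
///         role: Role::User,
///         content: vec!["Describe this image.".into()],
///         cache: false,
///     }],
///     tools: Vec::new(),
///     stop: Vec::new(),
///     temperature: None,
/// };
/// ```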
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
pub struct LanguageModelRequest {
    pub thread_id: Option<String>,
    pub prompt_id: Option<String>,
    pub messages: Vec<LanguageModelRequestMessage>,
    pub tools: Vec<LanguageModelRequestTool>,
    pub stop: Vec<String>,
    pub temperature: Option<f32>,
}

#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
pub struct LanguageModelResponseMessage {
    pub role: Option<Role>,
    pub content: Option<String>,
}
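
// A few illustrative unit tests for the pure helpers above; they are a
// sketch covering only logic that needs no gpui context, so `from_image`
// and `from_render_image` are not exercised here.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn estimate_tokens_follows_anthropic_formula() {
        let image = LanguageModelImage {
            source: "".into(),
            size: size(DevicePixels(1092), DevicePixels(1092)),
        };
        // (1092 * 1092) / 750 = 1589 with integer division.
        assert_eq!(image.estimate_tokens(), 1589);
    }

    #[test]
    fn string_contents_concatenates_text_and_skips_non_text() {
        let message = LanguageModelRequestMessage {
            role: Role::User,
            content: vec![
                MessageContent::Text("hello ".into()),
                MessageContent::RedactedThinking(vec![0xde, 0xad]),
                MessageContent::Text("world".into()),
            ],
            cache: false,
        };
        assert_eq!(message.string_contents(), "hello world");
        assert!(!message.contents_empty());
    }

    #[test]
    fn contents_empty_treats_whitespace_text_as_empty() {
        let message = LanguageModelRequestMessage {
            role: Role::User,
            content: vec![MessageContent::Text("   \n".into())],
            cache: false,
        };
        assert!(message.contents_empty());
    }
}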