use std::io::{Cursor, Write};
use std::sync::Arc;

use crate::role::Role;
use crate::{LanguageModelToolUse, LanguageModelToolUseId};
use base64::write::EncoderWriter;
use gpui::{
    App, AppContext as _, DevicePixels, Image, ObjectFit, RenderImage, SharedString, Size, Task,
    point, px, size,
};
use image::{GenericImageView, codecs::png::PngEncoder, imageops::resize};
use serde::{Deserialize, Serialize};
use util::ResultExt;

#[derive(Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
pub struct LanguageModelImage {
    /// A base64-encoded image. Oversized images are downscaled and re-encoded
    /// as PNG; images already within Anthropic's size limit keep their
    /// original encoding.
    pub source: SharedString,
    size: Size<DevicePixels>,
}

impl std::fmt::Debug for LanguageModelImage {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("LanguageModelImage")
            .field("source", &format!("<{} bytes>", self.source.len()))
            .field("size", &self.size)
            .finish()
    }
}
/// Anthropic wants uploaded images to be no larger than this in either dimension.
const ANTHROPIC_SIZE_LIMIT: f32 = 1568.;

impl LanguageModelImage {
    pub fn from_image(data: Image, cx: &mut App) -> Task<Option<Self>> {
        cx.background_spawn(async move {
            // Decode in the image's actual format; anything else is unsupported.
            let format = match data.format() {
                gpui::ImageFormat::Png => image::ImageFormat::Png,
                gpui::ImageFormat::Jpeg => image::ImageFormat::Jpeg,
                gpui::ImageFormat::Webp => image::ImageFormat::WebP,
                gpui::ImageFormat::Gif => image::ImageFormat::Gif,
                _ => return None,
            };

            let image = image::load_from_memory_with_format(data.bytes(), format).log_err()?;
            let (width, height) = image.dimensions();
            let image_size = size(DevicePixels(width as i32), DevicePixels(height as i32));

            let mut base64_image = Vec::new();

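            // Scope the encoder: dropping it flushes the final base64 padding
            // before `base64_image` is read below.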
            {
                let mut base64_encoder = EncoderWriter::new(
                    Cursor::new(&mut base64_image),
                    &base64::engine::general_purpose::STANDARD,
                );

                if image_size.width.0 > ANTHROPIC_SIZE_LIMIT as i32
                    || image_size.height.0 > ANTHROPIC_SIZE_LIMIT as i32
                {
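                    // ScaleDown yields the largest size that fits inside the
                    // limit box while preserving aspect ratio (never upscaling).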
                    let new_bounds = ObjectFit::ScaleDown.get_bounds(
                        gpui::Bounds {
                            origin: point(px(0.0), px(0.0)),
                            size: size(px(ANTHROPIC_SIZE_LIMIT), px(ANTHROPIC_SIZE_LIMIT)),
                        },
                        image_size,
                    );
                    let image = image.resize(
                        new_bounds.size.width.0 as u32,
                        new_bounds.size.height.0 as u32,
                        image::imageops::FilterType::Triangle,
                    );

                    let mut png = Vec::new();
                    image
                        .write_with_encoder(PngEncoder::new(&mut png))
                        .log_err()?;

                    base64_encoder.write_all(png.as_slice()).log_err()?;
                } else {
                    base64_encoder.write_all(data.bytes()).log_err()?;
                }
            }

            // SAFETY: base64 output is pure ASCII, which is always valid UTF-8.
            let source = unsafe { String::from_utf8_unchecked(base64_image) };

            Some(LanguageModelImage {
                size: image_size,
                source: source.into(),
            })
        })
    }

    /// Converts a [`RenderImage`] into an LLM-ready base64-encoded PNG.
    pub fn from_render_image(data: &RenderImage) -> Option<Self> {
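        // Frame 0 is used throughout; a RenderImage can hold multiple frames
        // (e.g. for an animated GIF).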
        let image_size = data.size(0);

        let mut bytes = data.as_bytes(0).unwrap_or(&[]).to_vec();
        // Convert from BGRA to RGBA by swapping the blue and red channels of
        // each 4-byte pixel.
        for pixel in bytes.chunks_exact_mut(4) {
            pixel.swap(2, 0);
        }
        let mut image = image::RgbaImage::from_vec(
            image_size.width.0 as u32,
            image_size.height.0 as u32,
            bytes,
        )?;

        // https://docs.anthropic.com/en/docs/build-with-claude/vision
        if image_size.width.0 > ANTHROPIC_SIZE_LIMIT as i32
            || image_size.height.0 > ANTHROPIC_SIZE_LIMIT as i32
        {
            let new_bounds = ObjectFit::ScaleDown.get_bounds(
                gpui::Bounds {
                    origin: point(px(0.0), px(0.0)),
                    size: size(px(ANTHROPIC_SIZE_LIMIT), px(ANTHROPIC_SIZE_LIMIT)),
                },
                image_size,
            );

            image = resize(
                &image,
                new_bounds.size.width.0 as u32,
                new_bounds.size.height.0 as u32,
                image::imageops::FilterType::Triangle,
            );
        }

        let mut png = Vec::new();

        image
            .write_with_encoder(PngEncoder::new(&mut png))
            .log_err()?;

        let mut base64_image = Vec::new();

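        // As in `from_image`: drop the encoder at the end of this scope to
        // flush the final base64 padding.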
        {
            let mut base64_encoder = EncoderWriter::new(
                Cursor::new(&mut base64_image),
                &base64::engine::general_purpose::STANDARD,
            );

            base64_encoder.write_all(png.as_slice()).log_err()?;
        }

        // SAFETY: base64 output is pure ASCII, which is always valid UTF-8.
        let source = unsafe { String::from_utf8_unchecked(base64_image) };

        Some(LanguageModelImage {
            size: image_size,
            source: source.into(),
        })
    }

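    /// Rough token estimate for this image, using Anthropic's pixels / 750 rule.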
    pub fn estimate_tokens(&self) -> usize {
        let width = self.size.width.0.unsigned_abs() as usize;
        let height = self.size.height.0.unsigned_abs() as usize;

        // From: https://docs.anthropic.com/en/docs/build-with-claude/vision#calculate-image-costs
        // Note that there are a lot of conditions on Anthropic's API, and
        // OpenAI doesn't use this, so this method is more of a rough guess.
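        // E.g. a 1092 x 1092 px image is (1092 * 1092) / 750 ≈ 1590 tokens.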
        (width * height) / 750
    }
}

#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
pub struct LanguageModelToolResult {
    pub tool_use_id: LanguageModelToolUseId,
    pub tool_name: Arc<str>,
    pub is_error: bool,
    pub content: Arc<str>,
}

#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
pub enum MessageContent {
    Text(String),
    Thinking {
        text: String,
        signature: Option<String>,
    },
    RedactedThinking(Vec<u8>),
    Image(LanguageModelImage),
    ToolUse(LanguageModelToolUse),
    ToolResult(LanguageModelToolResult),
}

impl From<String> for MessageContent {
    fn from(value: String) -> Self {
        MessageContent::Text(value)
    }
}

impl From<&str> for MessageContent {
    fn from(value: &str) -> Self {
        MessageContent::Text(value.to_string())
    }
}

#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Hash)]
pub struct LanguageModelRequestMessage {
    pub role: Role,
    pub content: Vec<MessageContent>,
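    /// Whether this message should act as a cache breakpoint for providers
    /// that support prompt caching (e.g. Anthropic).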
    pub cache: bool,
}

impl LanguageModelRequestMessage {
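    /// Concatenates the textual parts of the message (text, thinking, and
    /// tool results), skipping images, tool uses, and redacted thinking.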
    pub fn string_contents(&self) -> String {
        let mut buffer = String::new();
        for string in self.content.iter().filter_map(|content| match content {
            MessageContent::Text(text) => Some(text.as_str()),
            MessageContent::Thinking { text, .. } => Some(text.as_str()),
            MessageContent::RedactedThinking(_) => None,
            MessageContent::ToolResult(tool_result) => Some(tool_result.content.as_ref()),
            MessageContent::ToolUse(_) | MessageContent::Image(_) => None,
        }) {
            buffer.push_str(string);
        }

        buffer
    }

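    /// Returns true when the message has no content, or when its first content
    /// item has whitespace-only text, thinking, or tool-result content. Only
    /// the first item is inspected; redacted thinking, tool uses, and images
    /// are treated as empty.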
    pub fn contents_empty(&self) -> bool {
        self.content.is_empty()
            || self
                .content
                .first()
                .map(|content| match content {
                    MessageContent::Text(text) => text.chars().all(|c| c.is_whitespace()),
                    MessageContent::Thinking { text, .. } => {
                        text.chars().all(|c| c.is_whitespace())
                    }
                    MessageContent::ToolResult(tool_result) => {
                        tool_result.content.chars().all(|c| c.is_whitespace())
                    }
                    MessageContent::RedactedThinking(_)
                    | MessageContent::ToolUse(_)
                    | MessageContent::Image(_) => true,
                })
                .unwrap_or(false)
    }
}

#[derive(Debug, PartialEq, Hash, Clone, Serialize, Deserialize)]
pub struct LanguageModelRequestTool {
    pub name: String,
    pub description: String,
    pub input_schema: serde_json::Value,
}

#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
pub struct LanguageModelRequest {
    pub thread_id: Option<String>,
    pub prompt_id: Option<String>,
    pub messages: Vec<LanguageModelRequestMessage>,
    pub tools: Vec<LanguageModelRequestTool>,
    pub stop: Vec<String>,
    pub temperature: Option<f32>,
}

#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
pub struct LanguageModelResponseMessage {
    pub role: Option<Role>,
    pub content: Option<String>,
}
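
#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch exercising `string_contents` and `contents_empty`;
    // assumes the crate's `Role::User` variant.
    #[test]
    fn string_contents_joins_textual_parts() {
        let message = LanguageModelRequestMessage {
            role: Role::User,
            content: vec![
                MessageContent::Text("Hello, ".to_string()),
                MessageContent::Text("world!".to_string()),
            ],
            cache: false,
        };
        assert_eq!(message.string_contents(), "Hello, world!");
        assert!(!message.contents_empty());

        // Whitespace-only text in the first item counts as empty.
        let blank = LanguageModelRequestMessage {
            role: Role::User,
            content: vec![MessageContent::Text("   ".into())],
            cache: false,
        };
        assert!(blank.contents_empty());
    }
}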