use std::io::{Cursor, Write};
use std::sync::Arc;

use crate::role::Role;
use crate::{LanguageModelToolUse, LanguageModelToolUseId};
use base64::write::EncoderWriter;
use gpui::{
    App, AppContext as _, DevicePixels, Image, ObjectFit, RenderImage, SharedString, Size, Task,
    point, px, size,
};
use image::{GenericImageView, codecs::png::PngEncoder, imageops::resize};
use serde::{Deserialize, Serialize};
use util::ResultExt;
use zed_llm_client::CompletionMode;

/// An image prepared for attachment to a language model request.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
pub struct LanguageModelImage {
    /// A base64-encoded PNG image.
    pub source: SharedString,
    size: Size<DevicePixels>,
}

impl std::fmt::Debug for LanguageModelImage {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("LanguageModelImage")
            .field("source", &format!("<{} bytes>", self.source.len()))
            .field("size", &self.size)
            .finish()
    }
}

/// Anthropic wants uploaded images to be at most this size in both dimensions.
const ANTHROPIC_SIZE_LIMIT: f32 = 1568.;

impl LanguageModelImage {
    pub fn empty() -> Self {
        Self {
            source: "".into(),
            size: size(DevicePixels(0), DevicePixels(0)),
        }
    }

    /// Converts a gpui [`Image`] into an LLM-ready base64 payload on a
    /// background thread, scaling it down if it exceeds Anthropic's size limit.
    pub fn from_image(data: Arc<Image>, cx: &mut App) -> Task<Option<Self>> {
        cx.background_spawn(async move {
            let format = match data.format() {
                gpui::ImageFormat::Png => image::ImageFormat::Png,
                gpui::ImageFormat::Jpeg => image::ImageFormat::Jpeg,
                gpui::ImageFormat::Webp => image::ImageFormat::WebP,
                gpui::ImageFormat::Gif => image::ImageFormat::Gif,
                _ => return None,
            };

            // Decode using the decoder that matches the image's source format.
            let image = image::load_from_memory_with_format(data.bytes(), format).log_err()?;
            let (width, height) = image.dimensions();
            let image_size = size(DevicePixels(width as i32), DevicePixels(height as i32));

            let mut base64_image = Vec::new();

            {
                let mut base64_encoder = EncoderWriter::new(
                    Cursor::new(&mut base64_image),
                    &base64::engine::general_purpose::STANDARD,
                );

                if image_size.width.0 > ANTHROPIC_SIZE_LIMIT as i32
                    || image_size.height.0 > ANTHROPIC_SIZE_LIMIT as i32
                {
                    let new_bounds = ObjectFit::ScaleDown.get_bounds(
                        gpui::Bounds {
                            origin: point(px(0.0), px(0.0)),
                            size: size(px(ANTHROPIC_SIZE_LIMIT), px(ANTHROPIC_SIZE_LIMIT)),
                        },
                        image_size,
                    );
                    let image = image.resize(
                        new_bounds.size.width.0 as u32,
                        new_bounds.size.height.0 as u32,
                        image::imageops::FilterType::Triangle,
                    );

                    let mut png = Vec::new();
                    image
                        .write_with_encoder(PngEncoder::new(&mut png))
                        .log_err()?;

                    base64_encoder.write_all(png.as_slice()).log_err()?;
                } else if matches!(data.format(), gpui::ImageFormat::Png) {
                    // Already a PNG within the size limit; pass the original
                    // bytes through untouched.
                    base64_encoder.write_all(data.bytes()).log_err()?;
                } else {
                    // Re-encode so that `source` always holds PNG data.
                    let mut png = Vec::new();
                    image
                        .write_with_encoder(PngEncoder::new(&mut png))
                        .log_err()?;

                    base64_encoder.write_all(png.as_slice()).log_err()?;
                }
            }

            // SAFETY: The standard base64 alphabet is pure ASCII, so the
            // encoder's output is always valid UTF-8.
            let source = unsafe { String::from_utf8_unchecked(base64_image) };

            Some(LanguageModelImage {
                size: image_size,
                source: source.into(),
            })
        })
    }

    /// Converts an in-memory [`RenderImage`] into an LLM-ready format
    /// (a base64-encoded PNG).
    pub fn from_render_image(data: &RenderImage) -> Option<Self> {
        let image_size = data.size(0);

        let mut bytes = data.as_bytes(0).unwrap_or(&[]).to_vec();
        // Convert from BGRA to RGBA.
        for pixel in bytes.chunks_exact_mut(4) {
            pixel.swap(2, 0);
        }
        // `from_vec` returns `None` when the buffer length does not match the
        // dimensions, e.g. if the frame had no bytes.
        let mut image = image::RgbaImage::from_vec(
            image_size.width.0 as u32,
            image_size.height.0 as u32,
            bytes,
        )?;

        // https://docs.anthropic.com/en/docs/build-with-claude/vision
        if image_size.width.0 > ANTHROPIC_SIZE_LIMIT as i32
            || image_size.height.0 > ANTHROPIC_SIZE_LIMIT as i32
        {
            let new_bounds = ObjectFit::ScaleDown.get_bounds(
                gpui::Bounds {
                    origin: point(px(0.0), px(0.0)),
                    size: size(px(ANTHROPIC_SIZE_LIMIT), px(ANTHROPIC_SIZE_LIMIT)),
                },
                image_size,
            );

            image = resize(
                &image,
                new_bounds.size.width.0 as u32,
                new_bounds.size.height.0 as u32,
                image::imageops::FilterType::Triangle,
            );
        }

        let mut png = Vec::new();

        image
            .write_with_encoder(PngEncoder::new(&mut png))
            .log_err()?;

        let mut base64_image = Vec::new();

        {
            let mut base64_encoder = EncoderWriter::new(
                Cursor::new(&mut base64_image),
                &base64::engine::general_purpose::STANDARD,
            );

            base64_encoder.write_all(png.as_slice()).log_err()?;
        }

        // SAFETY: The standard base64 alphabet is pure ASCII, so the
        // encoder's output is always valid UTF-8.
        let source = unsafe { String::from_utf8_unchecked(base64_image) };

        Some(LanguageModelImage {
            size: image_size,
            source: source.into(),
        })
    }

    /// Estimates how many tokens this image will consume when sent to a model.
    pub fn estimate_tokens(&self) -> usize {
        let width = self.size.width.0.unsigned_abs() as usize;
        let height = self.size.height.0.unsigned_abs() as usize;

        // From: https://docs.anthropic.com/en/docs/build-with-claude/vision#calculate-image-costs
        // Anthropic's API attaches a number of extra conditions to this formula,
        // and OpenAI does not use it at all, so treat this as a rough estimate.
        (width * height) / 750
    }
}

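/// The outcome of a single tool call, reported back to the model under the
/// `tool_use_id` it supplied.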
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
pub struct LanguageModelToolResult {
    pub tool_use_id: LanguageModelToolUseId,
    pub tool_name: Arc<str>,
    pub is_error: bool,
    pub content: Arc<str>,
}

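/// A single piece of message content; one message may interleave several of
/// these variants.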
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
pub enum MessageContent {
    Text(String),
    Thinking {
        text: String,
        signature: Option<String>,
    },
    /// Thinking content that the provider returned redacted, as opaque bytes.
    RedactedThinking(Vec<u8>),
    Image(LanguageModelImage),
    ToolUse(LanguageModelToolUse),
    ToolResult(LanguageModelToolResult),
}

impl From<String> for MessageContent {
    fn from(value: String) -> Self {
        MessageContent::Text(value)
    }
}

impl From<&str> for MessageContent {
    fn from(value: &str) -> Self {
        MessageContent::Text(value.to_string())
    }
}

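/// A single message in a model request, with a flag marking whether it may be
/// cached.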
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Hash)]
pub struct LanguageModelRequestMessage {
    pub role: Role,
    pub content: Vec<MessageContent>,
    pub cache: bool,
}

impl LanguageModelRequestMessage {
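    /// Concatenates all textual content (text, thinking, and tool results),
    /// skipping images, tool uses, and redacted thinking.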
    pub fn string_contents(&self) -> String {
        let mut buffer = String::new();
        for string in self.content.iter().filter_map(|content| match content {
            MessageContent::Text(text) => Some(text.as_str()),
            MessageContent::Thinking { text, .. } => Some(text.as_str()),
            MessageContent::RedactedThinking(_) => None,
            MessageContent::ToolResult(tool_result) => Some(tool_result.content.as_ref()),
            MessageContent::ToolUse(_) | MessageContent::Image(_) => None,
        }) {
            buffer.push_str(string);
        }

        buffer
    }

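    /// Returns true when every piece of content is effectively empty: text,
    /// thinking, and tool results count as empty when they are all whitespace,
    /// while images, tool uses, and redacted thinking always count as content.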
    pub fn contents_empty(&self) -> bool {
        self.content.iter().all(|content| match content {
            MessageContent::Text(text) => text.chars().all(|c| c.is_whitespace()),
            MessageContent::Thinking { text, .. } => text.chars().all(|c| c.is_whitespace()),
            MessageContent::ToolResult(tool_result) => {
                tool_result.content.chars().all(|c| c.is_whitespace())
            }
            MessageContent::RedactedThinking(_)
            | MessageContent::ToolUse(_)
            | MessageContent::Image(_) => false,
        })
    }
}

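/// A tool the model may call, described to it by name, a natural-language
/// description, and a JSON Schema for its input.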
#[derive(Debug, PartialEq, Hash, Clone, Serialize, Deserialize)]
pub struct LanguageModelRequestTool {
    pub name: String,
    pub description: String,
    pub input_schema: serde_json::Value,
}

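/// A complete request to a language model: the conversation so far, the tools
/// available to the model, stop sequences, and sampling parameters.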
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
pub struct LanguageModelRequest {
    pub thread_id: Option<String>,
    pub prompt_id: Option<String>,
    pub mode: Option<CompletionMode>,
    pub messages: Vec<LanguageModelRequestMessage>,
    pub tools: Vec<LanguageModelRequestTool>,
    pub stop: Vec<String>,
    pub temperature: Option<f32>,
}

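/// A message as returned by a language model provider.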
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
pub struct LanguageModelResponseMessage {
    pub role: Option<Role>,
    pub content: Option<String>,
}
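
#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch of the token heuristic: the estimate should track
    // Anthropic's published `(width * height) / 750` formula. The 1092x1092
    // size comes from Anthropic's own pricing example.
    #[test]
    fn estimate_tokens_matches_anthropic_heuristic() {
        let image = LanguageModelImage {
            source: "".into(),
            size: size(DevicePixels(1092), DevicePixels(1092)),
        };
        assert_eq!(image.estimate_tokens(), (1092 * 1092) / 750);
    }

    // Illustrates the message helpers: `string_contents` concatenates textual
    // content, and `contents_empty` treats whitespace-only text as empty.
    // (`Role::User` is assumed to be a variant of this crate's `Role` enum.)
    #[test]
    fn message_helpers() {
        let message = LanguageModelRequestMessage {
            role: Role::User,
            content: vec!["hello ".into(), MessageContent::Text("world".into())],
            cache: false,
        };
        assert_eq!(message.string_contents(), "hello world");
        assert!(!message.contents_empty());

        let blank = LanguageModelRequestMessage {
            role: Role::User,
            content: vec![MessageContent::Text("   ".into())],
            cache: false,
        };
        assert!(blank.contents_empty());
    }
}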