request.rs

  1use std::io::{Cursor, Write};
  2use std::sync::Arc;
  3
  4use crate::role::Role;
  5use crate::{LanguageModelToolUse, LanguageModelToolUseId};
  6use base64::write::EncoderWriter;
  7use gpui::{
  8    App, AppContext as _, DevicePixels, Image, ObjectFit, RenderImage, SharedString, Size, Task,
  9    point, px, size,
 10};
 11use image::{DynamicImage, ImageDecoder, codecs::png::PngEncoder, imageops::resize};
 12use serde::{Deserialize, Serialize};
 13use util::ResultExt;
 14
/// An image prepared for submission to a language model.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize, Hash)]
pub struct LanguageModelImage {
    /// A base64-encoded PNG image.
    pub source: SharedString,
    /// Dimensions of the original image in device pixels (recorded before
    /// any scale-down performed during encoding).
    size: Size<DevicePixels>,
}
 21
 22impl std::fmt::Debug for LanguageModelImage {
 23    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 24        f.debug_struct("LanguageModelImage")
 25            .field("source", &format!("<{} bytes>", self.source.len()))
 26            .field("size", &self.size)
 27            .finish()
 28    }
 29}
 30
/// Anthropic wants uploaded images to be smaller than this (in pixels) in
/// both dimensions; larger images are scaled down before upload.
// NOTE(review): the name has a typo (LIMT -> LIMIT); renaming would touch
// every call site, so it is left as-is here.
const ANTHROPIC_SIZE_LIMT: f32 = 1568.;
 33
 34impl LanguageModelImage {
 35    pub fn from_image(data: Image, cx: &mut App) -> Task<Option<Self>> {
 36        cx.background_spawn(async move {
 37            match data.format() {
 38                gpui::ImageFormat::Png
 39                | gpui::ImageFormat::Jpeg
 40                | gpui::ImageFormat::Webp
 41                | gpui::ImageFormat::Gif => {}
 42                _ => return None,
 43            };
 44
 45            let image = image::codecs::png::PngDecoder::new(Cursor::new(data.bytes())).log_err()?;
 46            let (width, height) = image.dimensions();
 47            let image_size = size(DevicePixels(width as i32), DevicePixels(height as i32));
 48
 49            let mut base64_image = Vec::new();
 50
 51            {
 52                let mut base64_encoder = EncoderWriter::new(
 53                    Cursor::new(&mut base64_image),
 54                    &base64::engine::general_purpose::STANDARD,
 55                );
 56
 57                if image_size.width.0 > ANTHROPIC_SIZE_LIMT as i32
 58                    || image_size.height.0 > ANTHROPIC_SIZE_LIMT as i32
 59                {
 60                    let new_bounds = ObjectFit::ScaleDown.get_bounds(
 61                        gpui::Bounds {
 62                            origin: point(px(0.0), px(0.0)),
 63                            size: size(px(ANTHROPIC_SIZE_LIMT), px(ANTHROPIC_SIZE_LIMT)),
 64                        },
 65                        image_size,
 66                    );
 67                    let image = DynamicImage::from_decoder(image).log_err()?.resize(
 68                        new_bounds.size.width.0 as u32,
 69                        new_bounds.size.height.0 as u32,
 70                        image::imageops::FilterType::Triangle,
 71                    );
 72
 73                    let mut png = Vec::new();
 74                    image
 75                        .write_with_encoder(PngEncoder::new(&mut png))
 76                        .log_err()?;
 77
 78                    base64_encoder.write_all(png.as_slice()).log_err()?;
 79                } else {
 80                    base64_encoder.write_all(data.bytes()).log_err()?;
 81                }
 82            }
 83
 84            // SAFETY: The base64 encoder should not produce non-UTF8.
 85            let source = unsafe { String::from_utf8_unchecked(base64_image) };
 86
 87            Some(LanguageModelImage {
 88                size: image_size,
 89                source: source.into(),
 90            })
 91        })
 92    }
 93
 94    /// Resolves image into an LLM-ready format (base64).
 95    pub fn from_render_image(data: &RenderImage) -> Option<Self> {
 96        let image_size = data.size(0);
 97
 98        let mut bytes = data.as_bytes(0).unwrap_or(&[]).to_vec();
 99        // Convert from BGRA to RGBA.
100        for pixel in bytes.chunks_exact_mut(4) {
101            pixel.swap(2, 0);
102        }
103        let mut image = image::RgbaImage::from_vec(
104            image_size.width.0 as u32,
105            image_size.height.0 as u32,
106            bytes,
107        )
108        .expect("We already know this works");
109
110        // https://docs.anthropic.com/en/docs/build-with-claude/vision
111        if image_size.width.0 > ANTHROPIC_SIZE_LIMT as i32
112            || image_size.height.0 > ANTHROPIC_SIZE_LIMT as i32
113        {
114            let new_bounds = ObjectFit::ScaleDown.get_bounds(
115                gpui::Bounds {
116                    origin: point(px(0.0), px(0.0)),
117                    size: size(px(ANTHROPIC_SIZE_LIMT), px(ANTHROPIC_SIZE_LIMT)),
118                },
119                image_size,
120            );
121
122            image = resize(
123                &image,
124                new_bounds.size.width.0 as u32,
125                new_bounds.size.height.0 as u32,
126                image::imageops::FilterType::Triangle,
127            );
128        }
129
130        let mut png = Vec::new();
131
132        image
133            .write_with_encoder(PngEncoder::new(&mut png))
134            .log_err()?;
135
136        let mut base64_image = Vec::new();
137
138        {
139            let mut base64_encoder = EncoderWriter::new(
140                Cursor::new(&mut base64_image),
141                &base64::engine::general_purpose::STANDARD,
142            );
143
144            base64_encoder.write_all(png.as_slice()).log_err()?;
145        }
146
147        // SAFETY: The base64 encoder should not produce non-UTF8.
148        let source = unsafe { String::from_utf8_unchecked(base64_image) };
149
150        Some(LanguageModelImage {
151            size: image_size,
152            source: source.into(),
153        })
154    }
155
156    pub fn estimate_tokens(&self) -> usize {
157        let width = self.size.width.0.unsigned_abs() as usize;
158        let height = self.size.height.0.unsigned_abs() as usize;
159
160        // From: https://docs.anthropic.com/en/docs/build-with-claude/vision#calculate-image-costs
161        // Note that are a lot of conditions on Anthropic's API, and OpenAI doesn't use this,
162        // so this method is more of a rough guess.
163        (width * height) / 750
164    }
165}
166
/// The outcome of a tool invocation, reported back to the model.
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
pub struct LanguageModelToolResult {
    /// Identifier of the tool-use request this result answers.
    pub tool_use_id: LanguageModelToolUseId,
    /// Name of the tool that was invoked.
    pub tool_name: Arc<str>,
    /// Whether the tool invocation resulted in an error.
    pub is_error: bool,
    /// The tool's textual output.
    pub content: Arc<str>,
}
174
/// A single piece of content within a request message.
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
pub enum MessageContent {
    /// Plain text.
    Text(String),
    /// An image encoded for the model.
    Image(LanguageModelImage),
    /// A tool invocation requested by the model.
    ToolUse(LanguageModelToolUse),
    /// The result of a completed tool invocation.
    ToolResult(LanguageModelToolResult),
}
182
183impl From<String> for MessageContent {
184    fn from(value: String) -> Self {
185        MessageContent::Text(value)
186    }
187}
188
189impl From<&str> for MessageContent {
190    fn from(value: &str) -> Self {
191        MessageContent::Text(value.to_string())
192    }
193}
194
/// A single message in a model request: a role plus ordered content parts.
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Hash)]
pub struct LanguageModelRequestMessage {
    /// The author role of this message.
    pub role: Role,
    /// Ordered content parts making up the message body.
    pub content: Vec<MessageContent>,
    // NOTE(review): presumably marks this message for provider-side prompt
    // caching — confirm against the providers that consume this flag.
    pub cache: bool,
}
201
202impl LanguageModelRequestMessage {
203    pub fn string_contents(&self) -> String {
204        let mut buffer = String::new();
205        for string in self.content.iter().filter_map(|content| match content {
206            MessageContent::Text(text) => Some(text.as_str()),
207            MessageContent::ToolResult(tool_result) => Some(tool_result.content.as_ref()),
208            MessageContent::ToolUse(_) | MessageContent::Image(_) => None,
209        }) {
210            buffer.push_str(string);
211        }
212
213        buffer
214    }
215
216    pub fn contents_empty(&self) -> bool {
217        self.content.is_empty()
218            || self
219                .content
220                .first()
221                .map(|content| match content {
222                    MessageContent::Text(text) => text.chars().all(|c| c.is_whitespace()),
223                    MessageContent::ToolResult(tool_result) => {
224                        tool_result.content.chars().all(|c| c.is_whitespace())
225                    }
226                    MessageContent::ToolUse(_) | MessageContent::Image(_) => true,
227                })
228                .unwrap_or(false)
229    }
230}
231
/// A tool made available to the model for the duration of a request.
#[derive(Debug, PartialEq, Hash, Clone, Serialize, Deserialize)]
pub struct LanguageModelRequestTool {
    /// Tool name as exposed to the model.
    pub name: String,
    /// Description of what the tool does, shown to the model.
    pub description: String,
    /// JSON value describing the tool's expected input (a schema).
    pub input_schema: serde_json::Value,
}
238
/// A complete request to a language model.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
pub struct LanguageModelRequest {
    /// Conversation history, oldest first.
    pub messages: Vec<LanguageModelRequestMessage>,
    /// Tools the model may invoke during this request.
    pub tools: Vec<LanguageModelRequestTool>,
    /// Stop sequences that end generation when produced.
    pub stop: Vec<String>,
    /// Optional sampling temperature.
    pub temperature: Option<f32>,
}
246
/// A message returned by a language model.
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
pub struct LanguageModelResponseMessage {
    /// Role of the responder, when the provider reports one.
    pub role: Option<Role>,
    /// Textual content of the response, when present.
    pub content: Option<String>,
}