Allow `StreamingEditFileTool` to also create files (#29785)

Created by Antonio Scandurra, Ben Brandt, and Oleksiy Syvokon

Refs #29733 

This pull request introduces a new field on `StreamingEditFileTool`'s input that
lets the model create or overwrite a file in a streaming way. When either the
`assistant.stream_edits` setting or the `agent-stream-edits` feature flag is
enabled, we disable `CreateFileTool` so that the agent model can only use
`StreamingEditFileTool` for file creation.
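
For reference, the tool input now carries a `create_or_overwrite` flag next to the existing fields. The sketch below is reconstructed from how the evals construct `StreamingEditFileToolInput`; the field types, derives, and doc comments are assumptions, not the exact code in this PR.

```rust
use std::path::PathBuf;

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};

// Sketch only: field names come from the eval fixtures below; derives are assumed.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
pub struct StreamingEditFileToolInput {
    /// A short, user-facing description of the edit.
    pub display_description: String,
    /// Project-relative path of the file to edit or create.
    pub path: PathBuf,
    /// When true, the model's streamed output replaces the file's contents
    /// (creating the file if it doesn't exist) instead of being applied as
    /// old_text/new_text edits.
    pub create_or_overwrite: bool,
}
```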

Release Notes:

- N/A

---------

Co-authored-by: Ben Brandt <benjamin.j.brandt@gmail.com>
Co-authored-by: Oleksiy Syvokon <oleksiy.syvokon@gmail.com>

Change summary

crates/assistant_tools/src/assistant_tools.rs                           |    3 
crates/assistant_tools/src/edit_agent.rs                                |  139 
crates/assistant_tools/src/edit_agent/evals.rs                          |  379 
crates/assistant_tools/src/edit_agent/evals/fixtures/zode/prompt.md     | 1912 
crates/assistant_tools/src/edit_agent/evals/fixtures/zode/react.py      |   14 
crates/assistant_tools/src/edit_agent/evals/fixtures/zode/react_test.py |  271 
crates/assistant_tools/src/streaming_edit_file_tool.rs                  |   31 
crates/assistant_tools/src/streaming_edit_file_tool/description.md      |    2 
crates/assistant_tools/src/templates/create_file_prompt.hbs             |   12 
crates/assistant_tools/src/templates/edit_file_prompt.hbs               |    0 
crates/language/src/buffer.rs                                           |    8 
11 files changed, 2,632 insertions(+), 139 deletions(-)

Detailed changes

crates/assistant_tools/src/assistant_tools.rs 🔗

@@ -77,7 +77,6 @@ pub fn init(http_client: Arc<HttpClientWithUrl>, cx: &mut App) {
     registry.register_tool(TerminalTool);
     registry.register_tool(BatchTool);
     registry.register_tool(CreateDirectoryTool);
-    registry.register_tool(CreateFileTool);
     registry.register_tool(CopyPathTool);
     registry.register_tool(DeletePathTool);
     registry.register_tool(SymbolInfoTool);
@@ -125,12 +124,14 @@ pub fn init(http_client: Arc<HttpClientWithUrl>, cx: &mut App) {
 fn register_edit_file_tool(cx: &mut App) {
     let registry = ToolRegistry::global(cx);
 
+    registry.unregister_tool(CreateFileTool);
     registry.unregister_tool(EditFileTool);
     registry.unregister_tool(StreamingEditFileTool);
 
     if AssistantSettings::get_global(cx).stream_edits(cx) {
         registry.register_tool(StreamingEditFileTool);
     } else {
+        registry.register_tool(CreateFileTool);
         registry.register_tool(EditFileTool);
     }
 }

crates/assistant_tools/src/edit_agent.rs 🔗

@@ -10,6 +10,7 @@ use edit_parser::{EditParser, EditParserEvent, EditParserMetrics};
 use futures::{
     Stream, StreamExt,
     channel::mpsc::{self, UnboundedReceiver},
+    pin_mut,
     stream::BoxStream,
 };
 use gpui::{AppContext, AsyncApp, Entity, SharedString, Task};
@@ -23,19 +24,29 @@ use std::{cmp, iter, mem, ops::Range, path::PathBuf, sync::Arc, task::Poll};
 use streaming_diff::{CharOperation, StreamingDiff};
 
 #[derive(Serialize)]
-pub struct EditAgentTemplate {
+struct CreateFilePromptTemplate {
     path: Option<PathBuf>,
     edit_description: String,
 }
 
-impl Template for EditAgentTemplate {
-    const TEMPLATE_NAME: &'static str = "edit_agent.hbs";
+impl Template for CreateFilePromptTemplate {
+    const TEMPLATE_NAME: &'static str = "create_file_prompt.hbs";
+}
+
+#[derive(Serialize)]
+struct EditFilePromptTemplate {
+    path: Option<PathBuf>,
+    edit_description: String,
+}
+
+impl Template for EditFilePromptTemplate {
+    const TEMPLATE_NAME: &'static str = "edit_file_prompt.hbs";
 }
 
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub enum EditAgentOutputEvent {
     Edited,
-    HallucinatedOldText(SharedString),
+    OldTextNotFound(SharedString),
 }
 
 #[derive(Clone, Debug)]
@@ -64,6 +75,82 @@ impl EditAgent {
         }
     }
 
+    pub fn overwrite(
+        &self,
+        buffer: Entity<Buffer>,
+        edit_description: String,
+        previous_messages: Vec<LanguageModelRequestMessage>,
+        cx: &mut AsyncApp,
+    ) -> (
+        Task<Result<EditAgentOutput>>,
+        mpsc::UnboundedReceiver<EditAgentOutputEvent>,
+    ) {
+        let this = self.clone();
+        let (events_tx, events_rx) = mpsc::unbounded();
+        let output = cx.spawn(async move |cx| {
+            let snapshot = buffer.read_with(cx, |buffer, _| buffer.snapshot())?;
+            let path = cx.update(|cx| snapshot.resolve_file_path(cx, true))?;
+            let prompt = CreateFilePromptTemplate {
+                path,
+                edit_description,
+            }
+            .render(&this.templates)?;
+            let new_chunks = this.request(previous_messages, prompt, cx).await?;
+
+            let (output, mut inner_events) = this.replace_text_with_chunks(buffer, new_chunks, cx);
+            while let Some(event) = inner_events.next().await {
+                events_tx.unbounded_send(event).ok();
+            }
+            output.await
+        });
+        (output, events_rx)
+    }
+
+    fn replace_text_with_chunks(
+        &self,
+        buffer: Entity<Buffer>,
+        edit_chunks: impl 'static + Send + Stream<Item = Result<String, LanguageModelCompletionError>>,
+        cx: &mut AsyncApp,
+    ) -> (
+        Task<Result<EditAgentOutput>>,
+        mpsc::UnboundedReceiver<EditAgentOutputEvent>,
+    ) {
+        let (output_events_tx, output_events_rx) = mpsc::unbounded();
+        let this = self.clone();
+        let task = cx.spawn(async move |cx| {
+            // Ensure the buffer is tracked by the action log.
+            this.action_log
+                .update(cx, |log, cx| log.track_buffer(buffer.clone(), cx))?;
+
+            cx.update(|cx| {
+                buffer.update(cx, |buffer, cx| buffer.set_text("", cx));
+                this.action_log
+                    .update(cx, |log, cx| log.buffer_edited(buffer.clone(), cx));
+            })?;
+
+            let mut raw_edits = String::new();
+            pin_mut!(edit_chunks);
+            while let Some(chunk) = edit_chunks.next().await {
+                let chunk = chunk?;
+                raw_edits.push_str(&chunk);
+                cx.update(|cx| {
+                    buffer.update(cx, |buffer, cx| buffer.append(chunk, cx));
+                    this.action_log
+                        .update(cx, |log, cx| log.buffer_edited(buffer.clone(), cx));
+                })?;
+                output_events_tx
+                    .unbounded_send(EditAgentOutputEvent::Edited)
+                    .ok();
+            }
+
+            Ok(EditAgentOutput {
+                _raw_edits: raw_edits,
+                _parser_metrics: EditParserMetrics::default(),
+            })
+        });
+        (task, output_events_rx)
+    }
+
     pub fn edit(
         &self,
         buffer: Entity<Buffer>,
@@ -78,10 +165,15 @@ impl EditAgent {
         let (events_tx, events_rx) = mpsc::unbounded();
         let output = cx.spawn(async move |cx| {
             let snapshot = buffer.read_with(cx, |buffer, _| buffer.snapshot())?;
-            let edit_chunks = this
-                .request_edits(snapshot, edit_description, previous_messages, cx)
-                .await?;
-            let (output, mut inner_events) = this.apply_edits(buffer, edit_chunks, cx);
+            let path = cx.update(|cx| snapshot.resolve_file_path(cx, true))?;
+            let prompt = EditFilePromptTemplate {
+                path,
+                edit_description,
+            }
+            .render(&this.templates)?;
+            let edit_chunks = this.request(previous_messages, prompt, cx).await?;
+
+            let (output, mut inner_events) = this.apply_edit_chunks(buffer, edit_chunks, cx);
             while let Some(event) = inner_events.next().await {
                 events_tx.unbounded_send(event).ok();
             }
@@ -90,7 +182,7 @@ impl EditAgent {
         (output, events_rx)
     }
 
-    fn apply_edits(
+    fn apply_edit_chunks(
         &self,
         buffer: Entity<Buffer>,
         edit_chunks: impl 'static + Send + Stream<Item = Result<String, LanguageModelCompletionError>>,
@@ -138,7 +230,7 @@ impl EditAgent {
             let Some(old_range) = old_range else {
                 // We couldn't find the old text in the buffer. Report the error.
                 output_events
-                    .unbounded_send(EditAgentOutputEvent::HallucinatedOldText(old_text_query))
+                    .unbounded_send(EditAgentOutputEvent::OldTextNotFound(old_text_query))
                     .ok();
                 continue;
             };
@@ -232,7 +324,7 @@ impl EditAgent {
     ) {
         let (tx, rx) = mpsc::unbounded();
         let output = cx.background_spawn(async move {
-            futures::pin_mut!(chunks);
+            pin_mut!(chunks);
 
             let mut parser = EditParser::new();
             let mut raw_edits = String::new();
@@ -336,20 +428,12 @@ impl EditAgent {
         })
     }
 
-    async fn request_edits(
+    async fn request(
         &self,
-        snapshot: BufferSnapshot,
-        edit_description: String,
         mut messages: Vec<LanguageModelRequestMessage>,
+        prompt: String,
         cx: &mut AsyncApp,
     ) -> Result<BoxStream<'static, Result<String, LanguageModelCompletionError>>> {
-        let path = cx.update(|cx| snapshot.resolve_file_path(cx, true))?;
-        let prompt = EditAgentTemplate {
-            path,
-            edit_description,
-        }
-        .render(&self.templates)?;
-
         let mut message_content = Vec::new();
         if let Some(last_message) = messages.last_mut() {
             if last_message.role == Role::Assistant {
@@ -611,7 +695,8 @@ mod tests {
             &mut rng,
             cx,
         );
-        let (apply, _events) = agent.apply_edits(buffer.clone(), raw_edits, &mut cx.to_async());
+        let (apply, _events) =
+            agent.apply_edit_chunks(buffer.clone(), raw_edits, &mut cx.to_async());
         apply.await.unwrap();
         pretty_assertions::assert_eq!(
             buffer.read_with(cx, |buffer, _| buffer.snapshot().text()),
@@ -648,7 +733,8 @@ mod tests {
             &mut rng,
             cx,
         );
-        let (apply, _events) = agent.apply_edits(buffer.clone(), raw_edits, &mut cx.to_async());
+        let (apply, _events) =
+            agent.apply_edit_chunks(buffer.clone(), raw_edits, &mut cx.to_async());
         apply.await.unwrap();
         assert_eq!(
             buffer.read_with(cx, |buffer, _| buffer.snapshot().text()),
@@ -679,7 +765,8 @@ mod tests {
             &mut rng,
             cx,
         );
-        let (apply, _events) = agent.apply_edits(buffer.clone(), raw_edits, &mut cx.to_async());
+        let (apply, _events) =
+            agent.apply_edit_chunks(buffer.clone(), raw_edits, &mut cx.to_async());
         apply.await.unwrap();
         assert_eq!(
             buffer.read_with(cx, |buffer, _| buffer.snapshot().text()),
@@ -692,7 +779,7 @@ mod tests {
         let agent = init_test(cx).await;
         let buffer = cx.new(|cx| Buffer::local("abc\ndef\nghi", cx));
         let (chunks_tx, chunks_rx) = mpsc::unbounded();
-        let (apply, mut events) = agent.apply_edits(
+        let (apply, mut events) = agent.apply_edit_chunks(
             buffer.clone(),
             chunks_rx.map(|chunk: &str| Ok(chunk.to_string())),
             &mut cx.to_async(),
@@ -744,7 +831,7 @@ mod tests {
         cx.run_until_parked();
         assert_eq!(
             drain_events(&mut events),
-            vec![EditAgentOutputEvent::HallucinatedOldText(
+            vec![EditAgentOutputEvent::OldTextNotFound(
                 "hallucinated old".into()
             )]
         );
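
The hunk for `streaming_edit_file_tool.rs` isn't reproduced in this section, but given the new `EditAgent::overwrite` entry point above, the tool presumably branches on the new flag. Below is a hedged sketch of that dispatch as a hypothetical helper; the helper name is illustrative, and only the `edit`/`overwrite` signatures are taken from the diff.

```rust
use anyhow::Result;
use futures::channel::mpsc;
use gpui::{AsyncApp, Entity, Task};
use language::Buffer;
use language_model::LanguageModelRequestMessage;

// Hedged sketch, not the actual tool code: choose between the two EditAgent
// entry points based on the new `create_or_overwrite` flag.
fn spawn_edit(
    agent: &EditAgent,
    buffer: Entity<Buffer>,
    input: &StreamingEditFileToolInput,
    previous_messages: Vec<LanguageModelRequestMessage>,
    cx: &mut AsyncApp,
) -> (
    Task<Result<EditAgentOutput>>,
    mpsc::UnboundedReceiver<EditAgentOutputEvent>,
) {
    if input.create_or_overwrite {
        // Stream the model's output straight into an emptied buffer.
        agent.overwrite(buffer, input.display_description.clone(), previous_messages, cx)
    } else {
        // Ask for old_text/new_text edits and apply them as they stream in.
        agent.edit(buffer, input.display_description.clone(), previous_messages, cx)
    }
}
```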

crates/assistant_tools/src/edit_agent/evals.rs 🔗

@@ -4,10 +4,11 @@ use crate::{
     streaming_edit_file_tool::StreamingEditFileToolInput,
 };
 use Role::*;
-use anyhow::{Context, anyhow};
+use anyhow::anyhow;
 use client::{Client, UserStore};
 use collections::HashMap;
 use fs::FakeFs;
+use futures::{FutureExt, future::LocalBoxFuture};
 use gpui::{AppContext, TestAppContext};
 use indoc::indoc;
 use language_model::{
@@ -71,14 +72,15 @@ fn eval_extract_handle_command_output() {
                         StreamingEditFileToolInput {
                             display_description: edit_description.into(),
                             path: input_file_path.into(),
+                            create_or_overwrite: false,
                         },
                     )],
                 ),
             ],
             input_path: input_file_path.into(),
-            input_content: input_file_content.into(),
+            input_content: Some(input_file_content.into()),
             edit_description: edit_description.into(),
-            assertion: EvalAssertion::AssertEqual(output_file_content.into()),
+            assertion: EvalAssertion::assert_eq(output_file_content),
         },
     );
 }
@@ -126,14 +128,15 @@ fn eval_delete_run_git_blame() {
                         StreamingEditFileToolInput {
                             display_description: edit_description.into(),
                             path: input_file_path.into(),
+                            create_or_overwrite: false,
                         },
                     )],
                 ),
             ],
             input_path: input_file_path.into(),
-            input_content: input_file_content.into(),
+            input_content: Some(input_file_content.into()),
             edit_description: edit_description.into(),
-            assertion: EvalAssertion::AssertEqual(output_file_content.into()),
+            assertion: EvalAssertion::assert_eq(output_file_content),
         },
     );
 }
@@ -240,14 +243,15 @@ fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
                         StreamingEditFileToolInput {
                             display_description: edit_description.into(),
                             path: input_file_path.into(),
+                            create_or_overwrite: false,
                         },
                     )],
                 ),
             ],
             input_path: input_file_path.into(),
-            input_content: input_file_content.into(),
+            input_content: Some(input_file_content.into()),
             edit_description: edit_description.into(),
-            assertion: EvalAssertion::JudgeDiff(indoc! {"
+            assertion: EvalAssertion::judge_diff(indoc! {"
                 - The compile_parser_to_wasm method has been changed to use wasi-sdk
                 - ureq is used to download the SDK for current platform and architecture
             "}),
@@ -315,14 +319,15 @@ fn eval_disable_cursor_blinking() {
                         StreamingEditFileToolInput {
                             display_description: edit_description.into(),
                             path: input_file_path.into(),
+                            create_or_overwrite: false,
                         },
                     )],
                 ),
             ],
             input_path: input_file_path.into(),
-            input_content: input_file_content.into(),
+            input_content: Some(input_file_content.into()),
             edit_description: edit_description.into(),
-            assertion: EvalAssertion::AssertEqual(output_file_content.into()),
+            assertion: EvalAssertion::assert_eq(output_file_content),
         },
     );
 }
@@ -504,14 +509,15 @@ fn eval_from_pixels_constructor() {
                         StreamingEditFileToolInput {
                             display_description: edit_description.into(),
                             path: input_file_path.into(),
+                            create_or_overwrite: false,
                         },
                     )],
                 ),
             ],
             input_path: input_file_path.into(),
-            input_content: input_file_content.into(),
+            input_content: Some(input_file_content.into()),
             edit_description: edit_description.into(),
-            assertion: EvalAssertion::JudgeDiff(indoc! {"
+            assertion: EvalAssertion::judge_diff(indoc! {"
                 - The diff contains a new `from_pixels` constructor
                 - The diff contains new tests for the `from_pixels` constructor
             "}),
@@ -519,6 +525,104 @@ fn eval_from_pixels_constructor() {
     );
 }
 
+#[test]
+#[cfg_attr(not(feature = "eval"), ignore)]
+fn eval_zode() {
+    let input_file_path = "root/zode.py";
+    let edit_description = "Create the main Zode CLI script";
+    eval(
+        200,
+        1.,
+        EvalInput {
+            conversation: vec![
+                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
+                message(
+                    Assistant,
+                    [
+                        tool_use(
+                            "tool_1",
+                            "read_file",
+                            ReadFileToolInput {
+                                path: "root/eval/react.py".into(),
+                                start_line: None,
+                                end_line: None,
+                            },
+                        ),
+                        tool_use(
+                            "tool_2",
+                            "read_file",
+                            ReadFileToolInput {
+                                path: "root/eval/react_test.py".into(),
+                                start_line: None,
+                                end_line: None,
+                            },
+                        ),
+                    ],
+                ),
+                message(
+                    User,
+                    [
+                        tool_result(
+                            "tool_1",
+                            "read_file",
+                            include_str!("evals/fixtures/zode/react.py"),
+                        ),
+                        tool_result(
+                            "tool_2",
+                            "read_file",
+                            include_str!("evals/fixtures/zode/react_test.py"),
+                        ),
+                    ],
+                ),
+                message(
+                    Assistant,
+                    [
+                        text(
+                            "Now that I understand what we need to build, I'll create the main Python script:",
+                        ),
+                        tool_use(
+                            "tool_3",
+                            "edit_file",
+                            StreamingEditFileToolInput {
+                                display_description: edit_description.into(),
+                                path: input_file_path.into(),
+                                create_or_overwrite: true,
+                            },
+                        ),
+                    ],
+                ),
+            ],
+            input_path: input_file_path.into(),
+            input_content: None,
+            edit_description: edit_description.into(),
+            assertion: EvalAssertion::new(async move |sample, _, _cx| {
+                let invalid_starts = [' ', '`', '\n'];
+                let mut message = String::new();
+                for start in invalid_starts {
+                    if sample.text.starts_with(start) {
+                        message.push_str(&format!("The sample starts with a {:?}\n", start));
+                        break;
+                    }
+                }
+                // Remove trailing newline.
+                message.pop();
+
+                if message.is_empty() {
+                    Ok(EvalAssertionOutcome {
+                        score: 100,
+                        message: None,
+                    })
+                } else {
+                    Ok(EvalAssertionOutcome {
+                        score: 0,
+                        message: Some(message),
+                    })
+                }
+            }),
+        },
+    );
+}
+
 fn message(
     role: Role,
     contents: impl IntoIterator<Item = MessageContent>,
@@ -574,11 +678,135 @@ fn tool_result(
 struct EvalInput {
     conversation: Vec<LanguageModelRequestMessage>,
     input_path: PathBuf,
-    input_content: String,
+    input_content: Option<String>,
     edit_description: String,
     assertion: EvalAssertion,
 }
 
+#[derive(Clone)]
+struct EvalSample {
+    text: String,
+    edit_output: EditAgentOutput,
+    diff: String,
+}
+
+trait AssertionFn: 'static + Send + Sync {
+    fn assert<'a>(
+        &'a self,
+        sample: &'a EvalSample,
+        judge_model: Arc<dyn LanguageModel>,
+        cx: &'a mut TestAppContext,
+    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
+}
+
+impl<F> AssertionFn for F
+where
+    F: 'static
+        + Send
+        + Sync
+        + AsyncFn(
+            &EvalSample,
+            Arc<dyn LanguageModel>,
+            &mut TestAppContext,
+        ) -> Result<EvalAssertionOutcome>,
+{
+    fn assert<'a>(
+        &'a self,
+        sample: &'a EvalSample,
+        judge_model: Arc<dyn LanguageModel>,
+        cx: &'a mut TestAppContext,
+    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
+        (self)(sample, judge_model, cx).boxed_local()
+    }
+}
+
+#[derive(Clone)]
+struct EvalAssertion(Arc<dyn AssertionFn>);
+
+impl EvalAssertion {
+    fn new<F>(f: F) -> Self
+    where
+        F: 'static
+            + Send
+            + Sync
+            + AsyncFn(
+                &EvalSample,
+                Arc<dyn LanguageModel>,
+                &mut TestAppContext,
+            ) -> Result<EvalAssertionOutcome>,
+    {
+        EvalAssertion(Arc::new(f))
+    }
+
+    fn assert_eq(expected: impl Into<String>) -> Self {
+        let expected = expected.into();
+        Self::new(async move |sample, _judge, _cx| {
+            Ok(EvalAssertionOutcome {
+                score: if strip_empty_lines(&sample.text) == strip_empty_lines(&expected) {
+                    100
+                } else {
+                    0
+                },
+                message: None,
+            })
+        })
+    }
+
+    fn judge_diff(assertions: &'static str) -> Self {
+        Self::new(async move |sample, judge, cx| {
+            let prompt = DiffJudgeTemplate {
+                diff: sample.diff.clone(),
+                assertions,
+            }
+            .render(&Templates::new())
+            .unwrap();
+
+            let request = LanguageModelRequest {
+                messages: vec![LanguageModelRequestMessage {
+                    role: Role::User,
+                    content: vec![prompt.into()],
+                    cache: false,
+                }],
+                ..Default::default()
+            };
+            let mut response = judge
+                .stream_completion_text(request, &cx.to_async())
+                .await?;
+            let mut output = String::new();
+            while let Some(chunk) = response.stream.next().await {
+                let chunk = chunk?;
+                output.push_str(&chunk);
+            }
+
+            // Parse the score from the response
+            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
+            if let Some(captures) = re.captures(&output) {
+                if let Some(score_match) = captures.get(1) {
+                    let score = score_match.as_str().parse().unwrap_or(0);
+                    return Ok(EvalAssertionOutcome {
+                        score,
+                        message: Some(output),
+                    });
+                }
+            }
+
+            Err(anyhow!(
+                "No score found in response. Raw output: {}",
+                output
+            ))
+        })
+    }
+
+    async fn run(
+        &self,
+        input: &EvalSample,
+        judge_model: Arc<dyn LanguageModel>,
+        cx: &mut TestAppContext,
+    ) -> Result<EvalAssertionOutcome> {
+        self.0.assert(input, judge_model, cx).await
+    }
+}
+
 fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
     let mut evaluated_count = 0;
     report_progress(evaluated_count, iterations);
@@ -606,12 +834,12 @@ fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
     while let Ok(output) = rx.recv() {
         match output {
             Ok(output) => {
-                cumulative_parser_metrics += output.edit_output._parser_metrics.clone();
+                cumulative_parser_metrics += output.sample.edit_output._parser_metrics.clone();
                 eval_outputs.push(output.clone());
                 if output.assertion.score < 80 {
                     failed_count += 1;
                     failed_evals
-                        .entry(output.buffer_text.clone())
+                        .entry(output.sample.text.clone())
                         .or_insert(Vec::new())
                         .push(output);
                 }
@@ -671,10 +899,8 @@ fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
 
 #[derive(Clone)]
 struct EvalOutput {
-    assertion: EvalAssertionResult,
-    buffer_text: String,
-    edit_output: EditAgentOutput,
-    diff: String,
+    sample: EvalSample,
+    assertion: EvalAssertionOutcome,
 }
 
 impl Display for EvalOutput {
@@ -684,14 +910,14 @@ impl Display for EvalOutput {
             writeln!(f, "Message: {}", message)?;
         }
 
-        writeln!(f, "Diff:\n{}", self.diff)?;
+        writeln!(f, "Diff:\n{}", self.sample.diff)?;
 
         writeln!(
             f,
             "Parser Metrics:\n{:#?}",
-            self.edit_output._parser_metrics
+            self.sample.edit_output._parser_metrics
         )?;
-        writeln!(f, "Raw Edits:\n{}", self.edit_output._raw_edits)?;
+        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output._raw_edits)?;
         Ok(())
     }
 }
@@ -777,96 +1003,45 @@ impl EditAgentTest {
             .update(cx, |project, cx| project.open_buffer(path, cx))
             .await
             .unwrap();
-        buffer.update(cx, |buffer, cx| {
-            buffer.set_text(eval.input_content.clone(), cx)
-        });
-        let (edit_output, _events) = self.agent.edit(
-            buffer.clone(),
-            eval.edit_description,
-            eval.conversation,
-            &mut cx.to_async(),
-        );
-        let edit_output = edit_output.await?;
-        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
-        let actual_diff = language::unified_diff(&eval.input_content, &buffer_text);
-        let assertion = match eval.assertion {
-            EvalAssertion::AssertEqual(expected_output) => EvalAssertionResult {
-                score: if strip_empty_lines(&buffer_text) == strip_empty_lines(&expected_output) {
-                    100
-                } else {
-                    0
-                },
-                message: None,
-            },
-            EvalAssertion::JudgeDiff(assertions) => self
-                .judge_diff(&actual_diff, assertions, &cx.to_async())
-                .await
-                .context("failed comparing diffs")?,
+        let edit_output = if let Some(input_content) = eval.input_content.as_deref() {
+            buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
+            let (edit_output, _) = self.agent.edit(
+                buffer.clone(),
+                eval.edit_description,
+                eval.conversation,
+                &mut cx.to_async(),
+            );
+            edit_output.await?
+        } else {
+            let (edit_output, _) = self.agent.overwrite(
+                buffer.clone(),
+                eval.edit_description,
+                eval.conversation,
+                &mut cx.to_async(),
+            );
+            edit_output.await?
         };
 
-        Ok(EvalOutput {
-            assertion,
-            diff: actual_diff,
-            buffer_text,
+        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
+        let sample = EvalSample {
             edit_output,
-        })
-    }
-
-    async fn judge_diff(
-        &self,
-        diff: &str,
-        assertions: &'static str,
-        cx: &AsyncApp,
-    ) -> Result<EvalAssertionResult> {
-        let prompt = DiffJudgeTemplate {
-            diff: diff.to_string(),
-            assertions,
-        }
-        .render(&self.agent.templates)
-        .unwrap();
-
-        let request = LanguageModelRequest {
-            messages: vec![LanguageModelRequestMessage {
-                role: Role::User,
-                content: vec![prompt.into()],
-                cache: false,
-            }],
-            ..Default::default()
+            diff: language::unified_diff(
+                eval.input_content.as_deref().unwrap_or_default(),
+                &buffer_text,
+            ),
+            text: buffer_text,
         };
-        let mut response = self.judge_model.stream_completion_text(request, cx).await?;
-        let mut output = String::new();
-        while let Some(chunk) = response.stream.next().await {
-            let chunk = chunk?;
-            output.push_str(&chunk);
-        }
-
-        // Parse the score from the response
-        let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
-        if let Some(captures) = re.captures(&output) {
-            if let Some(score_match) = captures.get(1) {
-                let score = score_match.as_str().parse().unwrap_or(0);
-                return Ok(EvalAssertionResult {
-                    score,
-                    message: Some(output),
-                });
-            }
-        }
+        let assertion = eval
+            .assertion
+            .run(&sample, self.judge_model.clone(), cx)
+            .await?;
 
-        Err(anyhow!(
-            "No score found in response. Raw output: {}",
-            output
-        ))
+        Ok(EvalOutput { assertion, sample })
     }
 }
 
 #[derive(Clone, Debug, Eq, PartialEq, Hash)]
-enum EvalAssertion {
-    AssertEqual(String),
-    JudgeDiff(&'static str),
-}
-
-#[derive(Clone, Debug, Eq, PartialEq, Hash)]
-struct EvalAssertionResult {
+struct EvalAssertionOutcome {
     score: usize,
     message: Option<String>,
 }
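
With the refactor above, an assertion is just an async closure over an `EvalSample`, so custom checks can be written inline. A small illustrative example (the `fn from_pixels` marker is hypothetical, not from this PR):

```rust
// Illustrative only: a custom assertion that passes when the sample contains a marker.
let assertion = EvalAssertion::new(async move |sample, _judge_model, _cx| {
    Ok(EvalAssertionOutcome {
        score: if sample.text.contains("fn from_pixels") { 100 } else { 0 },
        message: None,
    })
});
```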

crates/assistant_tools/src/edit_agent/evals/fixtures/zode/prompt.md 🔗

@@ -0,0 +1,2193 @@
+- We're building a CLI code agent tool called Zode that is intended to work like Aider or Claude Code
+- We're starting from a completely blank project
+- Like Aider/Claude Code you take the user's initial prompt and then call the LLM and perform tool calls in a loop until the ultimate goal is achieved.
+- Unlike Aider or Claude Code, it's not intended to be interactive. Once the initial prompt is passed in, there will be no further input from the user.
+- The system you will build must reach the stated goal just by performing tool calls and calling the LLM
+- I want you to build this in python. Use the anthropic python sdk and the model context protocol sdk. Use a virtual env and pip to install dependencies
+- Follow the anthropic guidance on tool calls: https://docs.anthropic.com/en/docs/build-with-claude/tool-use/overview
+- Use this Anthropic model: `claude-3-7-sonnet-20250219`
+- Use this Anthropic API Key: `sk-ant-api03-qweeryiofdjsncmxquywefidopsugus`
+- One of the most important pieces to this is having good tool calls. We will be using the tools provided by the Claude MCP server. You can start this server using `claude mcp serve` and then you will need to write code that acts as an MCP **client** to connect to this MCP server. Likely you want to start this using a subprocess. The JSON schema describing the tools available via this SDK is included below. Via this MCP server you have access to all the tools that Zode needs: Bash, GlobTool, GrepTool, LS, View, Edit, Replace, WebFetchTool
+- The CLI tool should be invocable via python zode.py file.md, where file.md is any file that contains the user's prompt. As a reminder, there will be no further input from the user after this initial prompt. Zode must take it from there and call the LLM and tools until the user's goal is accomplished
+- Try to keep all code in zode.py and make heavy use of the SDKs I mentioned
+- Once you’ve implemented this, you must run python zode.py eval/instructions.md to see how well our new agent tool does!
+
+Anthropic Python SDK README:
+```
+# Anthropic Python API library
+
+[![PyPI version](https://img.shields.io/pypi/v/anthropic.svg)](https://pypi.org/project/anthropic/)
+
+The Anthropic Python library provides convenient access to the Anthropic REST API from any Python 3.8+
+application. It includes type definitions for all request params and response fields,
+and offers both synchronous and asynchronous clients powered by [httpx](https://github.com/encode/httpx).
+
+## Documentation
+
+The REST API documentation can be found on [docs.anthropic.com](https://docs.anthropic.com/claude/reference/). The full API of this library can be found in [api.md](api.md).
+
+## Installation
+
+```sh
+# install from PyPI
+pip install anthropic
+```
+
+## Usage
+
+The full API of this library can be found in [api.md](api.md).
+
+```python
+import os
+from anthropic import Anthropic
+
+client = Anthropic(
+    api_key=os.environ.get("ANTHROPIC_API_KEY"),  # This is the default and can be omitted
+)
+
+message = client.messages.create(
+    max_tokens=1024,
+    messages=[
+        {
+            "role": "user",
+            "content": "Hello, Claude",
+        }
+    ],
+    model="claude-3-5-sonnet-latest",
+)
+print(message.content)
+```
+
+While you can provide an `api_key` keyword argument,
+we recommend using [python-dotenv](https://pypi.org/project/python-dotenv/)
+to add `ANTHROPIC_API_KEY="my-anthropic-api-key"` to your `.env` file
+so that your API Key is not stored in source control.
+
+## Async usage
+
+Simply import `AsyncAnthropic` instead of `Anthropic` and use `await` with each API call:
+
+```python
+import os
+import asyncio
+from anthropic import AsyncAnthropic
+
+client = AsyncAnthropic(
+    api_key=os.environ.get("ANTHROPIC_API_KEY"),  # This is the default and can be omitted
+)
+
+
+async def main() -> None:
+    message = await client.messages.create(
+        max_tokens=1024,
+        messages=[
+            {
+                "role": "user",
+                "content": "Hello, Claude",
+            }
+        ],
+        model="claude-3-5-sonnet-latest",
+    )
+    print(message.content)
+
+
+asyncio.run(main())
+```
+
+Functionality between the synchronous and asynchronous clients is otherwise identical.
+
+## Streaming responses
+
+We provide support for streaming responses using Server Side Events (SSE).
+
+```python
+from anthropic import Anthropic
+
+client = Anthropic()
+
+stream = client.messages.create(
+    max_tokens=1024,
+    messages=[
+        {
+            "role": "user",
+            "content": "Hello, Claude",
+        }
+    ],
+    model="claude-3-5-sonnet-latest",
+    stream=True,
+)
+for event in stream:
+    print(event.type)
+```
+
+The async client uses the exact same interface.
+
+```python
+from anthropic import AsyncAnthropic
+
+client = AsyncAnthropic()
+
+stream = await client.messages.create(
+    max_tokens=1024,
+    messages=[
+        {
+            "role": "user",
+            "content": "Hello, Claude",
+        }
+    ],
+    model="claude-3-5-sonnet-latest",
+    stream=True,
+)
+async for event in stream:
+    print(event.type)
+```
+
+### Streaming Helpers
+
+This library provides several conveniences for streaming messages, for example:
+
+```py
+import asyncio
+from anthropic import AsyncAnthropic
+
+client = AsyncAnthropic()
+
+async def main() -> None:
+    async with client.messages.stream(
+        max_tokens=1024,
+        messages=[
+            {
+                "role": "user",
+                "content": "Say hello there!",
+            }
+        ],
+        model="claude-3-5-sonnet-latest",
+    ) as stream:
+        async for text in stream.text_stream:
+            print(text, end="", flush=True)
+        print()
+
+    message = await stream.get_final_message()
+    print(message.to_json())
+
+asyncio.run(main())
+```
+
+Streaming with `client.messages.stream(...)` exposes [various helpers for your convenience](helpers.md) including accumulation & SDK-specific events.
+
+Alternatively, you can use `client.messages.create(..., stream=True)` which only returns an async iterable of the events in the stream and thus uses less memory (it does not build up a final message object for you).
+
+## Token counting
+
+To get the token count for a message without creating it you can use the `client.beta.messages.count_tokens()` method. This takes the same `messages` list as the `.create()` method.
+
+```py
+count = client.beta.messages.count_tokens(
+    model="claude-3-5-sonnet-20241022",
+    messages=[
+        {"role": "user", "content": "Hello, world"}
+    ]
+)
+count.input_tokens  # 10
+```
+
+You can also see the exact usage for a given request through the `usage` response property, e.g.
+
+```py
+message = client.messages.create(...)
+message.usage
+# Usage(input_tokens=25, output_tokens=13)
+```
+
+## Message Batches
+
+This SDK provides beta support for the [Message Batches API](https://docs.anthropic.com/en/docs/build-with-claude/message-batches) under the `client.beta.messages.batches` namespace.
+
+
+### Creating a batch
+
+Message Batches take the exact same request params as the standard Messages API:
+
+```python
+await client.beta.messages.batches.create(
+    requests=[
+        {
+            "custom_id": "my-first-request",
+            "params": {
+                "model": "claude-3-5-sonnet-latest",
+                "max_tokens": 1024,
+                "messages": [{"role": "user", "content": "Hello, world"}],
+            },
+        },
+        {
+            "custom_id": "my-second-request",
+            "params": {
+                "model": "claude-3-5-sonnet-latest",
+                "max_tokens": 1024,
+                "messages": [{"role": "user", "content": "Hi again, friend"}],
+            },
+        },
+    ]
+)
+```
+
+
+### Getting results from a batch
+
+Once a Message Batch has been processed, indicated by `.processing_status === 'ended'`, you can access the results with `.batches.results()`
+
+```python
+result_stream = await client.beta.messages.batches.results(batch_id)
+async for entry in result_stream:
+    if entry.result.type == "succeeded":
+        print(entry.result.message.content)
+```
+
+## Tool use
+
+This SDK provides support for tool use, aka function calling. More details can be found in [the documentation](https://docs.anthropic.com/claude/docs/tool-use).
+
+## AWS Bedrock
+
+This library also provides support for the [Anthropic Bedrock API](https://aws.amazon.com/bedrock/claude/) if you install this library with the `bedrock` extra, e.g. `pip install -U anthropic[bedrock]`.
+
+You can then import and instantiate a separate `AnthropicBedrock` class, the rest of the API is the same.
+
+```py
+from anthropic import AnthropicBedrock
+
+client = AnthropicBedrock()
+
+message = client.messages.create(
+    max_tokens=1024,
+    messages=[
+        {
+            "role": "user",
+            "content": "Hello!",
+        }
+    ],
+    model="anthropic.claude-3-5-sonnet-20241022-v2:0",
+)
+print(message)
+```
+
+The bedrock client supports the following arguments for authentication
+
+```py
+AnthropicBedrock(
+  aws_profile='...',
+  aws_region='us-east',
+  aws_secret_key='...',
+  aws_access_key='...',
+  aws_session_token='...',
+)
+```
+
+For a more fully fledged example see [`examples/bedrock.py`](https://github.com/anthropics/anthropic-sdk-python/blob/main/examples/bedrock.py).
+
+## Google Vertex
+
+This library also provides support for the [Anthropic Vertex API](https://cloud.google.com/vertex-ai?hl=en) if you install this library with the `vertex` extra, e.g. `pip install -U anthropic[vertex]`.
+
+You can then import and instantiate a separate `AnthropicVertex`/`AsyncAnthropicVertex` class, which has the same API as the base `Anthropic`/`AsyncAnthropic` class.
+
+```py
+from anthropic import AnthropicVertex
+
+client = AnthropicVertex()
+
+message = client.messages.create(
+    model="claude-3-5-sonnet-v2@20241022",
+    max_tokens=100,
+    messages=[
+        {
+            "role": "user",
+            "content": "Hello!",
+        }
+    ],
+)
+print(message)
+```
+
+For a more complete example see [`examples/vertex.py`](https://github.com/anthropics/anthropic-sdk-python/blob/main/examples/vertex.py).
+
+## Using types
+
+Nested request parameters are [TypedDicts](https://docs.python.org/3/library/typing.html#typing.TypedDict). Responses are [Pydantic models](https://docs.pydantic.dev) which also provide helper methods for things like:
+
+- Serializing back into JSON, `model.to_json()`
+- Converting to a dictionary, `model.to_dict()`
+
+Typed requests and responses provide autocomplete and documentation within your editor. If you would like to see type errors in VS Code to help catch bugs earlier, set `python.analysis.typeCheckingMode` to `basic`.
+
+## Pagination
+
+List methods in the Anthropic API are paginated.
+
+This library provides auto-paginating iterators with each list response, so you do not have to request successive pages manually:
+
+```python
+from anthropic import Anthropic
+
+client = Anthropic()
+
+all_batches = []
+# Automatically fetches more pages as needed.
+for batch in client.beta.messages.batches.list(
+    limit=20,
+):
+    # Do something with batch here
+    all_batches.append(batch)
+print(all_batches)
+```
+
+Or, asynchronously:
+
+```python
+import asyncio
+from anthropic import AsyncAnthropic
+
+client = AsyncAnthropic()
+
+
+async def main() -> None:
+    all_batches = []
+    # Iterate through items across all pages, issuing requests as needed.
+    async for batch in client.beta.messages.batches.list(
+        limit=20,
+    ):
+        all_batches.append(batch)
+    print(all_batches)
+
+
+asyncio.run(main())
+```
+
+Alternatively, you can use the `.has_next_page()`, `.next_page_info()`, or `.get_next_page()` methods for more granular control working with pages:
+
+```python
+first_page = await client.beta.messages.batches.list(
+    limit=20,
+)
+if first_page.has_next_page():
+    print(f"will fetch next page using these details: {first_page.next_page_info()}")
+    next_page = await first_page.get_next_page()
+    print(f"number of items we just fetched: {len(next_page.data)}")
+
+# Remove `await` for non-async usage.
+```
+
+Or just work directly with the returned data:
+
+```python
+first_page = await client.beta.messages.batches.list(
+    limit=20,
+)
+
+print(f"next page cursor: {first_page.last_id}")  # => "next page cursor: ..."
+for batch in first_page.data:
+    print(batch.id)
+
+# Remove `await` for non-async usage.
+```
+
+## Handling errors
+
+When the library is unable to connect to the API (for example, due to network connection problems or a timeout), a subclass of `anthropic.APIConnectionError` is raised.
+
+When the API returns a non-success status code (that is, 4xx or 5xx
+response), a subclass of `anthropic.APIStatusError` is raised, containing `status_code` and `response` properties.
+
+All errors inherit from `anthropic.APIError`.
+
+```python
+import anthropic
+from anthropic import Anthropic
+
+client = Anthropic()
+
+try:
+    client.messages.create(
+        max_tokens=1024,
+        messages=[
+            {
+                "role": "user",
+                "content": "Hello, Claude",
+            }
+        ],
+        model="claude-3-5-sonnet-latest",
+    )
+except anthropic.APIConnectionError as e:
+    print("The server could not be reached")
+    print(e.__cause__)  # an underlying Exception, likely raised within httpx.
+except anthropic.RateLimitError as e:
+    print("A 429 status code was received; we should back off a bit.")
+except anthropic.APIStatusError as e:
+    print("Another non-200-range status code was received")
+    print(e.status_code)
+    print(e.response)
+```
+
+Error codes are as follows:
+
+| Status Code | Error Type                 |
+| ----------- | -------------------------- |
+| 400         | `BadRequestError`          |
+| 401         | `AuthenticationError`      |
+| 403         | `PermissionDeniedError`    |
+| 404         | `NotFoundError`            |
+| 422         | `UnprocessableEntityError` |
+| 429         | `RateLimitError`           |
+| >=500       | `InternalServerError`      |
+| N/A         | `APIConnectionError`       |
+
+## Request IDs
+
+> For more information on debugging requests, see [these docs](https://docs.anthropic.com/en/api/errors#request-id)
+
+All object responses in the SDK provide a `_request_id` property which is added from the `request-id` response header so that you can quickly log failing requests and report them back to Anthropic.
+
+```python
+message = client.messages.create(
+    max_tokens=1024,
+    messages=[
+        {
+            "role": "user",
+            "content": "Hello, Claude",
+        }
+    ],
+    model="claude-3-5-sonnet-latest",
+)
+print(message._request_id)  # req_018EeWyXxfu5pfWkrYcMdjWG
+```
+
+Note that unlike other properties that use an `_` prefix, the `_request_id` property
+*is* public. Unless documented otherwise, *all* other `_` prefix properties,
+methods and modules are *private*.
+
+### Retries
+
+Certain errors are automatically retried 2 times by default, with a short exponential backoff.
+Connection errors (for example, due to a network connectivity problem), 408 Request Timeout, 409 Conflict,
+429 Rate Limit, and >=500 Internal errors are all retried by default.
+
+You can use the `max_retries` option to configure or disable retry settings:
+
+```python
+from anthropic import Anthropic
+
+# Configure the default for all requests:
+client = Anthropic(
+    # default is 2
+    max_retries=0,
+)
+
+# Or, configure per-request:
+client.with_options(max_retries=5).messages.create(
+    max_tokens=1024,
+    messages=[
+        {
+            "role": "user",
+            "content": "Hello, Claude",
+        }
+    ],
+    model="claude-3-5-sonnet-latest",
+)
+```
+
+### Timeouts
+
+By default requests time out after 10 minutes. You can configure this with a `timeout` option,
+which accepts a float or an [`httpx.Timeout`](https://www.python-httpx.org/advanced/#fine-tuning-the-configuration) object:
+
+```python
+from anthropic import Anthropic
+
+# Configure the default for all requests:
+client = Anthropic(
+    # 20 seconds (default is 10 minutes)
+    timeout=20.0,
+)
+
+# More granular control:
+client = Anthropic(
+    timeout=httpx.Timeout(60.0, read=5.0, write=10.0, connect=2.0),
+)
+
+# Override per-request:
+client.with_options(timeout=5.0).messages.create(
+    max_tokens=1024,
+    messages=[
+        {
+            "role": "user",
+            "content": "Hello, Claude",
+        }
+    ],
+    model="claude-3-5-sonnet-latest",
+)
+```
+
+On timeout, an `APITimeoutError` is thrown.
+
+Note that requests that time out are [retried twice by default](#retries).
+
+### Long Requests
+
+> [!IMPORTANT]
+> We highly encourage you use the streaming [Messages API](#streaming-responses) for longer running requests.
+
+We do not recommend setting a large `max_tokens` values without using streaming.
+Some networks may drop idle connections after a certain period of time, which
+can cause the request to fail or [timeout](#timeouts) without receiving a response from Anthropic.
+
+This SDK will also throw a `ValueError` if a non-streaming request is expected to be above roughly 10 minutes long.
+Passing `stream=True` or [overriding](#timeouts) the `timeout` option at the client or request level disables this error.
+
+An expected request latency longer than the [timeout](#timeouts) for a non-streaming request
+will result in the client terminating the connection and retrying without receiving a response.
+
+We set a [TCP socket keep-alive](https://tldp.org/HOWTO/TCP-Keepalive-HOWTO/overview.html) option in order
+to reduce the impact of idle connection timeouts on some networks.
+This can be [overridden](#Configuring-the-HTTP-client) by passing a `http_client` option to the client.
+
+## Default Headers
+
+We automatically send the `anthropic-version` header set to `2023-06-01`.
+
+If you need to, you can override it by setting default headers per-request or on the client object.
+
+Be aware that doing so may result in incorrect types and other unexpected or undefined behavior in the SDK.
+
+```python
+from anthropic import Anthropic
+
+client = Anthropic(
+    default_headers={"anthropic-version": "My-Custom-Value"},
+)
+```
+
+## Advanced
+
+### Logging
+
+We use the standard library [`logging`](https://docs.python.org/3/library/logging.html) module.
+
+You can enable logging by setting the environment variable `ANTHROPIC_LOG` to `info`.
+
+```shell
+$ export ANTHROPIC_LOG=info
+```
+
+Or to `debug` for more verbose logging.
+
+### How to tell whether `None` means `null` or missing
+
+In an API response, a field may be explicitly `null`, or missing entirely; in either case, its value is `None` in this library. You can differentiate the two cases with `.model_fields_set`:
+
+```py
+if response.my_field is None:
+  if 'my_field' not in response.model_fields_set:
+    print('Got json like {}, without a "my_field" key present at all.')
+  else:
+    print('Got json like {"my_field": null}.')
+```
+
+### Accessing raw response data (e.g. headers)
+
+The "raw" Response object can be accessed by prefixing `.with_raw_response.` to any HTTP method call, e.g.,
+
+```py
+from anthropic import Anthropic
+
+client = Anthropic()
+response = client.messages.with_raw_response.create(
+    max_tokens=1024,
+    messages=[{
+        "role": "user",
+        "content": "Hello, Claude",
+    }],
+    model="claude-3-5-sonnet-latest",
+)
+print(response.headers.get('X-My-Header'))
+
+message = response.parse()  # get the object that `messages.create()` would have returned
+print(message.content)
+```
+
+These methods return a [`LegacyAPIResponse`](https://github.com/anthropics/anthropic-sdk-python/tree/main/src/anthropic/_legacy_response.py) object. This is a legacy class as we're changing it slightly in the next major version.
+
+For the sync client this will mostly be the same with the exception
+of `content` & `text` will be methods instead of properties. In the
+async client, all methods will be async.
+
+A migration script will be provided & the migration in general should
+be smooth.
+
+#### `.with_streaming_response`
+
+The above interface eagerly reads the full response body when you make the request, which may not always be what you want.
+
+To stream the response body, use `.with_streaming_response` instead, which requires a context manager and only reads the response body once you call `.read()`, `.text()`, `.json()`, `.iter_bytes()`, `.iter_text()`, `.iter_lines()` or `.parse()`. In the async client, these are async methods.
+
+As such, `.with_streaming_response` methods return a different [`APIResponse`](https://github.com/anthropics/anthropic-sdk-python/tree/main/src/anthropic/_response.py) object, and the async client returns an [`AsyncAPIResponse`](https://github.com/anthropics/anthropic-sdk-python/tree/main/src/anthropic/_response.py) object.
+
+```python
+with client.messages.with_streaming_response.create(
+    max_tokens=1024,
+    messages=[
+        {
+            "role": "user",
+            "content": "Hello, Claude",
+        }
+    ],
+    model="claude-3-5-sonnet-latest",
+) as response:
+    print(response.headers.get("X-My-Header"))
+
+    for line in response.iter_lines():
+        print(line)
+```
+
+The context manager is required so that the response will reliably be closed.
+
+### Making custom/undocumented requests
+
+This library is typed for convenient access to the documented API.
+
+If you need to access undocumented endpoints, params, or response properties, the library can still be used.
+
+#### Undocumented endpoints
+
+To make requests to undocumented endpoints, you can make requests using `client.get`, `client.post`, and other
+http verbs. Options on the client will be respected (such as retries) when making this request.
+
+```py
+import httpx
+
+response = client.post(
+    "/foo",
+    cast_to=httpx.Response,
+    body={"my_param": True},
+)
+
+print(response.headers.get("x-foo"))
+```
+
+#### Undocumented request params
+
+If you want to explicitly send an extra param, you can do so with the `extra_query`, `extra_body`, and `extra_headers` request
+options.
+
+#### Undocumented response properties
+
+To access undocumented response properties, you can access the extra fields like `response.unknown_prop`. You
+can also get all the extra fields on the Pydantic model as a dict with
+[`response.model_extra`](https://docs.pydantic.dev/latest/api/base_model/#pydantic.BaseModel.model_extra).
+
+### Configuring the HTTP client
+
+You can directly override the [httpx client](https://www.python-httpx.org/api/#client) to customize it for your use case, including:
+
+- Support for [proxies](https://www.python-httpx.org/advanced/proxies/)
+- Custom [transports](https://www.python-httpx.org/advanced/transports/)
+- Additional [advanced](https://www.python-httpx.org/advanced/clients/) functionality
+
+```python
+import httpx
+from anthropic import Anthropic, DefaultHttpxClient
+
+client = Anthropic(
+    # Or use the `ANTHROPIC_BASE_URL` env var
+    base_url="http://my.test.server.example.com:8083",
+    http_client=DefaultHttpxClient(
+        proxy="http://my.test.proxy.example.com",
+        transport=httpx.HTTPTransport(local_address="0.0.0.0"),
+    ),
+)
+```
+
+You can also customize the client on a per-request basis by using `with_options()`:
+
+```python
+client.with_options(http_client=DefaultHttpxClient(...))
+```
+
+### Managing HTTP resources
+
+By default the library closes underlying HTTP connections whenever the client is [garbage collected](https://docs.python.org/3/reference/datamodel.html#object.__del__). You can manually close the client using the `.close()` method if desired, or with a context manager that closes when exiting.
+
+```py
+from anthropic import Anthropic
+
+with Anthropic() as client:
+  # make requests here
+  ...
+
+# HTTP client is now closed
+```
+
+## Versioning
+
+This package generally follows [SemVer](https://semver.org/spec/v2.0.0.html) conventions, though certain backwards-incompatible changes may be released as minor versions:
+
+1. Changes that only affect static types, without breaking runtime behavior.
+2. Changes to library internals which are technically public but not intended or documented for external use. _(Please open a GitHub issue to let us know if you are relying on such internals.)_
+3. Changes that we do not expect to impact the vast majority of users in practice.
+
+We take backwards-compatibility seriously and work hard to ensure you can rely on a smooth upgrade experience.
+
+We are keen for your feedback; please open an [issue](https://www.github.com/anthropics/anthropic-sdk-python/issues) with questions, bugs, or suggestions.
+
+### Determining the installed version
+
+If you've upgraded to the latest version but aren't seeing any new features you were expecting then your python environment is likely still using an older version.
+
+You can determine the version that is being used at runtime with:
+
+```py
+import anthropic
+print(anthropic.__version__)
+```
+
+## Requirements
+
+Python 3.8 or higher.
+
+## Contributing
+
+See [the contributing documentation](./CONTRIBUTING.md).
+```
+
+
+MCP Python SDK README:
+# MCP Python SDK
+
+<div align="center">
+
+<strong>Python implementation of the Model Context Protocol (MCP)</strong>
+
+[![PyPI][pypi-badge]][pypi-url]
+[![MIT licensed][mit-badge]][mit-url]
+[![Python Version][python-badge]][python-url]
+[![Documentation][docs-badge]][docs-url]
+[![Specification][spec-badge]][spec-url]
+[![GitHub Discussions][discussions-badge]][discussions-url]
+
+</div>
+
+<!-- omit in toc -->
+## Table of Contents
+
+- [MCP Python SDK](#mcp-python-sdk)
+  - [Overview](#overview)
+  - [Installation](#installation)
+    - [Adding MCP to your python project](#adding-mcp-to-your-python-project)
+    - [Running the standalone MCP development tools](#running-the-standalone-mcp-development-tools)
+  - [Quickstart](#quickstart)
+  - [What is MCP?](#what-is-mcp)
+  - [Core Concepts](#core-concepts)
+    - [Server](#server)
+    - [Resources](#resources)
+    - [Tools](#tools)
+    - [Prompts](#prompts)
+    - [Images](#images)
+    - [Context](#context)
+  - [Running Your Server](#running-your-server)
+    - [Development Mode](#development-mode)
+    - [Claude Desktop Integration](#claude-desktop-integration)
+    - [Direct Execution](#direct-execution)
+    - [Mounting to an Existing ASGI Server](#mounting-to-an-existing-asgi-server)
+  - [Examples](#examples)
+    - [Echo Server](#echo-server)
+    - [SQLite Explorer](#sqlite-explorer)
+  - [Advanced Usage](#advanced-usage)
+    - [Low-Level Server](#low-level-server)
+    - [Writing MCP Clients](#writing-mcp-clients)
+    - [MCP Primitives](#mcp-primitives)
+    - [Server Capabilities](#server-capabilities)
+  - [Documentation](#documentation)
+  - [Contributing](#contributing)
+  - [License](#license)
+
+[pypi-badge]: https://img.shields.io/pypi/v/mcp.svg
+[pypi-url]: https://pypi.org/project/mcp/
+[mit-badge]: https://img.shields.io/pypi/l/mcp.svg
+[mit-url]: https://github.com/modelcontextprotocol/python-sdk/blob/main/LICENSE
+[python-badge]: https://img.shields.io/pypi/pyversions/mcp.svg
+[python-url]: https://www.python.org/downloads/
+[docs-badge]: https://img.shields.io/badge/docs-modelcontextprotocol.io-blue.svg
+[docs-url]: https://modelcontextprotocol.io
+[spec-badge]: https://img.shields.io/badge/spec-spec.modelcontextprotocol.io-blue.svg
+[spec-url]: https://spec.modelcontextprotocol.io
+[discussions-badge]: https://img.shields.io/github/discussions/modelcontextprotocol/python-sdk
+[discussions-url]: https://github.com/modelcontextprotocol/python-sdk/discussions
+
+## Overview
+
+The Model Context Protocol allows applications to provide context for LLMs in a standardized way, separating the concerns of providing context from the actual LLM interaction. This Python SDK implements the full MCP specification, making it easy to:
+
+- Build MCP clients that can connect to any MCP server
+- Create MCP servers that expose resources, prompts and tools
+- Use standard transports like stdio and SSE
+- Handle all MCP protocol messages and lifecycle events
+
+## Installation
+
+### Adding MCP to your Python project
+
+We recommend using [uv](https://docs.astral.sh/uv/) to manage your Python projects.
+
+If you haven't created a uv-managed project yet, create one:
+
+```bash
+uv init mcp-server-demo
+cd mcp-server-demo
+```
+
+Then add MCP to your project dependencies:
+
+```bash
+uv add "mcp[cli]"
+```
+
+Alternatively, for projects using pip for dependencies:
+```bash
+pip install "mcp[cli]"
+```
+
+### Running the standalone MCP development tools
+
+To run the mcp command with uv:
+
+```bash
+uv run mcp
+```
+
+## Quickstart
+
+Let's create a simple MCP server that exposes a calculator tool and some data:
+
+```python
+# server.py
+from mcp.server.fastmcp import FastMCP
+
+# Create an MCP server
+mcp = FastMCP("Demo")
+
+
+# Add an addition tool
+@mcp.tool()
+def add(a: int, b: int) -> int:
+    """Add two numbers"""
+    return a + b
+
+
+# Add a dynamic greeting resource
+@mcp.resource("greeting://{name}")
+def get_greeting(name: str) -> str:
+    """Get a personalized greeting"""
+    return f"Hello, {name}!"
+```
+
+You can install this server in [Claude Desktop](https://claude.ai/download) and interact with it right away by running:
+```bash
+mcp install server.py
+```
+
+Alternatively, you can test it with the MCP Inspector:
+```bash
+mcp dev server.py
+```
+
+## What is MCP?
+
+The [Model Context Protocol (MCP)](https://modelcontextprotocol.io) lets you build servers that expose data and functionality to LLM applications in a secure, standardized way. Think of it like a web API, but specifically designed for LLM interactions. MCP servers can:
+
+- Expose data through **Resources** (think of these sort of like GET endpoints; they are used to load information into the LLM's context)
+- Provide functionality through **Tools** (sort of like POST endpoints; they are used to execute code or otherwise produce a side effect)
+- Define interaction patterns through **Prompts** (reusable templates for LLM interactions)
+- And more!
+
+## Core Concepts
+
+### Server
+
+The FastMCP server is your core interface to the MCP protocol. It handles connection management, protocol compliance, and message routing:
+
+```python
+# Add lifespan support for startup/shutdown with strong typing
+from contextlib import asynccontextmanager
+from collections.abc import AsyncIterator
+from dataclasses import dataclass
+
+from fake_database import Database  # Replace with your actual DB type
+
+from mcp.server.fastmcp import Context, FastMCP
+
+# Create a named server
+mcp = FastMCP("My App")
+
+# Specify dependencies for deployment and development
+mcp = FastMCP("My App", dependencies=["pandas", "numpy"])
+
+
+@dataclass
+class AppContext:
+    db: Database
+
+
+@asynccontextmanager
+async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]:
+    """Manage application lifecycle with type-safe context"""
+    # Initialize on startup
+    db = await Database.connect()
+    try:
+        yield AppContext(db=db)
+    finally:
+        # Cleanup on shutdown
+        await db.disconnect()
+
+
+# Pass lifespan to server
+mcp = FastMCP("My App", lifespan=app_lifespan)
+
+
+# Access type-safe lifespan context in tools
+@mcp.tool()
+def query_db(ctx: Context) -> str:
+    """Tool that uses initialized resources"""
+    db = ctx.request_context.lifespan_context.db  # AppContext is a dataclass, not a dict
+    return db.query()
+```
+
+### Resources
+
+Resources are how you expose data to LLMs. They're similar to GET endpoints in a REST API - they provide data but shouldn't perform significant computation or have side effects:
+
+```python
+from mcp.server.fastmcp import FastMCP
+
+mcp = FastMCP("My App")
+
+
+@mcp.resource("config://app")
+def get_config() -> str:
+    """Static configuration data"""
+    return "App configuration here"
+
+
+@mcp.resource("users://{user_id}/profile")
+def get_user_profile(user_id: str) -> str:
+    """Dynamic user data"""
+    return f"Profile data for user {user_id}"
+```
+
+### Tools
+
+Tools let LLMs take actions through your server. Unlike resources, tools are expected to perform computation and have side effects:
+
+```python
+import httpx
+from mcp.server.fastmcp import FastMCP
+
+mcp = FastMCP("My App")
+
+
+@mcp.tool()
+def calculate_bmi(weight_kg: float, height_m: float) -> float:
+    """Calculate BMI given weight in kg and height in meters"""
+    return weight_kg / (height_m**2)
+
+
+@mcp.tool()
+async def fetch_weather(city: str) -> str:
+    """Fetch current weather for a city"""
+    async with httpx.AsyncClient() as client:
+        response = await client.get(f"https://api.weather.com/{city}")
+        return response.text
+```
+
+### Prompts
+
+Prompts are reusable templates that help LLMs interact with your server effectively:
+
+```python
+from mcp.server.fastmcp import FastMCP
+from mcp.server.fastmcp.prompts import base
+
+mcp = FastMCP("My App")
+
+
+@mcp.prompt()
+def review_code(code: str) -> str:
+    return f"Please review this code:\n\n{code}"
+
+
+@mcp.prompt()
+def debug_error(error: str) -> list[base.Message]:
+    return [
+        base.UserMessage("I'm seeing this error:"),
+        base.UserMessage(error),
+        base.AssistantMessage("I'll help debug that. What have you tried so far?"),
+    ]
+```
+
+### Images
+
+FastMCP provides an `Image` class that automatically handles image data:
+
+```python
+import io
+
+from mcp.server.fastmcp import FastMCP, Image
+from PIL import Image as PILImage
+
+mcp = FastMCP("My App")
+
+
+@mcp.tool()
+def create_thumbnail(image_path: str) -> Image:
+    """Create a thumbnail from an image"""
+    img = PILImage.open(image_path)
+    img.thumbnail((100, 100))
+    # Encode to PNG; img.tobytes() would be raw pixel data, not a valid PNG payload
+    buffer = io.BytesIO()
+    img.save(buffer, format="PNG")
+    return Image(data=buffer.getvalue(), format="png")
+```
+
+### Context
+
+The Context object gives your tools and resources access to MCP capabilities:
+
+```python
+from mcp.server.fastmcp import FastMCP, Context
+
+mcp = FastMCP("My App")
+
+
+@mcp.tool()
+async def long_task(files: list[str], ctx: Context) -> str:
+    """Process multiple files with progress tracking"""
+    for i, file in enumerate(files):
+        ctx.info(f"Processing {file}")
+        await ctx.report_progress(i, len(files))
+        data, mime_type = await ctx.read_resource(f"file://{file}")
+    return "Processing complete"
+```
+
+## Running Your Server
+
+### Development Mode
+
+The fastest way to test and debug your server is with the MCP Inspector:
+
+```bash
+mcp dev server.py
+
+# Add dependencies
+mcp dev server.py --with pandas --with numpy
+
+# Mount local code
+mcp dev server.py --with-editable .
+```
+
+### Claude Desktop Integration
+
+Once your server is ready, install it in Claude Desktop:
+
+```bash
+mcp install server.py
+
+# Custom name
+mcp install server.py --name "My Analytics Server"
+
+# Environment variables
+mcp install server.py -v API_KEY=abc123 -v DB_URL=postgres://...
+mcp install server.py -f .env
+```
+
+### Direct Execution
+
+For advanced scenarios like custom deployments:
+
+```python
+from mcp.server.fastmcp import FastMCP
+
+mcp = FastMCP("My App")
+
+if __name__ == "__main__":
+    mcp.run()
+```
+
+Run it with:
+```bash
+python server.py
+# or
+mcp run server.py
+```
+
+### Mounting to an Existing ASGI Server
+
+You can mount the SSE server to an existing ASGI server using the `sse_app` method. This allows you to integrate the SSE server with other ASGI applications.
+
+```python
+from starlette.applications import Starlette
+from starlette.routing import Mount, Host
+from mcp.server.fastmcp import FastMCP
+
+
+mcp = FastMCP("My App")
+
+# Mount the SSE server to the existing ASGI server
+app = Starlette(
+    routes=[
+        Mount('/', app=mcp.sse_app()),
+    ]
+)
+
+# or dynamically mount as host
+app.router.routes.append(Host('mcp.acme.corp', app=mcp.sse_app()))
+```
+
+For more information on mounting applications in Starlette, see the [Starlette documentation](https://www.starlette.io/routing/#submounting-routes).
+
+## Examples
+
+### Echo Server
+
+A simple server demonstrating resources, tools, and prompts:
+
+```python
+from mcp.server.fastmcp import FastMCP
+
+mcp = FastMCP("Echo")
+
+
+@mcp.resource("echo://{message}")
+def echo_resource(message: str) -> str:
+    """Echo a message as a resource"""
+    return f"Resource echo: {message}"
+
+
+@mcp.tool()
+def echo_tool(message: str) -> str:
+    """Echo a message as a tool"""
+    return f"Tool echo: {message}"
+
+
+@mcp.prompt()
+def echo_prompt(message: str) -> str:
+    """Create an echo prompt"""
+    return f"Please process this message: {message}"
+```
+
+### SQLite Explorer
+
+A more complex example showing database integration:
+
+```python
+import sqlite3
+
+from mcp.server.fastmcp import FastMCP
+
+mcp = FastMCP("SQLite Explorer")
+
+
+@mcp.resource("schema://main")
+def get_schema() -> str:
+    """Provide the database schema as a resource"""
+    conn = sqlite3.connect("database.db")
+    schema = conn.execute("SELECT sql FROM sqlite_master WHERE type='table'").fetchall()
+    return "\n".join(sql[0] for sql in schema if sql[0])
+
+
+@mcp.tool()
+def query_data(sql: str) -> str:
+    """Execute SQL queries safely"""
+    conn = sqlite3.connect("database.db")
+    try:
+        result = conn.execute(sql).fetchall()
+        return "\n".join(str(row) for row in result)
+    except Exception as e:
+        return f"Error: {str(e)}"
+```
+
+## Advanced Usage
+
+### Low-Level Server
+
+For more control, you can use the low-level server implementation directly. This gives you full access to the protocol and allows you to customize every aspect of your server, including lifecycle management through the lifespan API:
+
+```python
+from contextlib import asynccontextmanager
+from collections.abc import AsyncIterator
+
+from fake_database import Database  # Replace with your actual DB type
+
+from mcp.server import Server
+
+
+@asynccontextmanager
+async def server_lifespan(server: Server) -> AsyncIterator[dict]:
+    """Manage server startup and shutdown lifecycle."""
+    # Initialize resources on startup
+    db = await Database.connect()
+    try:
+        yield {"db": db}
+    finally:
+        # Clean up on shutdown
+        await db.disconnect()
+
+
+# Pass lifespan to server
+server = Server("example-server", lifespan=server_lifespan)
+
+
+# Access lifespan context in handlers
+@server.call_tool()
+async def query_db(name: str, arguments: dict) -> list:
+    ctx = server.request_context
+    db = ctx.lifespan_context["db"]
+    return await db.query(arguments["query"])
+```
+
+The lifespan API provides:
+- A way to initialize resources when the server starts and clean them up when it stops
+- Access to initialized resources through the request context in handlers
+- Type-safe context passing between lifespan and request handlers
+
+```python
+import mcp.server.stdio
+import mcp.types as types
+from mcp.server.lowlevel import NotificationOptions, Server
+from mcp.server.models import InitializationOptions
+
+# Create a server instance
+server = Server("example-server")
+
+
+@server.list_prompts()
+async def handle_list_prompts() -> list[types.Prompt]:
+    return [
+        types.Prompt(
+            name="example-prompt",
+            description="An example prompt template",
+            arguments=[
+                types.PromptArgument(
+                    name="arg1", description="Example argument", required=True
+                )
+            ],
+        )
+    ]
+
+
+@server.get_prompt()
+async def handle_get_prompt(
+    name: str, arguments: dict[str, str] | None
+) -> types.GetPromptResult:
+    if name != "example-prompt":
+        raise ValueError(f"Unknown prompt: {name}")
+
+    return types.GetPromptResult(
+        description="Example prompt",
+        messages=[
+            types.PromptMessage(
+                role="user",
+                content=types.TextContent(type="text", text="Example prompt text"),
+            )
+        ],
+    )
+
+
+async def run():
+    async with mcp.server.stdio.stdio_server() as (read_stream, write_stream):
+        await server.run(
+            read_stream,
+            write_stream,
+            InitializationOptions(
+                server_name="example",
+                server_version="0.1.0",
+                capabilities=server.get_capabilities(
+                    notification_options=NotificationOptions(),
+                    experimental_capabilities={},
+                ),
+            ),
+        )
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    asyncio.run(run())
+```
+
+### Writing MCP Clients
+
+The SDK provides a high-level client interface for connecting to MCP servers:
+
+```python
+from mcp import ClientSession, StdioServerParameters, types
+from mcp.client.stdio import stdio_client
+
+# Create server parameters for stdio connection
+server_params = StdioServerParameters(
+    command="python",  # Executable
+    args=["example_server.py"],  # Optional command line arguments
+    env=None,  # Optional environment variables
+)
+
+
+# Optional: create a sampling callback
+async def handle_sampling_message(
+    message: types.CreateMessageRequestParams,
+) -> types.CreateMessageResult:
+    return types.CreateMessageResult(
+        role="assistant",
+        content=types.TextContent(
+            type="text",
+            text="Hello, world! from model",
+        ),
+        model="gpt-3.5-turbo",
+        stopReason="endTurn",
+    )
+
+
+async def run():
+    async with stdio_client(server_params) as (read, write):
+        async with ClientSession(
+            read, write, sampling_callback=handle_sampling_message
+        ) as session:
+            # Initialize the connection
+            await session.initialize()
+
+            # List available prompts
+            prompts = await session.list_prompts()
+
+            # Get a prompt
+            prompt = await session.get_prompt(
+                "example-prompt", arguments={"arg1": "value"}
+            )
+
+            # List available resources
+            resources = await session.list_resources()
+
+            # List available tools
+            tools = await session.list_tools()
+
+            # Read a resource
+            content, mime_type = await session.read_resource("file://some/path")
+
+            # Call a tool
+            result = await session.call_tool("tool-name", arguments={"arg1": "value"})
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    asyncio.run(run())
+```
+
+### MCP Primitives
+
+The MCP protocol defines three core primitives that servers can implement:
+
+| Primitive | Control               | Description                                         | Example Use                  |
+|-----------|-----------------------|-----------------------------------------------------|------------------------------|
+| Prompts   | User-controlled       | Interactive templates invoked by user choice        | Slash commands, menu options |
+| Resources | Application-controlled| Contextual data managed by the client application   | File contents, API responses |
+| Tools     | Model-controlled      | Functions exposed to the LLM to take actions        | API calls, data updates      |
+
+### Server Capabilities
+
+MCP servers declare capabilities during initialization:
+
+| Capability  | Feature Flag                 | Description                        |
+|-------------|------------------------------|------------------------------------|
+| `prompts`   | `listChanged`                | Prompt template management         |
+| `resources` | `subscribe`<br/>`listChanged`| Resource exposure and updates      |
+| `tools`     | `listChanged`                | Tool discovery and execution       |
+| `logging`   | -                            | Server logging configuration       |
+| `completion`| -                            | Argument completion suggestions    |
+
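+To see what a particular server declares at runtime, the client interface shown earlier can inspect the result of `initialize()`. This is a minimal sketch; it assumes the initialization result exposes a `capabilities` field with one optional entry per capability in the table above.
+
+```python
+from mcp import ClientSession, StdioServerParameters
+from mcp.client.stdio import stdio_client
+
+server_params = StdioServerParameters(command="python", args=["example_server.py"])
+
+
+async def show_capabilities():
+    async with stdio_client(server_params) as (read, write):
+        async with ClientSession(read, write) as session:
+            init_result = await session.initialize()
+            caps = init_result.capabilities
+            # A capability field is None when the server does not declare it
+            print("prompts:", caps.prompts is not None)
+            print("resources:", caps.resources is not None)
+            print("tools:", caps.tools is not None)
+            print("logging:", caps.logging is not None)
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    asyncio.run(show_capabilities())
+```
+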
+## Documentation
+
+- [Model Context Protocol documentation](https://modelcontextprotocol.io)
+- [Model Context Protocol specification](https://spec.modelcontextprotocol.io)
+- [Officially supported servers](https://github.com/modelcontextprotocol/servers)
+
+## Contributing
+
+We are passionate about supporting contributors of all levels of experience and would love to see you get involved in the project. See the [contributing guide](CONTRIBUTING.md) to get started.
+
+## License
+
+This project is licensed under the MIT License - see the LICENSE file for details.
+
+
+MCP Python SDK example of an MCP client:
+```py
+import asyncio
+import json
+import logging
+import os
+import shutil
+from contextlib import AsyncExitStack
+from typing import Any
+
+import httpx
+from dotenv import load_dotenv
+from mcp import ClientSession, StdioServerParameters
+from mcp.client.stdio import stdio_client
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+
+class Configuration:
+    """Manages configuration and environment variables for the MCP client."""
+
+    def __init__(self) -> None:
+        """Initialize configuration with environment variables."""
+        self.load_env()
+        self.api_key = os.getenv("LLM_API_KEY")
+
+    @staticmethod
+    def load_env() -> None:
+        """Load environment variables from .env file."""
+        load_dotenv()
+
+    @staticmethod
+    def load_config(file_path: str) -> dict[str, Any]:
+        """Load server configuration from JSON file.
+
+        Args:
+            file_path: Path to the JSON configuration file.
+
+        Returns:
+            Dict containing server configuration.
+
+        Raises:
+            FileNotFoundError: If configuration file doesn't exist.
+            JSONDecodeError: If configuration file is invalid JSON.
+        """
+        with open(file_path, "r") as f:
+            return json.load(f)
+
+    @property
+    def llm_api_key(self) -> str:
+        """Get the LLM API key.
+
+        Returns:
+            The API key as a string.
+
+        Raises:
+            ValueError: If the API key is not found in environment variables.
+        """
+        if not self.api_key:
+            raise ValueError("LLM_API_KEY not found in environment variables")
+        return self.api_key
+
+
+class Server:
+    """Manages MCP server connections and tool execution."""
+
+    def __init__(self, name: str, config: dict[str, Any]) -> None:
+        self.name: str = name
+        self.config: dict[str, Any] = config
+        self.stdio_context: Any | None = None
+        self.session: ClientSession | None = None
+        self._cleanup_lock: asyncio.Lock = asyncio.Lock()
+        self.exit_stack: AsyncExitStack = AsyncExitStack()
+
+    async def initialize(self) -> None:
+        """Initialize the server connection."""
+        command = (
+            shutil.which("npx")
+            if self.config["command"] == "npx"
+            else self.config["command"]
+        )
+        if command is None:
+            raise ValueError("The command must be a valid string and cannot be None.")
+
+        server_params = StdioServerParameters(
+            command=command,
+            args=self.config["args"],
+            env={**os.environ, **self.config["env"]}
+            if self.config.get("env")
+            else None,
+        )
+        try:
+            stdio_transport = await self.exit_stack.enter_async_context(
+                stdio_client(server_params)
+            )
+            read, write = stdio_transport
+            session = await self.exit_stack.enter_async_context(
+                ClientSession(read, write)
+            )
+            await session.initialize()
+            self.session = session
+        except Exception as e:
+            logging.error(f"Error initializing server {self.name}: {e}")
+            await self.cleanup()
+            raise
+
+    async def list_tools(self) -> list[Any]:
+        """List available tools from the server.
+
+        Returns:
+            A list of available tools.
+
+        Raises:
+            RuntimeError: If the server is not initialized.
+        """
+        if not self.session:
+            raise RuntimeError(f"Server {self.name} not initialized")
+
+        tools_response = await self.session.list_tools()
+        tools = []
+
+        for item in tools_response:
+            if isinstance(item, tuple) and item[0] == "tools":
+                for tool in item[1]:
+                    tools.append(Tool(tool.name, tool.description, tool.inputSchema))
+
+        return tools
+
+    async def execute_tool(
+        self,
+        tool_name: str,
+        arguments: dict[str, Any],
+        retries: int = 2,
+        delay: float = 1.0,
+    ) -> Any:
+        """Execute a tool with retry mechanism.
+
+        Args:
+            tool_name: Name of the tool to execute.
+            arguments: Tool arguments.
+            retries: Number of retry attempts.
+            delay: Delay between retries in seconds.
+
+        Returns:
+            Tool execution result.
+
+        Raises:
+            RuntimeError: If server is not initialized.
+            Exception: If tool execution fails after all retries.
+        """
+        if not self.session:
+            raise RuntimeError(f"Server {self.name} not initialized")
+
+        attempt = 0
+        while attempt < retries:
+            try:
+                logging.info(f"Executing {tool_name}...")
+                result = await self.session.call_tool(tool_name, arguments)
+
+                return result
+
+            except Exception as e:
+                attempt += 1
+                logging.warning(
+                    f"Error executing tool: {e}. Attempt {attempt} of {retries}."
+                )
+                if attempt < retries:
+                    logging.info(f"Retrying in {delay} seconds...")
+                    await asyncio.sleep(delay)
+                else:
+                    logging.error("Max retries reached. Failing.")
+                    raise
+
+    async def cleanup(self) -> None:
+        """Clean up server resources."""
+        async with self._cleanup_lock:
+            try:
+                await self.exit_stack.aclose()
+                self.session = None
+                self.stdio_context = None
+            except Exception as e:
+                logging.error(f"Error during cleanup of server {self.name}: {e}")
+
+
+class Tool:
+    """Represents a tool with its properties and formatting."""
+
+    def __init__(
+        self, name: str, description: str, input_schema: dict[str, Any]
+    ) -> None:
+        self.name: str = name
+        self.description: str = description
+        self.input_schema: dict[str, Any] = input_schema
+
+    def format_for_llm(self) -> str:
+        """Format tool information for LLM.
+
+        Returns:
+            A formatted string describing the tool.
+        """
+        args_desc = []
+        if "properties" in self.input_schema:
+            for param_name, param_info in self.input_schema["properties"].items():
+                arg_desc = (
+                    f"- {param_name}: {param_info.get('description', 'No description')}"
+                )
+                if param_name in self.input_schema.get("required", []):
+                    arg_desc += " (required)"
+                args_desc.append(arg_desc)
+
+        return f"""
+Tool: {self.name}
+Description: {self.description}
+Arguments:
+{chr(10).join(args_desc)}
+"""
+
+
+class LLMClient:
+    """Manages communication with the LLM provider."""
+
+    def __init__(self, api_key: str) -> None:
+        self.api_key: str = api_key
+
+    def get_response(self, messages: list[dict[str, str]]) -> str:
+        """Get a response from the LLM.
+
+        Args:
+            messages: A list of message dictionaries.
+
+        Returns:
+            The LLM's response as a string.
+
+        Raises:
+            httpx.RequestError: If the request to the LLM fails.
+        """
+        url = "https://api.groq.com/openai/v1/chat/completions"
+
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}",
+        }
+        payload = {
+            "messages": messages,
+            "model": "llama-3.2-90b-vision-preview",
+            "temperature": 0.7,
+            "max_tokens": 4096,
+            "top_p": 1,
+            "stream": False,
+            "stop": None,
+        }
+
+        try:
+            with httpx.Client() as client:
+                response = client.post(url, headers=headers, json=payload)
+                response.raise_for_status()
+                data = response.json()
+                return data["choices"][0]["message"]["content"]
+
+        except httpx.RequestError as e:
+            error_message = f"Error getting LLM response: {str(e)}"
+            logging.error(error_message)
+
+            if isinstance(e, httpx.HTTPStatusError):
+                status_code = e.response.status_code
+                logging.error(f"Status code: {status_code}")
+                logging.error(f"Response details: {e.response.text}")
+
+            return (
+                f"I encountered an error: {error_message}. "
+                "Please try again or rephrase your request."
+            )
+
+
+class ChatSession:
+    """Orchestrates the interaction between user, LLM, and tools."""
+
+    def __init__(self, servers: list[Server], llm_client: LLMClient) -> None:
+        self.servers: list[Server] = servers
+        self.llm_client: LLMClient = llm_client
+
+    async def cleanup_servers(self) -> None:
+        """Clean up all servers properly."""
+        cleanup_tasks = []
+        for server in self.servers:
+            cleanup_tasks.append(asyncio.create_task(server.cleanup()))
+
+        if cleanup_tasks:
+            try:
+                await asyncio.gather(*cleanup_tasks, return_exceptions=True)
+            except Exception as e:
+                logging.warning(f"Warning during final cleanup: {e}")
+
+    async def process_llm_response(self, llm_response: str) -> str:
+        """Process the LLM response and execute tools if needed.
+
+        Args:
+            llm_response: The response from the LLM.
+
+        Returns:
+            The result of tool execution or the original response.
+        """
+
+        try:
+            tool_call = json.loads(llm_response)
+            if "tool" in tool_call and "arguments" in tool_call:
+                logging.info(f"Executing tool: {tool_call['tool']}")
+                logging.info(f"With arguments: {tool_call['arguments']}")
+
+                for server in self.servers:
+                    tools = await server.list_tools()
+                    if any(tool.name == tool_call["tool"] for tool in tools):
+                        try:
+                            result = await server.execute_tool(
+                                tool_call["tool"], tool_call["arguments"]
+                            )
+
+                            if isinstance(result, dict) and "progress" in result:
+                                progress = result["progress"]
+                                total = result["total"]
+                                percentage = (progress / total) * 100
+                                logging.info(
+                                    f"Progress: {progress}/{total} "
+                                    f"({percentage:.1f}%)"
+                                )
+
+                            return f"Tool execution result: {result}"
+                        except Exception as e:
+                            error_msg = f"Error executing tool: {str(e)}"
+                            logging.error(error_msg)
+                            return error_msg
+
+                return f"No server found with tool: {tool_call['tool']}"
+            return llm_response
+        except json.JSONDecodeError:
+            return llm_response
+
+    async def start(self) -> None:
+        """Main chat session handler."""
+        try:
+            for server in self.servers:
+                try:
+                    await server.initialize()
+                except Exception as e:
+                    logging.error(f"Failed to initialize server: {e}")
+                    await self.cleanup_servers()
+                    return
+
+            all_tools = []
+            for server in self.servers:
+                tools = await server.list_tools()
+                all_tools.extend(tools)
+
+            tools_description = "\n".join([tool.format_for_llm() for tool in all_tools])
+
+            system_message = (
+                "You are a helpful assistant with access to these tools:\n\n"
+                f"{tools_description}\n"
+                "Choose the appropriate tool based on the user's question. "
+                "If no tool is needed, reply directly.\n\n"
+                "IMPORTANT: When you need to use a tool, you must ONLY respond with "
+                "the exact JSON object format below, nothing else:\n"
+                "{\n"
+                '    "tool": "tool-name",\n'
+                '    "arguments": {\n'
+                '        "argument-name": "value"\n'
+                "    }\n"
+                "}\n\n"
+                "After receiving a tool's response:\n"
+                "1. Transform the raw data into a natural, conversational response\n"
+                "2. Keep responses concise but informative\n"
+                "3. Focus on the most relevant information\n"
+                "4. Use appropriate context from the user's question\n"
+                "5. Avoid simply repeating the raw data\n\n"
+                "Please use only the tools that are explicitly defined above."
+            )
+
+            messages = [{"role": "system", "content": system_message}]
+
+            while True:
+                try:
+                    user_input = input("You: ").strip()
+                    if user_input.lower() in ["quit", "exit"]:
+                        logging.info("\nExiting...")
+                        break
+
+                    messages.append({"role": "user", "content": user_input})
+
+                    llm_response = self.llm_client.get_response(messages)
+                    logging.info("\nAssistant: %s", llm_response)
+
+                    result = await self.process_llm_response(llm_response)
+
+                    if result != llm_response:
+                        messages.append({"role": "assistant", "content": llm_response})
+                        messages.append({"role": "system", "content": result})
+
+                        final_response = self.llm_client.get_response(messages)
+                        logging.info("\nFinal response: %s", final_response)
+                        messages.append(
+                            {"role": "assistant", "content": final_response}
+                        )
+                    else:
+                        messages.append({"role": "assistant", "content": llm_response})
+
+                except KeyboardInterrupt:
+                    logging.info("\nExiting...")
+                    break
+
+        finally:
+            await self.cleanup_servers()
+
+
+async def main() -> None:
+    """Initialize and run the chat session."""
+    config = Configuration()
+    server_config = config.load_config("servers_config.json")
+    servers = [
+        Server(name, srv_config)
+        for name, srv_config in server_config["mcpServers"].items()
+    ]
+    llm_client = LLMClient(config.llm_api_key)
+    chat_session = ChatSession(servers, llm_client)
+    await chat_session.start()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+
+
+
+JSON schema for Claude Code tools available via MCP:
+```json
+{
+    "jsonrpc": "2.0",
+    "id": 1,
+    "result": {
+        "tools": [
+            {
+                "name": "dispatch_agent",
+                "description": "Launch a new task",
+                "inputSchema": {
+                    "type": "object",
+                    "properties": {
+                        "prompt": {
+                            "type": "string",
+                            "description": "The task for the agent to perform"
+                        }
+                    },
+                    "required": [
+                        "prompt"
+                    ],
+                    "additionalProperties": false,
+                    "$schema": "http://json-schema.org/draft-07/schema#"
+                }
+            },
+            {
+                "name": "Bash",
+                "description": "Run shell command",
+                "inputSchema": {
+                    "type": "object",
+                    "properties": {
+                        "command": {
+                            "type": "string",
+                            "description": "The command to execute"
+                        },
+                        "timeout": {
+                            "type": "number",
+                            "description": "Optional timeout in milliseconds (max 600000)"
+                        },
+                        "description": {
+                            "type": "string",
+                            "description": " Clear, concise description of what this command does in 5-10 words. Examples:\nInput: ls\nOutput: Lists files in current directory\n\nInput: git status\nOutput: Shows working tree status\n\nInput: npm install\nOutput: Installs package dependencies\n\nInput: mkdir foo\nOutput: Creates directory 'foo'"
+                        }
+                    },
+                    "required": [
+                        "command"
+                    ],
+                    "additionalProperties": false,
+                    "$schema": "http://json-schema.org/draft-07/schema#"
+                }
+            },
+            {
+                "name": "BatchTool",

crates/assistant_tools/src/edit_agent/evals/fixtures/zode/react.py 🔗

@@ -0,0 +1,14 @@
+class InputCell:
+    def __init__(self, initial_value):
+        self.value = None
+
+
+class ComputeCell:
+    def __init__(self, inputs, compute_function):
+        self.value = None
+
+    def add_callback(self, callback):
+        pass
+
+    def remove_callback(self, callback):
+        pass

crates/assistant_tools/src/edit_agent/evals/fixtures/zode/react_test.py 🔗

@@ -0,0 +1,271 @@
+# These tests are auto-generated with test data from:
+# https://github.com/exercism/problem-specifications/tree/main/exercises/react/canonical-data.json
+# File last updated on 2023-07-19
+
+from functools import partial
+import unittest
+
+from react import (
+    InputCell,
+    ComputeCell,
+)
+
+
+class ReactTest(unittest.TestCase):
+    def test_input_cells_have_a_value(self):
+        input = InputCell(10)
+        self.assertEqual(input.value, 10)
+
+    def test_an_input_cell_s_value_can_be_set(self):
+        input = InputCell(4)
+        input.value = 20
+        self.assertEqual(input.value, 20)
+
+    def test_compute_cells_calculate_initial_value(self):
+        input = InputCell(1)
+        output = ComputeCell(
+            [
+                input,
+            ],
+            lambda inputs: inputs[0] + 1,
+        )
+        self.assertEqual(output.value, 2)
+
+    def test_compute_cells_take_inputs_in_the_right_order(self):
+        one = InputCell(1)
+        two = InputCell(2)
+        output = ComputeCell(
+            [
+                one,
+                two,
+            ],
+            lambda inputs: inputs[0] + inputs[1] * 10,
+        )
+        self.assertEqual(output.value, 21)
+
+    def test_compute_cells_update_value_when_dependencies_are_changed(self):
+        input = InputCell(1)
+        output = ComputeCell(
+            [
+                input,
+            ],
+            lambda inputs: inputs[0] + 1,
+        )
+        input.value = 3
+        self.assertEqual(output.value, 4)
+
+    def test_compute_cells_can_depend_on_other_compute_cells(self):
+        input = InputCell(1)
+        times_two = ComputeCell(
+            [
+                input,
+            ],
+            lambda inputs: inputs[0] * 2,
+        )
+        times_thirty = ComputeCell(
+            [
+                input,
+            ],
+            lambda inputs: inputs[0] * 30,
+        )
+        output = ComputeCell(
+            [
+                times_two,
+                times_thirty,
+            ],
+            lambda inputs: inputs[0] + inputs[1],
+        )
+        self.assertEqual(output.value, 32)
+        input.value = 3
+        self.assertEqual(output.value, 96)
+
+    def test_compute_cells_fire_callbacks(self):
+        input = InputCell(1)
+        output = ComputeCell(
+            [
+                input,
+            ],
+            lambda inputs: inputs[0] + 1,
+        )
+        cb1_observer = []
+        callback1 = self.callback_factory(cb1_observer)
+        output.add_callback(callback1)
+        input.value = 3
+        self.assertEqual(cb1_observer[-1], 4)
+
+    def test_callback_cells_only_fire_on_change(self):
+        input = InputCell(1)
+        output = ComputeCell([input], lambda inputs: 111 if inputs[0] < 3 else 222)
+        cb1_observer = []
+        callback1 = self.callback_factory(cb1_observer)
+        output.add_callback(callback1)
+        input.value = 2
+        self.assertEqual(cb1_observer, [])
+        input.value = 4
+        self.assertEqual(cb1_observer[-1], 222)
+
+    def test_callbacks_do_not_report_already_reported_values(self):
+        input = InputCell(1)
+        output = ComputeCell(
+            [
+                input,
+            ],
+            lambda inputs: inputs[0] + 1,
+        )
+        cb1_observer = []
+        callback1 = self.callback_factory(cb1_observer)
+        output.add_callback(callback1)
+        input.value = 2
+        self.assertEqual(cb1_observer[-1], 3)
+        input.value = 3
+        self.assertEqual(cb1_observer[-1], 4)
+
+    def test_callbacks_can_fire_from_multiple_cells(self):
+        input = InputCell(1)
+        plus_one = ComputeCell(
+            [
+                input,
+            ],
+            lambda inputs: inputs[0] + 1,
+        )
+        minus_one = ComputeCell(
+            [
+                input,
+            ],
+            lambda inputs: inputs[0] - 1,
+        )
+        cb1_observer = []
+        cb2_observer = []
+        callback1 = self.callback_factory(cb1_observer)
+        callback2 = self.callback_factory(cb2_observer)
+        plus_one.add_callback(callback1)
+        minus_one.add_callback(callback2)
+        input.value = 10
+        self.assertEqual(cb1_observer[-1], 11)
+        self.assertEqual(cb2_observer[-1], 9)
+
+    def test_callbacks_can_be_added_and_removed(self):
+        input = InputCell(11)
+        output = ComputeCell(
+            [
+                input,
+            ],
+            lambda inputs: inputs[0] + 1,
+        )
+        cb1_observer = []
+        cb2_observer = []
+        cb3_observer = []
+        callback1 = self.callback_factory(cb1_observer)
+        callback2 = self.callback_factory(cb2_observer)
+        callback3 = self.callback_factory(cb3_observer)
+        output.add_callback(callback1)
+        output.add_callback(callback2)
+        input.value = 31
+        self.assertEqual(cb1_observer[-1], 32)
+        self.assertEqual(cb2_observer[-1], 32)
+        output.remove_callback(callback1)
+        output.add_callback(callback3)
+        input.value = 41
+        self.assertEqual(len(cb1_observer), 1)
+        self.assertEqual(cb2_observer[-1], 42)
+        self.assertEqual(cb3_observer[-1], 42)
+
+    def test_removing_a_callback_multiple_times_doesn_t_interfere_with_other_callbacks(
+        self,
+    ):
+        input = InputCell(1)
+        output = ComputeCell(
+            [
+                input,
+            ],
+            lambda inputs: inputs[0] + 1,
+        )
+        cb1_observer = []
+        cb2_observer = []
+        callback1 = self.callback_factory(cb1_observer)
+        callback2 = self.callback_factory(cb2_observer)
+        output.add_callback(callback1)
+        output.add_callback(callback2)
+        output.remove_callback(callback1)
+        output.remove_callback(callback1)
+        output.remove_callback(callback1)
+        input.value = 2
+        self.assertEqual(cb1_observer, [])
+        self.assertEqual(cb2_observer[-1], 3)
+
+    def test_callbacks_should_only_be_called_once_even_if_multiple_dependencies_change(
+        self,
+    ):
+        input = InputCell(1)
+        plus_one = ComputeCell(
+            [
+                input,
+            ],
+            lambda inputs: inputs[0] + 1,
+        )
+        minus_one1 = ComputeCell(
+            [
+                input,
+            ],
+            lambda inputs: inputs[0] - 1,
+        )
+        minus_one2 = ComputeCell(
+            [
+                minus_one1,
+            ],
+            lambda inputs: inputs[0] - 1,
+        )
+        output = ComputeCell(
+            [
+                plus_one,
+                minus_one2,
+            ],
+            lambda inputs: inputs[0] * inputs[1],
+        )
+        cb1_observer = []
+        callback1 = self.callback_factory(cb1_observer)
+        output.add_callback(callback1)
+        input.value = 4
+        self.assertEqual(cb1_observer[-1], 10)
+
+    def test_callbacks_should_not_be_called_if_dependencies_change_but_output_value_doesn_t_change(
+        self,
+    ):
+        input = InputCell(1)
+        plus_one = ComputeCell(
+            [
+                input,
+            ],
+            lambda inputs: inputs[0] + 1,
+        )
+        minus_one = ComputeCell(
+            [
+                input,
+            ],
+            lambda inputs: inputs[0] - 1,
+        )
+        always_two = ComputeCell(
+            [
+                plus_one,
+                minus_one,
+            ],
+            lambda inputs: inputs[0] - inputs[1],
+        )
+        cb1_observer = []
+        callback1 = self.callback_factory(cb1_observer)
+        always_two.add_callback(callback1)
+        input.value = 2
+        self.assertEqual(cb1_observer, [])
+        input.value = 3
+        self.assertEqual(cb1_observer, [])
+        input.value = 4
+        self.assertEqual(cb1_observer, [])
+        input.value = 5
+        self.assertEqual(cb1_observer, [])
+
+    # Utility functions.
+    def callback_factory(self, observer):
+        def callback(observer, value):
+            observer.append(value)
+
+        return partial(callback, observer)

crates/assistant_tools/src/streaming_edit_file_tool.rs 🔗

@@ -38,7 +38,7 @@ pub struct StreamingEditFileToolInput {
     /// so that we can display it immediately.
     pub display_description: String,
 
-    /// The full path of the file to modify in the project.
+    /// The full path of the file to create or modify in the project.
     ///
     /// WARNING: When specifying which file path need changing, you MUST
     /// start each path with one of the project's root directories.
@@ -58,6 +58,10 @@ pub struct StreamingEditFileToolInput {
     /// `frontend/db.js`
     /// </example>
     pub path: PathBuf,
+
+    /// If true, this tool will recreate the file from scratch.
+    /// If false, this tool will produce granular edits to an existing file.
+    pub create_or_overwrite: bool,
 }
 
 #[derive(Debug, Serialize, Deserialize, JsonSchema)]
@@ -158,7 +162,7 @@ impl Tool for StreamingEditFileTool {
         let card_clone = card.clone();
         let messages = messages.to_vec();
         let task = cx.spawn(async move |cx: &mut AsyncApp| {
-            if !exists.await? {
+            if !input.create_or_overwrite && !exists.await? {
                 return Err(anyhow!("{} not found", input.path.display()));
             }
 
@@ -182,12 +186,21 @@ impl Tool for StreamingEditFileTool {
                 })
                 .await;
 
-            let (output, mut events) = edit_agent.edit(
-                buffer.clone(),
-                input.display_description.clone(),
-                messages,
-                cx,
-            );
+            let (output, mut events) = if input.create_or_overwrite {
+                edit_agent.overwrite(
+                    buffer.clone(),
+                    input.display_description.clone(),
+                    messages,
+                    cx,
+                )
+            } else {
+                edit_agent.edit(
+                    buffer.clone(),
+                    input.display_description.clone(),
+                    messages,
+                    cx,
+                )
+            };
 
             let mut hallucinated_old_text = false;
             while let Some(event) = events.next().await {
@@ -213,7 +226,7 @@ impl Tool for StreamingEditFileTool {
                             .log_err();
                         }
                     }
-                    EditAgentOutputEvent::HallucinatedOldText(_) => hallucinated_old_text = true,
+                    EditAgentOutputEvent::OldTextNotFound(_) => hallucinated_old_text = true,
                 }
             }
             output.await?;

crates/assistant_tools/src/streaming_edit_file_tool/description.md 🔗

@@ -1,4 +1,4 @@
-This is a tool for editing files. For moving or renaming files, you should generally use the `terminal` tool with the 'mv' command instead. For larger edits, use the `create_file` tool to overwrite files.
+This is a tool for creating a new file or editing an existing file. For moving or renaming files, you should generally use the `terminal` tool with the 'mv' command instead.
 
 Before using this tool:
 

crates/assistant_tools/src/templates/create_file_prompt.hbs 🔗

@@ -0,0 +1,12 @@
+You are an expert engineer and your task is to write a new file from scratch.
+
+<file_to_edit>
+{{path}}
+</file_to_edit>
+
+<edit_description>
+{{edit_description}}
+</edit_description>
+
+You MUST respond directly with the file's content, without explanations, additional text or triple backticks.
+The text you output will be saved verbatim as the content of the file.

crates/language/src/buffer.rs 🔗

@@ -2141,6 +2141,14 @@ impl Buffer {
         self.edit([(0..self.len(), text)], None, cx)
     }
 
+    /// Appends the given text to the end of the buffer.
+    pub fn append<T>(&mut self, text: T, cx: &mut Context<Self>) -> Option<clock::Lamport>
+    where
+        T: Into<Arc<str>>,
+    {
+        self.edit([(self.len()..self.len(), text)], None, cx)
+    }
+
     /// Applies the given edits to the buffer. Each edit is specified as a range of text to
     /// delete, and a string of text to insert at that location.
     ///