@@ -3,6 +3,7 @@ use crate::{
ReadFileToolInput,
edit_file_tool::{EditFileMode, EditFileToolInput},
grep_tool::GrepToolInput,
+ list_directory_tool::ListDirectoryToolInput,
};
use Role::*;
use anyhow::anyhow;
@@ -40,8 +41,8 @@ fn eval_extract_handle_command_output() {
eval(
100,
0.95,
- EvalInput {
- conversation: vec![
+ EvalInput::from_conversation(
+ vec![
message(
User,
[text(formatdoc! {"
@@ -81,11 +82,9 @@ fn eval_extract_handle_command_output() {
)],
),
],
- input_path: input_file_path.into(),
- input_content: Some(input_file_content.into()),
- edit_description: edit_description.into(),
- assertion: EvalAssertion::assert_eq(output_file_content),
- },
+ Some(input_file_content.into()),
+ EvalAssertion::assert_eq(output_file_content),
+ ),
);
}
@@ -99,8 +98,8 @@ fn eval_delete_run_git_blame() {
eval(
100,
0.95,
- EvalInput {
- conversation: vec![
+ EvalInput::from_conversation(
+ vec![
message(
User,
[text(formatdoc! {"
@@ -137,11 +136,9 @@ fn eval_delete_run_git_blame() {
)],
),
],
- input_path: input_file_path.into(),
- input_content: Some(input_file_content.into()),
- edit_description: edit_description.into(),
- assertion: EvalAssertion::assert_eq(output_file_content),
- },
+ Some(input_file_content.into()),
+ EvalAssertion::assert_eq(output_file_content),
+ ),
);
}
@@ -154,8 +151,8 @@ fn eval_translate_doc_comments() {
eval(
200,
1.,
- EvalInput {
- conversation: vec![
+ EvalInput::from_conversation(
+ vec![
message(
User,
[text(formatdoc! {"
@@ -192,11 +189,9 @@ fn eval_translate_doc_comments() {
)],
),
],
- input_path: input_file_path.into(),
- input_content: Some(input_file_content.into()),
- edit_description: edit_description.into(),
- assertion: EvalAssertion::judge_diff("Doc comments were translated to Italian"),
- },
+ Some(input_file_content.into()),
+ EvalAssertion::judge_diff("Doc comments were translated to Italian"),
+ ),
);
}
@@ -210,8 +205,8 @@ fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
eval(
100,
0.95,
- EvalInput {
- conversation: vec![
+ EvalInput::from_conversation(
+ vec![
message(
User,
[text(formatdoc! {"
@@ -307,14 +302,12 @@ fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
)],
),
],
- input_path: input_file_path.into(),
- input_content: Some(input_file_content.into()),
- edit_description: edit_description.into(),
- assertion: EvalAssertion::judge_diff(indoc! {"
+ Some(input_file_content.into()),
+ EvalAssertion::judge_diff(indoc! {"
- The compile_parser_to_wasm method has been changed to use wasi-sdk
- ureq is used to download the SDK for current platform and architecture
"}),
- },
+ ),
);
}
@@ -325,10 +318,10 @@ fn eval_disable_cursor_blinking() {
let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
let edit_description = "Comment out the call to `BlinkManager::enable`";
eval(
- 200,
+ 100,
0.95,
- EvalInput {
- conversation: vec![
+ EvalInput::from_conversation(
+ vec![
message(User, [text("Let's research how to cursor blinking works.")]),
message(
Assistant,
@@ -382,15 +375,13 @@ fn eval_disable_cursor_blinking() {
)],
),
],
- input_path: input_file_path.into(),
- input_content: Some(input_file_content.into()),
- edit_description: edit_description.into(),
- assertion: EvalAssertion::judge_diff(indoc! {"
+ Some(input_file_content.into()),
+ EvalAssertion::judge_diff(indoc! {"
- Calls to BlinkManager in `observe_window_activation` were commented out
- The call to `blink_manager.enable` above the call to show_cursor_names was commented out
- All the edits have valid indentation
"}),
- },
+ ),
);
}
@@ -403,8 +394,8 @@ fn eval_from_pixels_constructor() {
eval(
100,
0.95,
- EvalInput {
- conversation: vec![
+ EvalInput::from_conversation(
+ vec![
message(
User,
[text(indoc! {"
@@ -576,14 +567,12 @@ fn eval_from_pixels_constructor() {
)],
),
],
- input_path: input_file_path.into(),
- input_content: Some(input_file_content.into()),
- edit_description: edit_description.into(),
- assertion: EvalAssertion::judge_diff(indoc! {"
- - The diff contains a new `from_pixels` constructor
- - The diff contains new tests for the `from_pixels` constructor
- "}),
- },
+ Some(input_file_content.into()),
+ EvalAssertion::judge_diff(indoc! {"
+ - The diff contains a new `from_pixels` constructor
+ - The diff contains new tests for the `from_pixels` constructor
+ "}),
+ ),
);
}
@@ -591,12 +580,13 @@ fn eval_from_pixels_constructor() {
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
let input_file_path = "root/zode.py";
+ let input_content = None;
let edit_description = "Create the main Zode CLI script";
eval(
200,
1.,
- EvalInput {
- conversation: vec![
+ EvalInput::from_conversation(
+ vec![
message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
message(
Assistant,
@@ -654,10 +644,8 @@ fn eval_zode() {
],
),
],
- input_path: input_file_path.into(),
- input_content: None,
- edit_description: edit_description.into(),
- assertion: EvalAssertion::new(async move |sample, _, _cx| {
+ input_content,
+ EvalAssertion::new(async move |sample, _, _cx| {
let invalid_starts = [' ', '`', '\n'];
let mut message = String::new();
for start in invalid_starts {
@@ -681,7 +669,7 @@ fn eval_zode() {
})
}
}),
- },
+ ),
);
}
@@ -694,8 +682,8 @@ fn eval_add_overwrite_test() {
eval(
200,
0.5, // TODO: make this eval better
- EvalInput {
- conversation: vec![
+ EvalInput::from_conversation(
+ vec![
message(
User,
[text(indoc! {"
@@ -899,13 +887,121 @@ fn eval_add_overwrite_test() {
],
),
],
- input_path: input_file_path.into(),
- input_content: Some(input_file_content.into()),
- edit_description: edit_description.into(),
- assertion: EvalAssertion::judge_diff(
+ Some(input_file_content.into()),
+ EvalAssertion::judge_diff(
"A new test for overwritten files was created, without changing any previous test",
),
- },
+ ),
+ );
+}
+
+#[test]
+#[ignore] // until we figure out the mystery described in the comments
+// #[cfg_attr(not(feature = "eval"), ignore)]
+fn eval_create_empty_file() {
+ // Check that the Edit Agent can create a file without writing its
+ // thoughts into it. This issue is not specific to empty files, but
+ // it's easier to reproduce with them.
+ //
+ // NOTE: For some mysterious reason, I could easily reproduce this
+ // issue roughly 90% of the time in actual Zed. However, once I
+ // extracted the exact LLM request from just before the failure point
+ // and generated from that, the reproduction rate dropped to 2%!
+ //
+ // Things I've tried to make sure it's not a fluke: disabling prompt
+ // caching, capturing the LLM request via a proxy server, and running
+ // the prompt on Claude separately from the evals. Every time the
+ // model mostly produced good outcomes, which doesn't match my actual
+ // experience in Zed.
+ //
+ // At some point I discovered that simply adding one insignificant
+ // space or newline to the prompt suddenly reproduces the failure I
+ // had been trying to capture almost every time.
+ //
+ // This weirdness happens even outside of the Zed code base and even
+ // when using a different subscription. The result is the same: an
+ // extra newline or space changes the model's behavior enough that
+ // the pass rate drops from 99% to 0-3%.
+ //
+ // I have no explanation for this.
+ //
+ //
+ // Model                          | Pass rate
+ // ============================================
+ //
+ // --------------------------------------------
+ // Prompt version: 2025-05-19
+ // --------------------------------------------
+ //
+ // claude-3.7-sonnet              | 0.98
+ // + one extra space in prompt    | 0.00
+ // + original prompt again        | 0.99
+ // + extra newline                | 0.03
+ // gemini-2.5-pro-preview-03-25   | 1.00
+ // gemini-2.5-flash-preview-04-17 | 1.00
+ // + one extra space              | 1.00
+ // gpt-4.1                        | 1.00
+ // + one extra space              | 1.00
+ //
+ //
+ // TODO: gpt-4.1-mini errored 38 times:
+ // "data did not match any variant of untagged enum ResponseStreamResult"
+ //
+ let input_file_content = None;
+ let expected_output_content = String::new();
+ eval(
+ 1,
+ 1.0,
+ EvalInput::from_conversation(
+ vec![
+ message(User, [text("Create a second empty todo file ")]),
+ message(
+ Assistant,
+ [
+ text(formatdoc! {"
+ I'll help you create a second empty todo file.
+ First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
+ "}),
+ tool_use(
+ "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
+ "list_directory",
+ ListDirectoryToolInput {
+ path: "root".to_string(),
+ },
+ ),
+ ],
+ ),
+ message(
+ User,
+ [tool_result(
+ "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
+ "list_directory",
+ "root/TODO\nroot/TODO2\nroot/new.txt\n",
+ )],
+ ),
+ message(
+ Assistant,
+ [
+ text(formatdoc! {"
+ I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
+ "}),
+ tool_use(
+ "toolu_01Tb3iQ9griqSYMmVuykQPWU",
+ "edit_file",
+ EditFileToolInput {
+ display_description: "Create empty TODO3 file".to_string(),
+ mode: EditFileMode::Create,
+ path: "root/TODO3".into(),
+ },
+ ),
+ ],
+ ),
+ ],
+ input_file_content,
+ // Bad behavior is to write something like
+ // "I'll create an empty TODO3 file as requested."
+ EvalAssertion::assert_eq(expected_output_content),
+ ),
);
}
@@ -964,12 +1060,46 @@ fn tool_result(
#[derive(Clone)]
struct EvalInput {
conversation: Vec<LanguageModelRequestMessage>,
- input_path: PathBuf,
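+ // Parsed from the `edit_file` tool use in the conversation's final
+ // assistant message; it carries the path, mode, and edit description
+ // that were previously stored as separate fields.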
+ edit_file_input: EditFileToolInput,
input_content: Option<String>,
- edit_description: String,
assertion: EvalAssertion,
}
+impl EvalInput {
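+ /// Builds an `EvalInput` from a captured conversation. The final
+ /// assistant message must contain an `edit_file` tool use; its
+ /// `EditFileToolInput` supplies the path, mode, and edit description
+ /// that drive the eval.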
+ fn from_conversation(
+ conversation: Vec<LanguageModelRequestMessage>,
+ input_content: Option<String>,
+ assertion: EvalAssertion,
+ ) -> Self {
+ let msg = conversation.last().expect("Conversation must not be empty");
+ if msg.role != Role::Assistant {
+ panic!("Conversation must end with an assistant message");
+ }
+ let tool_use = msg
+ .content
+ .iter()
+ .flat_map(|content| match content {
+ MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
+ Some(tool_use)
+ }
+ _ => None,
+ })
+ .next()
+ .expect("Conversation must end with an edit_file tool use")
+ .clone();
+
+ let edit_file_input: EditFileToolInput =
+ serde_json::from_value(tool_use.input.clone()).unwrap();
+
+ EvalInput {
+ conversation,
+ edit_file_input,
+ input_content,
+ assertion,
+ }
+ }
+}
+
#[derive(Clone)]
struct EvalSample {
text: String,
@@ -1308,7 +1438,7 @@ impl EditAgentTest {
let path = self
.project
.read_with(cx, |project, cx| {
- project.find_project_path(eval.input_path, cx)
+ project.find_project_path(eval.edit_file_input.path, cx)
})
.unwrap();
let buffer = self
@@ -1336,11 +1466,13 @@ impl EditAgentTest {
}),
..Default::default()
};
- let edit_output = if let Some(input_content) = eval.input_content.as_deref() {
- buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
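+ // In `Edit` mode, seed the buffer with the fixture content and stream
+ // edits over it; any other mode rewrites the file from scratch via
+ // `overwrite`.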
+ let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
+ if let Some(input_content) = eval.input_content.as_deref() {
+ buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
+ }
let (edit_output, _) = self.agent.edit(
buffer.clone(),
- eval.edit_description,
+ eval.edit_file_input.display_description,
&conversation,
&mut cx.to_async(),
);
@@ -1348,7 +1480,7 @@ impl EditAgentTest {
} else {
let (edit_output, _) = self.agent.overwrite(
buffer.clone(),
- eval.edit_description,
+ eval.edit_file_input.display_description,
&conversation,
&mut cx.to_async(),
);