@@ -3,6 +3,7 @@ use crate::{
ReadFileToolInput,
edit_file_tool::{EditFileMode, EditFileToolInput},
grep_tool::GrepToolInput,
+ list_directory_tool::ListDirectoryToolInput,
};
use Role::*;
use anyhow::anyhow;
@@ -40,8 +41,8 @@ fn eval_extract_handle_command_output() {
eval(
100,
0.95,
- EvalInput {
- conversation: vec![
+ EvalInput::from_conversation(
+ vec![
message(
User,
[text(formatdoc! {"
@@ -81,11 +82,9 @@ fn eval_extract_handle_command_output() {
)],
),
],
- input_path: input_file_path.into(),
- input_content: Some(input_file_content.into()),
- edit_description: edit_description.into(),
- assertion: EvalAssertion::assert_eq(output_file_content),
- },
+ Some(input_file_content.into()),
+ EvalAssertion::assert_eq(output_file_content),
+ ),
);
}
@@ -99,8 +98,8 @@ fn eval_delete_run_git_blame() {
eval(
100,
0.95,
- EvalInput {
- conversation: vec![
+ EvalInput::from_conversation(
+ vec![
message(
User,
[text(formatdoc! {"
@@ -137,11 +136,9 @@ fn eval_delete_run_git_blame() {
)],
),
],
- input_path: input_file_path.into(),
- input_content: Some(input_file_content.into()),
- edit_description: edit_description.into(),
- assertion: EvalAssertion::assert_eq(output_file_content),
- },
+ Some(input_file_content.into()),
+ EvalAssertion::assert_eq(output_file_content),
+ ),
);
}
@@ -154,8 +151,8 @@ fn eval_translate_doc_comments() {
eval(
200,
1.,
- EvalInput {
- conversation: vec![
+ EvalInput::from_conversation(
+ vec![
message(
User,
[text(formatdoc! {"
@@ -192,11 +189,9 @@ fn eval_translate_doc_comments() {
)],
),
],
- input_path: input_file_path.into(),
- input_content: Some(input_file_content.into()),
- edit_description: edit_description.into(),
- assertion: EvalAssertion::judge_diff("Doc comments were translated to Italian"),
- },
+ Some(input_file_content.into()),
+ EvalAssertion::judge_diff("Doc comments were translated to Italian"),
+ ),
);
}
@@ -210,8 +205,8 @@ fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
eval(
100,
0.95,
- EvalInput {
- conversation: vec![
+ EvalInput::from_conversation(
+ vec![
message(
User,
[text(formatdoc! {"
@@ -307,14 +302,12 @@ fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
)],
),
],
- input_path: input_file_path.into(),
- input_content: Some(input_file_content.into()),
- edit_description: edit_description.into(),
- assertion: EvalAssertion::judge_diff(indoc! {"
+ Some(input_file_content.into()),
+ EvalAssertion::judge_diff(indoc! {"
- The compile_parser_to_wasm method has been changed to use wasi-sdk
- ureq is used to download the SDK for current platform and architecture
"}),
- },
+ ),
);
}
@@ -325,10 +318,10 @@ fn eval_disable_cursor_blinking() {
let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
let edit_description = "Comment out the call to `BlinkManager::enable`";
eval(
- 200,
+ 100,
0.95,
- EvalInput {
- conversation: vec![
+ EvalInput::from_conversation(
+ vec![
message(User, [text("Let's research how to cursor blinking works.")]),
message(
Assistant,
@@ -382,15 +375,13 @@ fn eval_disable_cursor_blinking() {
)],
),
],
- input_path: input_file_path.into(),
- input_content: Some(input_file_content.into()),
- edit_description: edit_description.into(),
- assertion: EvalAssertion::judge_diff(indoc! {"
+ Some(input_file_content.into()),
+ EvalAssertion::judge_diff(indoc! {"
- Calls to BlinkManager in `observe_window_activation` were commented out
- The call to `blink_manager.enable` above the call to show_cursor_names was commented out
- All the edits have valid indentation
"}),
- },
+ ),
);
}
@@ -403,8 +394,8 @@ fn eval_from_pixels_constructor() {
eval(
100,
0.95,
- EvalInput {
- conversation: vec![
+ EvalInput::from_conversation(
+ vec![
message(
User,
[text(indoc! {"
@@ -576,14 +567,12 @@ fn eval_from_pixels_constructor() {
)],
),
],
- input_path: input_file_path.into(),
- input_content: Some(input_file_content.into()),
- edit_description: edit_description.into(),
- assertion: EvalAssertion::judge_diff(indoc! {"
- - The diff contains a new `from_pixels` constructor
- - The diff contains new tests for the `from_pixels` constructor
- "}),
- },
+ Some(input_file_content.into()),
+ EvalAssertion::judge_diff(indoc! {"
+ - The diff contains a new `from_pixels` constructor
+ - The diff contains new tests for the `from_pixels` constructor
+ "}),
+ ),
);
}
@@ -591,12 +580,13 @@ fn eval_from_pixels_constructor() {
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
let input_file_path = "root/zode.py";
+ let input_content = None;
let edit_description = "Create the main Zode CLI script";
eval(
200,
1.,
- EvalInput {
- conversation: vec![
+ EvalInput::from_conversation(
+ vec![
message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
message(
Assistant,
@@ -654,10 +644,8 @@ fn eval_zode() {
],
),
],
- input_path: input_file_path.into(),
- input_content: None,
- edit_description: edit_description.into(),
- assertion: EvalAssertion::new(async move |sample, _, _cx| {
+ input_content,
+ EvalAssertion::new(async move |sample, _, _cx| {
let invalid_starts = [' ', '`', '\n'];
let mut message = String::new();
for start in invalid_starts {
@@ -681,7 +669,7 @@ fn eval_zode() {
})
}
}),
- },
+ ),
);
}
@@ -694,8 +682,8 @@ fn eval_add_overwrite_test() {
eval(
200,
0.5, // TODO: make this eval better
- EvalInput {
- conversation: vec![
+ EvalInput::from_conversation(
+ vec![
message(
User,
[text(indoc! {"
@@ -899,13 +887,121 @@ fn eval_add_overwrite_test() {
],
),
],
- input_path: input_file_path.into(),
- input_content: Some(input_file_content.into()),
- edit_description: edit_description.into(),
- assertion: EvalAssertion::judge_diff(
+ Some(input_file_content.into()),
+ EvalAssertion::judge_diff(
"A new test for overwritten files was created, without changing any previous test",
),
- },
+ ),
+ );
+}
+
+#[test]
+#[ignore] // until we figure out the mystery described in the comments
+// #[cfg_attr(not(feature = "eval"), ignore)]
+fn eval_create_empty_file() {
+ // Check that the Edit Agent can create a file without writing its
+ // thoughts into it. This issue is not specific to empty files, but
+ // it's easier to reproduce with them.
+ //
+ // NOTE: For some mysterious reason, I could easily reproduce this
+ // issue roughly 90% of the time in actual Zed. However, once I
+ // extracted the exact LLM request from just before the failure point
+ // and generated from that, the reproduction rate dropped to 2%!
+ //
+ // Things I've tried to make sure it's not a fluke: disabling prompt
+ // caching, capturing the LLM request via a proxy server, and running
+ // the prompt on Claude separately from the evals. Every time the
+ // model mostly produced good outcomes, which doesn't match my actual
+ // experience in Zed.
+ //
+ // At some point I discovered that simply adding one insignificant
+ // space or newline to the prompt suddenly reproduces the failure I
+ // had been trying to capture almost every time.
+ //
+ // This weirdness happens even outside of the Zed code base and even
+ // when using a different subscription. The result is the same: an
+ // extra newline or space changes the model's behavior enough that
+ // the pass rate drops from 99% to 0-3%.
+ //
+ // I have no explanation for this.
+ //
+ //
+ // Model                          | Pass rate
+ // ============================================
+ //
+ // --------------------------------------------
+ // Prompt version: 2025-05-19
+ // --------------------------------------------
+ //
+ // claude-3.7-sonnet              | 0.98
+ // + one extra space in prompt    | 0.00
+ // + original prompt again        | 0.99
+ // + extra newline                | 0.03
+ // gemini-2.5-pro-preview-03-25   | 1.00
+ // gemini-2.5-flash-preview-04-17 | 1.00
+ // + one extra space              | 1.00
+ // gpt-4.1                        | 1.00
+ // + one extra space              | 1.00
+ //
+ //
+ // TODO: gpt-4.1-mini errored 38 times:
+ // "data did not match any variant of untagged enum ResponseStreamResult"
+ //
+ let input_file_content = None;
+ let expected_output_content = String::new();
+ eval(
+ 1,
+ 1.0,
+ EvalInput::from_conversation(
+ vec![
+ message(User, [text("Create a second empty todo file ")]),
+ message(
+ Assistant,
+ [
+ text(formatdoc! {"
+ I'll help you create a second empty todo file.
+ First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
+ "}),
+ tool_use(
+ "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
+ "list_directory",
+ ListDirectoryToolInput {
+ path: "root".to_string(),
+ },
+ ),
+ ],
+ ),
+ message(
+ User,
+ [tool_result(
+ "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
+ "list_directory",
+ "root/TODO\nroot/TODO2\nroot/new.txt\n",
+ )],
+ ),
+ message(
+ Assistant,
+ [
+ text(formatdoc! {"
+ I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
+ "}),
+ tool_use(
+ "toolu_01Tb3iQ9griqSYMmVuykQPWU",
+ "edit_file",
+ EditFileToolInput {
+ display_description: "Create empty TODO3 file".to_string(),
+ mode: EditFileMode::Create,
+ path: "root/TODO3".into(),
+ },
+ ),
+ ],
+ ),
+ ],
+ input_file_content,
+ // Bad behavior is to write something like
+ // "I'll create an empty TODO3 file as requested."
+ EvalAssertion::assert_eq(expected_output_content),
+ ),
);
}
@@ -964,12 +1060,46 @@ fn tool_result(
#[derive(Clone)]
struct EvalInput {
conversation: Vec<LanguageModelRequestMessage>,
- input_path: PathBuf,
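+ // Parsed from the `edit_file` tool use in the conversation's final
+ // assistant message; it carries the path, mode, and edit description
+ // that were previously stored as separate fields.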
+ edit_file_input: EditFileToolInput,
input_content: Option<String>,
- edit_description: String,
assertion: EvalAssertion,
}
+impl EvalInput {
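+ /// Builds an `EvalInput` from a captured conversation. The final
+ /// assistant message must contain an `edit_file` tool use; its
+ /// `EditFileToolInput` supplies the path, mode, and edit description
+ /// that drive the eval.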
+ fn from_conversation(
+ conversation: Vec<LanguageModelRequestMessage>,
+ input_content: Option<String>,
+ assertion: EvalAssertion,
+ ) -> Self {
+ let msg = conversation.last().expect("Conversation must not be empty");
+ if msg.role != Role::Assistant {
+ panic!("Conversation must end with an assistant message");
+ }
+ let tool_use = msg
+ .content
+ .iter()
+ .flat_map(|content| match content {
+ MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
+ Some(tool_use)
+ }
+ _ => None,
+ })
+ .next()
+ .expect("Conversation must end with an edit_file tool use")
+ .clone();
+
+ let edit_file_input: EditFileToolInput =
+ serde_json::from_value(tool_use.input.clone()).unwrap();
+
+ EvalInput {
+ conversation,
+ edit_file_input,
+ input_content,
+ assertion,
+ }
+ }
+}
+
#[derive(Clone)]
struct EvalSample {
text: String,
@@ -1308,7 +1438,7 @@ impl EditAgentTest {
let path = self
.project
.read_with(cx, |project, cx| {
- project.find_project_path(eval.input_path, cx)
+ project.find_project_path(eval.edit_file_input.path, cx)
})
.unwrap();
let buffer = self
@@ -1336,11 +1466,13 @@ impl EditAgentTest {
}),
..Default::default()
};
- let edit_output = if let Some(input_content) = eval.input_content.as_deref() {
- buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
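+ // In `Edit` mode, seed the buffer with the fixture content and stream
+ // edits over it; any other mode rewrites the file from scratch via
+ // `overwrite`.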
+ let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
+ if let Some(input_content) = eval.input_content.as_deref() {
+ buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
+ }
let (edit_output, _) = self.agent.edit(
buffer.clone(),
- eval.edit_description,
+ eval.edit_file_input.display_description,
&conversation,
&mut cx.to_async(),
);
@@ -1348,7 +1480,7 @@ impl EditAgentTest {
} else {
let (edit_output, _) = self.agent.overwrite(
buffer.clone(),
- eval.edit_description,
+ eval.edit_file_input.display_description,
&conversation,
&mut cx.to_async(),
);