Authored by Oleksiy Syvokon, Ben Brandt, and Antonio Scandurra.
1. Add a system prompt to eval requests: this matches how the edit agent is
   called from threads. Previously, we were sending requests without one
   (see the sketch after this list).
2. Fix an issue where the agent wrote its thoughts into a newly created
   empty file.
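A minimal sketch of the logic behind point 1: prepend a system prompt only when the conversation does not already start with one. `Role` and `Message` here are simplified stand-ins for the actual `language_model` types, not the real API:

```rust
// Simplified stand-ins for language_model's Role and
// LanguageModelRequestMessage; illustrative only.
#[derive(PartialEq)]
enum Role {
    System,
    User,
}

struct Message {
    role: Role,
    content: String,
}

/// Prepend `system_prompt` unless the conversation already opens with a
/// system message, mirroring how threads always send one first.
fn with_system_prompt(system_prompt: String, conversation: Vec<Message>) -> Vec<Message> {
    let has_system_prompt = conversation
        .first()
        .map_or(false, |msg| msg.role == Role::System);
    if has_system_prompt {
        conversation
    } else {
        std::iter::once(Message {
            role: Role::System,
            content: system_prompt,
        })
        .chain(conversation)
        .collect()
    }
}

fn main() {
    let messages = with_system_prompt(
        "You are a highly skilled software engineer...".into(),
        vec![Message {
            role: Role::User,
            content: "Create a second empty todo file".into(),
        }],
    );
    // The eval conversation now starts with a system message, as in threads.
    assert!(matches!(messages.first(), Some(m) if m.role == Role::System));
}
```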
Release Notes:
- N/A
---------
Co-authored-by: Ben Brandt <benjamin.j.brandt@gmail.com>
Co-authored-by: Antonio Scandurra <me@as-cii.com>
 Cargo.lock                                                  |   1 +
 crates/assistant_tools/Cargo.toml                           |   1 +
 crates/assistant_tools/src/edit_agent/evals.rs              | 108 +++---
 crates/assistant_tools/src/templates/create_file_prompt.hbs |  11 ++++++-----
 4 files changed, 66 insertions(+), 55 deletions(-)
Cargo.lock
@@ -688,6 +688,7 @@ dependencies = [
"portable-pty",
"pretty_assertions",
"project",
+ "prompt_store",
"rand 0.8.5",
"regex",
"reqwest_client",
crates/assistant_tools/Cargo.toml
@@ -41,6 +41,7 @@ open.workspace = true
paths.workspace = true
portable-pty.workspace = true
project.workspace = true
+prompt_store.workspace = true
regex.workspace = true
rust-embed.workspace = true
schemars.workspace = true
crates/assistant_tools/src/edit_agent/evals.rs
@@ -18,6 +18,7 @@ use language_model::{
LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
};
use project::Project;
+use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
use rand::prelude::*;
use reqwest_client::ReqwestClient;
use serde_json::json;
@@ -895,52 +896,24 @@ fn eval_add_overwrite_test() {
}
#[test]
-#[ignore] // until we figure out the mystery described in the comments
-// #[cfg_attr(not(feature = "eval"), ignore)]
+#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
// Check that Edit Agent can create a file without writing its
// thoughts into it. This issue is not specific to empty files, but
// it's easier to reproduce with them.
//
- // NOTE: For some mysterious reason, I could easily reproduce this
- // issue roughly 90% of the time in actual Zed. However, once I
- // extract the exact LLM request before the failure point and
- // generate from that, the reproduction rate drops to 2%!
- //
- // Things I've tried to make sure it's not a fluke: disabling prompt
- // caching, capturing the LLM request via a proxy server, running the
- // prompt on Claude separately from evals. Every time it was mostly
- // giving good outcomes, which doesn't match my actual experience in
- // Zed.
- //
- // At some point I discovered that simply adding one insignificant
- // space or a newline to the prompt suddenly results in an outcome I
- // tried to reproduce almost perfectly.
- //
- // This weirdness happens even outside of the Zed code base and even
- // when using a different subscription. The result is the same: an
- // extra newline or space changes the model behavior significantly
- // enough, so that the pass rate drops from 99% to 0-3%
- //
- // I have no explanation to this.
- //
//
// Model | Pass rate
// ============================================
//
// --------------------------------------------
- // Prompt version: 2025-05-19
+ // Prompt version: 2025-05-21
// --------------------------------------------
//
- // claude-3.7-sonnet | 0.98
- // + one extra space in prompt | 0.00
- // + original prompt again | 0.99
- // + extra newline | 0.03
+ // claude-3.7-sonnet | 1.00
// gemini-2.5-pro-preview-03-25 | 1.00
// gemini-2.5-flash-preview-04-17 | 1.00
- // + one extra space | 1.00
// gpt-4.1 | 1.00
- // + one extra space | 1.00
//
//
// TODO: gpt-4.1-mini errored 38 times:
@@ -949,8 +922,8 @@ fn eval_create_empty_file() {
let input_file_content = None;
let expected_output_content = String::new();
eval(
- 1,
- 1.0,
+ 100,
+ 0.99,
EvalInput::from_conversation(
vec![
message(User, [text("Create a second empty todo file ")]),
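The `eval(100, 0.99, …)` call above raises the run count from 1 to 100 and requires a 0.99 pass rate instead of a perfect single run. A hedged sketch of how such a gate can work; the real harness in evals.rs differs in detail, and `run_once` is a stand-in for a single edit-agent run:

```rust
// Hedged sketch of a repeated-eval gate like `eval(100, 0.99, ...)`;
// `run_once` stands in for one edit-agent run returning pass/fail.
fn eval_gate(iterations: usize, min_pass_rate: f64, mut run_once: impl FnMut() -> bool) {
    let passed = (0..iterations).filter(|_| run_once()).count();
    let pass_rate = passed as f64 / iterations as f64;
    assert!(
        pass_rate >= min_pass_rate,
        "pass rate {pass_rate:.2} is below the required {min_pass_rate:.2}"
    );
}

fn main() {
    // With the new prompt, at least 99 of 100 runs must leave the file empty.
    eval_gate(100, 0.99, || true);
}
```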
@@ -1442,24 +1415,59 @@ impl EditAgentTest {
.update(cx, |project, cx| project.open_buffer(path, cx))
.await
.unwrap();
- let conversation = LanguageModelRequest {
- messages: eval.conversation,
- tools: cx.update(|cx| {
- ToolRegistry::default_global(cx)
- .tools()
- .into_iter()
- .filter_map(|tool| {
- let input_schema = tool
- .input_schema(self.agent.model.tool_input_format())
- .ok()?;
- Some(LanguageModelRequestTool {
- name: tool.name(),
- description: tool.description(),
- input_schema,
- })
+ let tools = cx.update(|cx| {
+ ToolRegistry::default_global(cx)
+ .tools()
+ .into_iter()
+ .filter_map(|tool| {
+ let input_schema = tool
+ .input_schema(self.agent.model.tool_input_format())
+ .ok()?;
+ Some(LanguageModelRequestTool {
+ name: tool.name(),
+ description: tool.description(),
+ input_schema,
})
- .collect()
- }),
+ })
+ .collect::<Vec<_>>()
+ });
+ let tool_names = tools
+ .iter()
+ .map(|tool| tool.name.clone())
+ .collect::<Vec<_>>();
+ let worktrees = vec![WorktreeContext {
+ root_name: "root".to_string(),
+ rules_file: None,
+ }];
+ let prompt_builder = PromptBuilder::new(None)?;
+ let project_context = ProjectContext::new(worktrees, Vec::default());
+ let system_prompt = prompt_builder.generate_assistant_system_prompt(
+ &project_context,
+ &ModelContext {
+ available_tools: tool_names,
+ },
+ )?;
+
+ let has_system_prompt = eval
+ .conversation
+ .first()
+ .map_or(false, |msg| msg.role == Role::System);
+ let messages = if has_system_prompt {
+ eval.conversation
+ } else {
+ [LanguageModelRequestMessage {
+ role: Role::System,
+ content: vec![MessageContent::Text(system_prompt)],
+ cache: true,
+ }]
+ .into_iter()
+ .chain(eval.conversation)
+ .collect::<Vec<_>>()
+ };
+
+ let conversation = LanguageModelRequest {
+ messages,
+ tools,
..Default::default()
};
let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
crates/assistant_tools/src/templates/create_file_prompt.hbs
@@ -1,12 +1,13 @@
You are an expert engineer and your task is to write a new file from scratch.
-<file_to_edit>
+You MUST respond directly with the file's content, without explanations, additional text or triple backticks.
+The text you output will be saved verbatim as the content of the file.
+Tool calls have been disabled. You MUST start your response directly with the file's new content.
+
+<file_path>
{{path}}
-</file_to_edit>
+</file_path>
<edit_description>
{{edit_description}}
</edit_description>
-
-You MUST respond directly with the file's content, without explanations, additional text or triple backticks.
-The text you output will be saved verbatim as the content of the file.
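For reference, templates like this are rendered with `{{path}}` and `{{edit_description}}` filled in before the prompt is sent to the model. A minimal, self-contained sketch using the `handlebars` crate; the registration name and sample values are illustrative, Zed's actual PromptBuilder wiring differs, and the template string is reconstructed from the visible hunk above:

```rust
// Requires the `handlebars` and `serde_json` crates.
use handlebars::Handlebars;
use serde_json::json;

fn main() {
    // Reconstructed from the hunk above; only the visible lines are shown.
    let template = r#"You are an expert engineer and your task is to write a new file from scratch.
You MUST respond directly with the file's content, without explanations, additional text or triple backticks.
The text you output will be saved verbatim as the content of the file.
Tool calls have been disabled. You MUST start your response directly with the file's new content.

<file_path>
{{path}}
</file_path>
<edit_description>
{{edit_description}}
</edit_description>"#;

    let mut registry = Handlebars::new();
    registry
        .register_template_string("create_file_prompt", template)
        .expect("template should parse");

    // Sample values are hypothetical, chosen to match the eval's scenario.
    let prompt = registry
        .render(
            "create_file_prompt",
            &json!({
                "path": "todo2.md",
                "edit_description": "Create a second empty todo file",
            }),
        )
        .expect("template should render");
    println!("{prompt}");
}
```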