diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index 65d70cf85639755cf373bd9aeb88b1ba2ca865c2..54e960264e7bd04c326c8c2f539d7d9f7b447542 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -66,7 +66,7 @@ jobs: run: cargo build --package=eval - name: Run eval - run: cargo run --package=eval + run: cargo run --package=eval -- --repetitions=3 --concurrency=1 # Even the Linux runner is not stateful, in theory there is no need to do this cleanup. # But, to avoid potential issues in the future if we choose to use a stateful Linux runner and forget to add code diff --git a/Cargo.lock b/Cargo.lock index 16d5028af3a7f791f96d9e6e8c614013a09f6dc7..18b68edf1a12fec12561dea561cc935057cb2881 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4912,6 +4912,7 @@ dependencies = [ "language_models", "languages", "node_runtime", + "parking_lot", "paths", "project", "prompt_store", diff --git a/crates/eval/Cargo.toml b/crates/eval/Cargo.toml index e69af358fababc92124a3a7b507173814a0070dc..23780222a79aa31b481862e04def4bc45c75334b 100644 --- a/crates/eval/Cargo.toml +++ b/crates/eval/Cargo.toml @@ -29,6 +29,7 @@ language_model.workspace = true language_models.workspace = true languages = { workspace = true, features = ["load-grammars"] } node_runtime.workspace = true +parking_lot.workspace = true paths.workspace = true project.workspace = true prompt_store.workspace = true diff --git a/crates/eval/examples/find_and_replace_diff_card/diff_criteria.md b/crates/eval/examples/find_and_replace_diff_card/diff_criteria.md index 393056f134cc6a1fefcfff1386ee297eb1197650..12290f66ed6d02d7c02ed870a219cda698a83301 100644 --- a/crates/eval/examples/find_and_replace_diff_card/diff_criteria.md +++ b/crates/eval/examples/find_and_replace_diff_card/diff_criteria.md @@ -1,2 +1,2 @@ -1. The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct. The struct should contain an `output` field that is the same as the string we were returning before, and a new `card` field that contains a view for the card -2. The card should be a view that displays a diff. Each line in the diff should be colored according to whether it was added, removed or unchanged. +- The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct. The struct should contain an `output` field that is the same as the task we were returning before, and a new `card` field that contains a view for the card. +- The card should be a view that displays a diff. Each line in the diff should be colored according to whether it was added, removed or unchanged. diff --git a/crates/eval/examples/find_and_replace_diff_card/thread_criteria.md b/crates/eval/examples/find_and_replace_diff_card/thread_criteria.md index f5e1ef0401b88667674937d0e4664e012a5c4e54..79144bb1a0095a98ba47fd45dbdc376593bd0513 100644 --- a/crates/eval/examples/find_and_replace_diff_card/thread_criteria.md +++ b/crates/eval/examples/find_and_replace_diff_card/thread_criteria.md @@ -1,3 +1,3 @@ -1. The first tool call should be to path search including "find_replace_file_tool.rs" in the string. (*Not* grep, for example, or reading the file based on a guess at the path.) This is because we gave the model a filename and it needs to turn that into a real path. -2. After obtaining the correct path of "zed/crates/assistant_tools/src/find_replace_file_tool.rs", it should read the contents of that path. -3. 
When trying to find information about the Render trait, it should *not* begin with a path search, because it doesn't yet have any information on what path the Render trait might be in. +- The first tool call should be to path search including "find_replace_file_tool.rs" in the string. (*Not* grep, for example, or reading the file based on a guess at the path.) This is because we gave the model a filename and it needs to turn that into a real path. +- After obtaining the correct path of "zed/crates/assistant_tools/src/find_replace_file_tool.rs", it should read the contents of that path. +- When trying to find information about the Render trait, it should *not* begin with a path search, because it doesn't yet have any information on what path the Render trait might be in. diff --git a/crates/eval/src/eval.rs b/crates/eval/src/eval.rs index 986cc76354e15790dfe003ae611f1d357d0b0eaf..516c34b53a51f13a38720f1142fdc4a00ea990e9 100644 --- a/crates/eval/src/eval.rs +++ b/crates/eval/src/eval.rs @@ -3,15 +3,16 @@ mod ids; mod tool_metrics; pub(crate) use example::*; +use parking_lot::Mutex; pub(crate) use tool_metrics::*; use ::fs::RealFs; use anyhow::{Result, anyhow}; use clap::Parser; use client::{Client, ProxySettings, UserStore}; -use collections::HashSet; +use collections::{HashMap, HashSet}; use extension::ExtensionHostProxy; -use futures::{StreamExt, future}; +use futures::future; use gpui::http_client::{Uri, read_proxy_from_env}; use gpui::{App, AppContext, Application, AsyncApp, Entity, SemanticVersion, UpdateGlobal}; use gpui_tokio::Tokio; @@ -24,6 +25,7 @@ use prompt_store::PromptBuilder; use release_channel::AppVersion; use reqwest_client::ReqwestClient; use settings::{Settings, SettingsStore}; +use std::collections::VecDeque; use std::env; use std::path::{Path, PathBuf}; use std::sync::Arc; @@ -40,13 +42,9 @@ struct Args { model: String, #[arg(long, value_delimiter = ',', default_value = "rs,ts")] languages: Vec, - /// How many times to run each example. Note that this is currently not very efficient as N - /// worktrees will be created for the examples. + /// How many times to run each example. #[arg(long, default_value = "1")] - repetitions: u32, - /// How many times to run the judge on each example run. - #[arg(long, default_value = "3")] - judge_repetitions: u32, + repetitions: usize, /// Maximum number of examples to run concurrently. #[arg(long, default_value = "10")] concurrency: usize, @@ -163,7 +161,6 @@ fn main() { "\x1b[96m", // Bright Cyan ]; - let mut max_name_width = 0; let mut skipped = Vec::new(); for example_path in &example_paths { @@ -184,20 +181,7 @@ fn main() { continue; } - // TODO: This creates a worktree per repetition. Ideally these examples should - // either be run sequentially on the same worktree, or reuse worktrees when there - // are more examples to run than the concurrency limit. 
- for repetition_number in 0..args.repetitions { - let mut example = example.clone(); - example.set_repetition_number(repetition_number); - - let name_len = example.name.len(); - if name_len > max_name_width { - max_name_width = example.name.len(); - } - - examples.push(example); - } + examples.extend(example.repeat(args.repetitions)); } println!("Skipped examples: {}\n", skipped.join(", ")); @@ -210,6 +194,11 @@ fn main() { let mut repo_urls = HashSet::default(); let mut clone_tasks = Vec::new(); + let max_name_width = examples + .iter() + .map(|e| e.repetition_name().len()) + .max() + .unwrap_or(0); for (i, example) in examples.iter_mut().enumerate() { let color = COLORS[i % COLORS.len()].to_string(); example.set_log_prefix_style(&color, max_name_width); @@ -217,7 +206,7 @@ fn main() { println!( "{}Logging to: {}", example.log_prefix, - example.example_output_directory().display() + example.run_directory_path().display() ); let repo_url = example.base.url.clone(); @@ -263,49 +252,53 @@ fn main() { future::join_all(clone_tasks).await; for example in examples.iter_mut() { - example.setup().await?; + example.fetch().await?; } - let judge_repetitions = args.judge_repetitions; - let concurrency = args.concurrency; + let examples = Arc::new(Mutex::new(VecDeque::from(examples))); + let results_by_example_name = Arc::new(Mutex::new(HashMap::default())); - let tasks = examples.iter().map(|example| { + future::join_all((0..args.concurrency).map(|_| { let app_state = app_state.clone(); let model = model.clone(); - let example = example.clone(); let zed_commit_sha = zed_commit_sha.clone(); let zed_branch_name = zed_branch_name.clone(); let run_id = run_id.clone(); + let examples = examples.clone(); + let results = results_by_example_name.clone(); cx.spawn(async move |cx| { - let result = async { - let run_output = cx - .update(|cx| example.run(model.clone(), app_state.clone(), cx))? - .await?; - let judge_tasks = (0..judge_repetitions).map(|round| { - run_judge_repetition( + loop { + let Some(mut example) = examples.lock().pop_front() else { + break; + }; + let result = async { + example.setup().await?; + let run_output = cx + .update(|cx| example.run(model.clone(), app_state.clone(), cx))? 
+ .await?; + let judge_output = judge_example( example.clone(), model.clone(), &zed_commit_sha, &zed_branch_name, &run_id, &run_output, - round, enable_telemetry, cx, ) - }); - let judge_outputs = future::join_all(judge_tasks).await; - anyhow::Ok((run_output, judge_outputs)) + .await; + anyhow::Ok((run_output, judge_output)) + } + .await; + results + .lock() + .entry(example.name.clone()) + .or_insert(Vec::new()) + .push((example.clone(), result)); } - .await; - (example, result) }) - }); - - let results = futures::stream::iter(tasks) - .buffer_unordered(concurrency) - .collect::>() - .await; + })) + .await; println!("\n\n"); print_header("EVAL RESULTS"); @@ -314,59 +307,64 @@ fn main() { let mut thread_scores = Vec::new(); let mut error_count = 0; - for (example, result) in results { - print_header(&example.name); - - match result { - Err(err) => { - println!("💥 {}{:?}", example.log_prefix, err); - error_count += 1; - } - Ok((run_output, judge_results)) => { - cumulative_tool_metrics.merge(&run_output.tool_metrics); - - println!("┌───────┬──────┬────────┐"); - println!("│ Judge │ Diff │ Thread │"); - println!("├───────┼──────┼────────┤"); + for (example_name, results) in results_by_example_name.lock().iter_mut() { + print_header(&example_name); + + results.sort_unstable_by_key(|(example, _)| example.repetition); + let mut example_cumulative_tool_metrics = ToolMetrics::default(); + + println!("┌───────┬──────┬────────┐"); + println!("│ Round │ Diff │ Thread │"); + println!("├───────┼──────┼────────┤"); + for (example, result) in results { + let run_dir_path = example.run_directory_path(); + let relative_run_dir_path = run_dir_path.strip_prefix(root_dir).unwrap(); + + match result { + Err(err) => { + println!( + "|{:^7}│{:^6}│{:^8}│ {:?}{}", + example.repetition, + "N/A", + "N/A", + err, + relative_run_dir_path.display() + ); + error_count += 1; + } + Ok((run_output, judge_result)) => { + cumulative_tool_metrics.merge(&run_output.tool_metrics); + example_cumulative_tool_metrics.merge(&run_output.tool_metrics); - for (i, judge_result) in judge_results.iter().enumerate() { match judge_result { Ok(judge_output) => { - let diff_score = judge_output.diff.score; - diff_scores.push(diff_score); - - let thread_display = if let Some(thread) = &judge_output.thread - { - let thread_score = thread.score; - thread_scores.push(thread_score); - format!("{}", thread_score) - } else { - "N/A".to_string() - }; - + diff_scores.push(judge_output.diff.score()); + thread_scores.push(judge_output.thread.score()); println!( - "|{:^7}│{:^6}│{:^8}│", - i + 1, - diff_score, - thread_display + "|{:^7}│{:^6}│{:^8}│ {}", + example.repetition, + format!("{}%", judge_output.diff.score()), + format!("{}%", judge_output.thread.score()), + relative_run_dir_path.display() ); } Err(err) => { - println!("|{:^7}│{:^6}│{:^8}│{:?}", i + 1, "N/A", "N/A", err); + println!( + "|{:^7}│{:^6}│{:^8}│{:?}│ {}", + example.repetition, + "N/A", + "N/A", + err, + relative_run_dir_path.display() + ); } } } - - println!("└───────┴──────┴────────┘"); - - println!("{}", run_output.tool_metrics); } } - println!( - "{} > {}", - " ".repeat(max_name_width), - example.example_output_directory().display() - ); + + println!("└───────┴──────┴────────┘"); + println!("{}", example_cumulative_tool_metrics); } let diff_score_count = diff_scores.len(); @@ -380,24 +378,16 @@ fn main() { println!("\n{error_count} examples failed to run!"); } - if diff_score_count > 0 { - println!("\nAverage code diff score: {average_diff_score}"); - } + println!("\nAverage code 
diff score: {average_diff_score}"); let thread_score_count = thread_scores.len(); + let average_thread_score = thread_scores + .into_iter() + .map(|score| score as f32) + .sum::() + / (thread_score_count as f32); - // We might have gotten no thread scores if we weren't asked to judge the thread. - if thread_score_count > 0 { - let average_thread_score = thread_scores - .into_iter() - .map(|score| score as f32) - .sum::() - / (thread_score_count as f32); - - if diff_score_count > 0 { - println!("\nAverage thread score: {average_thread_score}"); - } - } + println!("\nAverage thread score: {average_thread_score}"); print_header("CUMULATIVE TOOL METRICS"); println!("{}", cumulative_tool_metrics); @@ -579,27 +569,26 @@ pub fn git_branch_for_path(repo_path: &Path) -> String { } } -async fn run_judge_repetition( +async fn judge_example( example: Example, model: Arc, zed_commit_sha: &str, zed_branch_name: &str, run_id: &str, run_output: &RunOutput, - round: u32, enable_telemetry: bool, cx: &AsyncApp, ) -> Result { - let judge_output = example.judge(model.clone(), &run_output, round, cx).await; + let judge_output = example.judge(model.clone(), &run_output, cx).await; let diff_evaluation; - let thread_diff_evaluation; + let thread_evaluation; if let Ok(output) = judge_output.as_ref() { diff_evaluation = Some(output.diff.clone()); - thread_diff_evaluation = output.thread.clone(); + thread_evaluation = Some(output.thread.clone()); } else { diff_evaluation = None; - thread_diff_evaluation = None; + thread_evaluation = None; } if enable_telemetry { @@ -609,9 +598,9 @@ async fn run_judge_repetition( zed_branch_name = zed_branch_name, run_id = run_id, example_name = example.name.clone(), - round = round, + example_repetition = example.repetition, diff_evaluation = diff_evaluation, - thread_evaluation = thread_diff_evaluation, + thread_evaluation = thread_evaluation, tool_metrics = run_output.tool_metrics, response_count = run_output.response_count, token_usage = run_output.token_usage, @@ -619,6 +608,8 @@ async fn run_judge_repetition( model_provider = model.provider_id().to_string(), repository_url = example.base.url.clone(), repository_revision = example.base.revision.clone(), + diagnostic_summary_before = run_output.diagnostic_summary_before, + diagnostic_summary_after = run_output.diagnostic_summary_after, diagnostics_before = run_output.diagnostics_before, diagnostics_after = run_output.diagnostics_after, ); diff --git a/crates/eval/src/example.rs b/crates/eval/src/example.rs index 56ec694f3f2f0b2e3acb8f2f6655b3a529e5b55c..5cd38929a28e5407f52aef39a29f0dc581e60457 100644 --- a/crates/eval/src/example.rs +++ b/crates/eval/src/example.rs @@ -12,7 +12,7 @@ use language_model::{ LanguageModel, LanguageModelCompletionEvent, LanguageModelRequest, LanguageModelRequestMessage, MessageContent, Role, StopReason, TokenUsage, }; -use project::{Project, ProjectPath}; +use project::{DiagnosticSummary, Project, ProjectPath}; use serde::{Deserialize, Serialize}; use std::cell::RefCell; use std::fmt::Write as _; @@ -68,19 +68,25 @@ pub struct Example { /// Content of `diff_criteria.md` pub diff_criteria: String, /// Content of `thread_criteria.md`, if that file exists (it's optional) - pub thread_criteria: Option, - /// Path to the directory containing the requests and responses for the agentic loop - pub run_directory_path: PathBuf, + pub thread_criteria: String, /// Prefix used for logging that identifies this example pub log_prefix: String, - pub worktree_path: PathBuf, + /// The repetition number for this example 
(0-based) + /// When running multiple repetitions of the same example, each instance is assigned a unique repetition number. + /// This affects the worktree path and log prefix to avoid clobbering results between runs. + pub repetition: usize, pub repo_path: PathBuf, + /// Path to the directory containing the requests and responses for the agentic loop + run_dir_path: PathBuf, + worktrees_dir: PathBuf, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Clone)] pub struct RunOutput { pub repository_diff: String, pub ran_diagnostics_check: bool, + pub diagnostic_summary_before: DiagnosticSummary, + pub diagnostic_summary_after: DiagnosticSummary, pub diagnostics_before: Option, pub diagnostics_after: Option, pub response_count: usize, @@ -92,11 +98,6 @@ pub struct RunOutput { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct JudgeDiffInput { pub repository_diff: String, - pub ran_diagnostics_check: bool, - #[serde(skip_serializing_if = "Option::is_none")] - pub diagnostics_before: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub diagnostics_after: Option, pub criteria: String, } @@ -109,12 +110,13 @@ pub struct JudgeThreadInput { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct JudgeResponse { pub analysis: String, - pub score: u32, + pub passing_criteria: u32, + pub total_criteria: u32, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct JudgeOutput { - pub thread: Option, + pub thread: JudgeResponse, pub diff: JudgeResponse, } @@ -126,65 +128,66 @@ impl Example { worktrees_dir: &Path, repos_dir: &Path, ) -> Result { - let name = Self::name_from_path(dir_path); + let name = dir_path.file_name().unwrap().to_string_lossy().to_string(); let base_path = dir_path.join("base.toml"); let prompt_path = dir_path.join("prompt.md"); let diff_criteria_path = dir_path.join("diff_criteria.md"); let thread_criteria_path = dir_path.join("thread_criteria.md"); - let thread_criteria = if thread_criteria_path.exists() { - Some(fs::read_to_string(thread_criteria_path.clone())?) 
- } else { - None - }; - let base: ExampleBase = toml::from_str(&fs::read_to_string(&base_path)?)?; let repo_path = repo_path_for_url(repos_dir, &base.url); - let worktree_path = worktrees_dir - .canonicalize() - .unwrap() - .join(&name) - .join(&base.repo_name()); - Ok(Example { name: name.clone(), base, prompt: fs::read_to_string(prompt_path.clone())?, - thread_criteria, + thread_criteria: fs::read_to_string(thread_criteria_path.clone())?, diff_criteria: fs::read_to_string(diff_criteria_path.clone())?, - run_directory_path: run_dir.to_path_buf(), - worktree_path, + run_dir_path: run_dir.to_path_buf(), + worktrees_dir: worktrees_dir.to_path_buf(), repo_path, log_prefix: name, + repetition: 0, }) } - pub fn set_repetition_number(&mut self, repetition_number: u32) { - if repetition_number > 0 { - self.name = format!("{}-{}", self.name, repetition_number); - } + pub fn repetition_name(&self) -> String { + format!("{}-{}", self.name, self.repetition) } - pub fn example_output_directory(&self) -> PathBuf { - self.run_directory_path.join(&self.name) + pub fn worktree_path(&self) -> PathBuf { + self.worktrees_dir + .canonicalize() + .unwrap() + .join(self.repetition_name()) + .join(&self.base.repo_name()) + } + + pub fn run_directory_path(&self) -> PathBuf { + self.run_dir_path.join(self.repetition_name()) + } + + /// Create an iterator that returns copies of this example with different repetition numbers + /// Each copy will have a different repetition number and worktree path based on the repetition + pub fn repeat(self, repetitions: usize) -> impl Iterator { + (0..repetitions).map(move |repetition| { + let mut example = self.clone(); + example.repetition = repetition; + example + }) } pub fn set_log_prefix_style(&mut self, color: &str, name_width: usize) { self.log_prefix = format!( "{}{: String { - path.file_name().unwrap().to_string_lossy().to_string() - } - /// Set up the example by checking out the specified Git revision - pub async fn setup(&mut self) -> Result<()> { + pub async fn fetch(&mut self) -> Result<()> { let revision_exists = run_git( &self.repo_path, &["rev-parse", &format!("{}^{{commit}}", self.base.revision)], @@ -203,19 +206,24 @@ impl Example { ) .await?; } + Ok(()) + } - if self.worktree_path.is_dir() { + /// Set up the example by checking out the specified Git revision + pub async fn setup(&mut self) -> Result<()> { + let worktree_path = self.worktree_path(); + if worktree_path.is_dir() { println!("{}Resetting existing worktree", self.log_prefix); // TODO: consider including "-x" to remove ignored files. The downside of this is that // it will also remove build artifacts, and so prevent incremental reuse there. 
- run_git(&self.worktree_path, &["clean", "--force", "-d"]).await?; - run_git(&self.worktree_path, &["reset", "--hard", "HEAD"]).await?; - run_git(&self.worktree_path, &["checkout", &self.base.revision]).await?; + run_git(&worktree_path, &["clean", "--force", "-d"]).await?; + run_git(&worktree_path, &["reset", "--hard", "HEAD"]).await?; + run_git(&worktree_path, &["checkout", &self.base.revision]).await?; } else { println!("{}Creating worktree", self.log_prefix); - let worktree_path_string = self.worktree_path.to_string_lossy().to_string(); + let worktree_path_string = worktree_path.to_string_lossy().to_string(); run_git( &self.repo_path, @@ -231,10 +239,10 @@ impl Example { } if self.base.url == ZED_REPO_URL { - std::fs::write(self.worktree_path.join(".rules"), std::fs::read(".rules")?)?; + std::fs::write(worktree_path.join(".rules"), std::fs::read(".rules")?)?; } - std::fs::create_dir_all(self.example_output_directory())?; + std::fs::create_dir_all(self.run_directory_path())?; Ok(()) } @@ -256,7 +264,7 @@ impl Example { ); let worktree = project.update(cx, |project, cx| { - project.create_worktree(&self.worktree_path, true, cx) + project.create_worktree(self.worktree_path(), true, cx) }); let tools = cx.new(|_| ToolWorkingSet::default()); @@ -315,6 +323,9 @@ impl Example { None }; + let diagnostic_summary_before = project.read_with(cx, |project, cx| { + project.diagnostic_summary(false, cx) + })?; let diagnostics_before = query_lsp_diagnostics(project.clone(), cx).await?; if diagnostics_before.is_some() && !this.base.allow_preexisting_diagnostics { return Err(anyhow!("Example has pre-existing diagnostics. If you want to run this example regardless, set `allow_preexisting_diagnostics` to `true` in `base.toml`")); @@ -324,7 +335,7 @@ impl Example { return Err(anyhow!("Setup only mode")); } - let example_output_dir = this.example_output_directory(); + let example_output_dir = this.run_directory_path(); let last_diff_file_path = example_output_dir.join("last.diff"); // Write an empty "last.diff" so that it can be opened in Zed for convenient view of the @@ -491,6 +502,9 @@ impl Example { std::fs::write(last_diff_file_path, &repository_diff)?; println!("{}Getting diagnostics", this.log_prefix); + let diagnostic_summary_after = project.read_with(cx, |project, cx| { + project.diagnostic_summary(false, cx) + })?; let diagnostics_after = cx .update(move |cx| { cx.spawn(async move |cx| query_lsp_diagnostics(project, cx).await) @@ -522,6 +536,8 @@ impl Example { RunOutput { repository_diff, ran_diagnostics_check: this.base.require_lsp, + diagnostic_summary_before, + diagnostic_summary_after, diagnostics_before, diagnostics_after, response_count, @@ -537,7 +553,6 @@ impl Example { &self, model: Arc, run_output: &RunOutput, - judge_number: u32, cx: &AsyncApp, ) -> Result<(String, JudgeResponse)> { let judge_diff_prompt = include_str!("judge_diff_prompt.hbs"); @@ -549,9 +564,6 @@ impl Example { judge_diff_prompt_name, &JudgeDiffInput { repository_diff: run_output.repository_diff.clone(), - ran_diagnostics_check: run_output.ran_diagnostics_check, - diagnostics_before: run_output.diagnostics_before.clone(), - diagnostics_after: run_output.diagnostics_after.clone(), criteria: self.diff_criteria.clone(), }, )?; @@ -573,8 +585,9 @@ impl Example { let diff_output = JudgeResponse::parse(&diff_response)?; println!( - "{}Judge #{judge_number} - Diff score: {}", - self.log_prefix, diff_output.score + "{}Judge - Diff score: {}%", + self.log_prefix, + diff_output.score() ); Ok((diff_response, diff_output)) @@ 
-584,69 +597,60 @@ impl Example { &self, model: Arc, run_output: &RunOutput, - judge_number: u32, cx: &AsyncApp, - ) -> Result<(String, Option)> { - if let Some(criteria) = self.thread_criteria.clone() { - let judge_thread_prompt = include_str!("judge_thread_prompt.hbs"); - let judge_thread_prompt_name = "judge_thread_prompt"; - let mut hbs = Handlebars::new(); - hbs.register_template_string(judge_thread_prompt_name, judge_thread_prompt)?; - - let request_markdown = RequestMarkdown::new(&run_output.last_request); - let thread_prompt = hbs.render( - judge_thread_prompt_name, - &JudgeThreadInput { - messages: request_markdown.messages, - criteria, - }, - )?; + ) -> Result<(String, JudgeResponse)> { + let judge_thread_prompt = include_str!("judge_thread_prompt.hbs"); + let judge_thread_prompt_name = "judge_thread_prompt"; + let mut hbs = Handlebars::new(); + hbs.register_template_string(judge_thread_prompt_name, judge_thread_prompt)?; + + let request_markdown = RequestMarkdown::new(&run_output.last_request); + let thread_prompt = hbs.render( + judge_thread_prompt_name, + &JudgeThreadInput { + messages: request_markdown.messages, + criteria: self.thread_criteria.clone(), + }, + )?; - let request = LanguageModelRequest { - thread_id: None, - prompt_id: None, - messages: vec![LanguageModelRequestMessage { - role: Role::User, - content: vec![MessageContent::Text(thread_prompt)], - cache: false, - }], - temperature: None, - tools: Vec::new(), - stop: Vec::new(), - }; + let request = LanguageModelRequest { + thread_id: None, + prompt_id: None, + messages: vec![LanguageModelRequestMessage { + role: Role::User, + content: vec![MessageContent::Text(thread_prompt)], + cache: false, + }], + temperature: None, + tools: Vec::new(), + stop: Vec::new(), + }; - let thread_response = send_language_model_request(model, request, cx).await?; - let thread_output = JudgeResponse::parse(&thread_response)?; + let thread_response = send_language_model_request(model, request, cx).await?; + let thread_output = JudgeResponse::parse(&thread_response)?; - println!( - "{}Judge #{judge_number} - Thread score: {}", - self.log_prefix, thread_output.score - ); + println!( + "{}Judge - Thread score: {}%", + self.log_prefix, + thread_output.score() + ); - Ok((thread_response, Some(thread_output))) - } else { - let msg = "There were no criteria specified for this thread, so this example was not judged on its thread.".to_string(); - Ok((msg, None)) - } + Ok((thread_response, thread_output)) } pub async fn judge( &self, model: Arc, run_output: &RunOutput, - judge_number: u32, cx: &AsyncApp, ) -> Result { - let mut output_file = File::create( - self.example_output_directory() - .join(format!("judge_{}.md", judge_number)), - ) - .expect("failed to create judge.md"); + let mut output_file = File::create(self.run_directory_path().join("judge.md")) + .expect("failed to create judge.md"); - println!("{}Running judge #{judge_number}", self.log_prefix); + println!("{}Running judge", self.log_prefix); - let diff_task = self.judge_diff(model.clone(), &run_output, judge_number, cx); - let thread_task = self.judge_thread(model.clone(), &run_output, judge_number, cx); + let diff_task = self.judge_diff(model.clone(), &run_output, cx); + let thread_task = self.judge_thread(model.clone(), &run_output, cx); let (diff_result, thread_result) = futures::join!(diff_task, thread_task); @@ -666,12 +670,14 @@ impl Example { } async fn repository_diff(&self) -> Result { - run_git(&self.worktree_path, &["add", "."]).await?; + let worktree_path = 
self.worktree_path(); + + run_git(&worktree_path, &["add", "."]).await?; let mut diff_args = vec!["diff", "--staged"]; if self.base.url == ZED_REPO_URL { diff_args.push(":(exclude).rules"); } - run_git(&self.worktree_path, &diff_args).await + run_git(&worktree_path, &diff_args).await } } @@ -805,11 +811,21 @@ async fn query_lsp_diagnostics( impl JudgeResponse { fn parse(response: &str) -> Result { let analysis = get_tag("analysis", response)?.to_string(); - let score = get_tag("score", response)? + let passing_criteria = get_tag("passing_criteria", response)? .parse() .context("error parsing score")?; + let total_criteria = get_tag("total_criteria", response)? + .parse() + .context("error parsing score")?; + Ok(Self { + analysis, + total_criteria, + passing_criteria, + }) + } - Ok(Self { analysis, score }) + pub fn score(&self) -> u32 { + (100.0 * self.passing_criteria as f32 / self.total_criteria as f32).round() as u32 } } @@ -1042,13 +1058,13 @@ fn response_events_to_markdown( #[cfg(test)] mod test { use super::*; - use handlebars::Handlebars; #[test] fn test_parse_judge_output() { let response = r#" The model did a good job but there were still compilations errors. - 3 + 3 + 5 "# .unindent(); @@ -1057,7 +1073,8 @@ mod test { output.analysis, "The model did a good job but there were still compilations errors." ); - assert_eq!(output.score, 3); + assert_eq!(output.passing_criteria, 3); + assert_eq!(output.total_criteria, 5); let response = r#" Text around ignored @@ -1068,162 +1085,15 @@ mod test { - Error 2 - 1 + 1 + + 3 "# .unindent(); let output = JudgeResponse::parse(&response).unwrap(); assert_eq!(output.analysis, "Failed to compile:\n- Error 1\n- Error 2"); - assert_eq!(output.score, 1); - } - - #[test] - fn test_judge_prompt_with_diagnostics() { - // Case 1: Both diagnostics before and after are present - let input = JudgeDiffInput { - repository_diff: "diff content goes here".to_string(), - ran_diagnostics_check: true, - diagnostics_before: Some("Error at line 10: variable not found".to_string()), - diagnostics_after: Some("Error at line 15: missing semicolon".to_string()), - criteria: "Fix all bugs".to_string(), - }; - - let rendered = templates().render(JUDGE_PROMPT_NAME, &input).unwrap(); - - let expected_diagnostics_section = r#" - Take into account the diagnostics before and after applying the change: - - - Error at line 10: variable not found - - - - Error at line 15: missing semicolon - - "# - .unindent(); - - assert!(rendered.contains(&expected_diagnostics_section)); - } - - #[test] - fn test_judge_prompt_with_empty_diagnostics() { - // Case 2: Diagnostics check run but no diagnostics found - let input = JudgeDiffInput { - repository_diff: "diff content goes here".to_string(), - ran_diagnostics_check: true, - diagnostics_before: None, - diagnostics_after: None, - criteria: "Fix all bugs".to_string(), - }; - - let rendered = templates().render(JUDGE_PROMPT_NAME, &input).unwrap(); - - let expected_diagnostics_section = r#" - Take into account the diagnostics before and after applying the change: - - - No diagnostics before applying the edits. - - - - No diagnostics after applying the edits. 
- - "# - .unindent(); - - assert!(rendered.contains(&expected_diagnostics_section)); - } - - #[test] - fn test_judge_prompt_with_mixed_diagnostics() { - let templates = templates(); - - // Case 3: Before diagnostics present, after diagnostics absent - let input = JudgeDiffInput { - repository_diff: "diff content goes here".to_string(), - ran_diagnostics_check: true, - diagnostics_before: Some("Error at line 10: variable not found".to_string()), - diagnostics_after: None, - criteria: "Fix all bugs".to_string(), - }; - - let rendered = templates.render(JUDGE_PROMPT_NAME, &input).unwrap(); - - let expected_diagnostics_section = r#" - Take into account the diagnostics before and after applying the change: - - - Error at line 10: variable not found - - - - No diagnostics after applying the edits. - - "# - .unindent(); - - assert!(rendered.contains(&expected_diagnostics_section)); - - // Case 4: Before diagnostics absent, after diagnostics present - let input = JudgeDiffInput { - repository_diff: "diff content goes here".to_string(), - ran_diagnostics_check: true, - diagnostics_before: None, - diagnostics_after: Some("Error at line 15: missing semicolon".to_string()), - criteria: "Fix all bugs".to_string(), - }; - - let rendered = templates.render(JUDGE_PROMPT_NAME, &input).unwrap(); - - let expected_diagnostics_section = r#" - Take into account the diagnostics before and after applying the change: - - - No diagnostics before applying the edits. - - - - Error at line 15: missing semicolon - - "# - .unindent(); - - assert!(rendered.contains(&expected_diagnostics_section)); - } - - #[test] - fn test_judge_prompt_without_diagnostics() { - let templates = templates(); - - // Case 5: No diagnostics check run - let input = JudgeDiffInput { - repository_diff: "diff content goes here".to_string(), - ran_diagnostics_check: false, - diagnostics_before: None, - diagnostics_after: None, - criteria: "Fix all bugs".to_string(), - }; - - let rendered = templates.render(JUDGE_PROMPT_NAME, &input).unwrap(); - - // Check for the message when no diagnostics were performed - let diagnostics_message = "No diagnostic checks were performed."; - - assert!(rendered.contains(diagnostics_message)); - assert!(!rendered.contains("")); - assert!(!rendered.contains("")); - } - - const JUDGE_PROMPT_NAME: &str = "judge_prompt"; - - fn templates() -> Handlebars<'static> { - let mut judge_prompt = include_str!("judge_diff_prompt.hbs").to_string(); - language::LineEnding::normalize(&mut judge_prompt); - let mut handlebars = Handlebars::new(); - handlebars - .register_template_string(JUDGE_PROMPT_NAME, judge_prompt) - .unwrap(); - handlebars + assert_eq!(output.passing_criteria, 1); + assert_eq!(output.total_criteria, 3); } } diff --git a/crates/eval/src/judge_diff_prompt.hbs b/crates/eval/src/judge_diff_prompt.hbs index 4e9aaf68c0e5d9c1423716a6e652ee32be31dde9..bf51f933ee1b0a6bd9fe6de485c90c77342bf6e1 100644 --- a/crates/eval/src/judge_diff_prompt.hbs +++ b/crates/eval/src/judge_diff_prompt.hbs @@ -1,48 +1,25 @@ -You are an expert software developer tasked with evaluating the following changes to a codebase: +You are an expert software developer. Your task is to evaluate a diff produced by an AI agent in response to a prompt. +Here is the prompt and the diff: - -{{repository_diff}} - + +{{{prompt}}} + -Use the following criteria to score the above changes. 
+ +{{{repository_diff}}} + + +Evaluate how many of the following criteria were satisfied by the diff: {{criteria}} +- There are no changes unrelated to the prompt -{{#if ran_diagnostics_check}} -Take into account the diagnostics before and after applying the change: - - -{{#if diagnostics_before}} -{{{diagnostics_before}}} -{{else}} -No diagnostics before applying the edits. -{{/if}} - - - -{{#if diagnostics_after}} -{{{diagnostics_after}}} -{{else}} -No diagnostics after applying the edits. -{{/if}} - -{{else}} -No diagnostic checks were performed. -{{/if}} - -Based on these criteria, give the test output a score between 0 and 5. -The output score should ONLY INCLUDE whole numbers. DO NOT return decimals or floats. - -- 5 means: changes meet all criteria -- 0 means: changes don't meet any criteria - -Be suspicious of the changes because they were generated by an LLM. -Sometimes the LLM decides to change random code, so if the changes are not mentioned in the criteria, penalize the score. -Analyze the diff hunk by hunk and describe how each change meets or fails to meet the criteria. +Analyze the diff hunk by hunk, and structure your answer in the following XML format: ``` {YOUR ANALYSIS HERE} -{YOUR SCORE HERE} +{THE TOTAL NUMBER OF CRITERIA THAT WERE LISTED} +{THE NUMBER OF CRITERIA THAT ARE MET BY THE DIFF} ``` diff --git a/crates/eval/src/judge_thread_prompt.hbs b/crates/eval/src/judge_thread_prompt.hbs index a84ce8e698c4939a6134a286d234d5195fbc1c6e..aa1b6d6f6e235f4fdab29fafc4a5e85cc7d38ff8 100644 --- a/crates/eval/src/judge_thread_prompt.hbs +++ b/crates/eval/src/judge_thread_prompt.hbs @@ -1,22 +1,19 @@ -You are an expert software developer tasked with evaluating an AI agent's messages and tool calls in this conversation: +You are an expert software developer. Your task is to evaluate an AI agent's messages and tool calls in this conversation: {{{messages}}} -Use the following criteria to score the above messages. +You must count how many of the following criteria were satisfied by the messages: -{{criteria}} +{{{criteria}}} -Based on these criteria, give the messages a score between 0 and 5. -The output score should ONLY INCLUDE whole numbers. DO NOT return decimals or floats. - -- 5 means: messages meet all criteria -- 0 means: messages don't meet any criteria +Analyze the messages one by one, and structure your answer in the following XML format: ``` {YOUR ANALYSIS HERE} -{YOUR SCORE HERE} +{THE TOTAL NUMBER OF CRITERIA THAT WERE LISTED} +{THE NUMBER OF CRITERIA THAT ARE MET BY THE MESSAGES} ```
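
For reference, a minimal standalone sketch of the judge-output contract this patch introduces (`<analysis>`, `<total_criteria>`, `<passing_criteria>` tags, scored as a rounded percentage, matching the `score()` method added in `example.rs`). This is illustrative only, not the crate's code: `extract_tag` below is a hypothetical stand-in for the crate's existing `get_tag` helper.

```rust
// Hypothetical stand-in for the crate's `get_tag` helper: returns the trimmed
// text between <tag> and </tag>, if both are present.
fn extract_tag<'a>(tag: &str, text: &'a str) -> Option<&'a str> {
    let open = format!("<{tag}>");
    let close = format!("</{tag}>");
    let start = text.find(&open)? + open.len();
    let end = text[start..].find(&close)? + start;
    Some(text[start..end].trim())
}

struct JudgeScores {
    passing_criteria: u32,
    total_criteria: u32,
}

impl JudgeScores {
    fn parse(response: &str) -> Option<Self> {
        Some(Self {
            passing_criteria: extract_tag("passing_criteria", response)?.parse().ok()?,
            total_criteria: extract_tag("total_criteria", response)?.parse().ok()?,
        })
    }

    // Same formula as the patch's JudgeResponse::score(): percentage of
    // criteria met, rounded to a whole number.
    fn score(&self) -> u32 {
        (100.0 * self.passing_criteria as f32 / self.total_criteria as f32).round() as u32
    }
}

fn main() {
    let response = "<analysis>Example analysis.</analysis>\n\
                    <passing_criteria>3</passing_criteria>\n\
                    <total_criteria>5</total_criteria>";
    let scores = JudgeScores::parse(response).expect("malformed judge response");
    assert_eq!(scores.score(), 60); // 3 of 5 criteria met -> 60%
}
```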