Add support for judge repetitions in eval (#28811)

Created by Michael Sloan and Thomas

Release Notes:

- N/A

---------

Co-authored-by: Thomas <thomas@zed.dev>

Change summary

crates/eval/src/eval.rs    | 51 +++++++++++++++++++++++++++++----------
crates/eval/src/example.rs | 22 +++++++++++-----
2 files changed, 52 insertions(+), 21 deletions(-)
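In short: this adds a `--judge-repetitions` flag (default 3), runs the judge that many times on each example's final diff, prints a score line per round, and writes each round's raw judge response (plus the repository diff itself) to its own file in the run directory. A minimal standalone sketch of the flag in isolation, assuming clap's derive API as used in the diff below; the `main` body is illustrative:

```rust
use clap::Parser;

#[derive(Parser)]
struct Args {
    /// How many times to run the judge on each example run.
    #[arg(long, default_value = "3")]
    judge_repetitions: u32,
}

fn main() {
    let args = Args::parse();
    // e.g. `eval --judge-repetitions 5` overrides the default of 3.
    println!("judging each example {} times", args.judge_repetitions);
}
```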

Detailed changes

crates/eval/src/eval.rs

@@ -42,6 +42,9 @@ struct Args {
     /// Languages to run (comma-separated, e.g. "js,ts,py"). If unspecified, only Rust examples are run.
     #[arg(long, value_delimiter = ',')]
     languages: Option<Vec<String>>,
+    /// How many times to run the judge on each example run.
+    #[arg(long, default_value = "3")]
+    judge_repetitions: u32,
 }
 
 fn main() {
@@ -203,18 +206,23 @@ fn main() {
                 example.setup().await?;
             }
 
+            let judge_repetitions = args.judge_repetitions;
             let tasks = examples
                 .into_iter()
                 .map(|example| {
                     let app_state = app_state.clone();
                     let model = model.clone();
                     cx.spawn(async move |cx| {
-                        (run_example(&example, model, app_state, cx).await, example)
+                        (
+                            run_example(&example, model, app_state, judge_repetitions, cx).await,
+                            example,
+                        )
                     })
                 })
                 .collect::<Vec<_>>();
 
-            let results: Vec<(Result<JudgeOutput>, Example)> = future::join_all(tasks).await;
+            let results: Vec<(Result<Vec<Result<JudgeOutput>>>, Example)> =
+                future::join_all(tasks).await;
 
             println!("\n\n");
             println!("========================================");
@@ -229,16 +237,25 @@ fn main() {
                     Err(err) => {
                         println!("💥 {}{:?}", example.log_prefix, err);
                     }
-                    Ok(judge_output) => {
-                        const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
-
-                        println!(
-                            "{} {}{}",
-                            SCORES[judge_output.score.min(5) as usize],
-                            example.log_prefix,
-                            judge_output.score,
-                        );
-                        judge_scores.push(judge_output.score);
+                    Ok(judge_results) => {
+                        for judge_result in judge_results {
+                            match judge_result {
+                                Ok(judge_output) => {
+                                    const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
+
+                                    println!(
+                                        "{} {}{}",
+                                        SCORES[judge_output.score.min(5) as usize],
+                                        example.log_prefix,
+                                        judge_output.score,
+                                    );
+                                    judge_scores.push(judge_output.score);
+                                }
+                                Err(err) => {
+                                    println!("💥 {}{:?}", example.log_prefix, err);
+                                }
+                            }
+                        }
                     }
                 }
                 println!(
@@ -266,12 +283,18 @@ async fn run_example(
     example: &Example,
     model: Arc<dyn LanguageModel>,
     app_state: Arc<AgentAppState>,
+    judge_repetitions: u32,
     cx: &mut AsyncApp,
-) -> Result<JudgeOutput> {
+) -> Result<Vec<Result<JudgeOutput>>> {
     cx.update(|cx| example.run(model.clone(), app_state, cx))?
         .await?;
     let diff = example.repository_diff().await?;
-    example.judge(model, diff, cx).await
+
+    let judge_tasks = (0..judge_repetitions)
+        .map(|round| example.judge(model.clone(), diff.clone(), round, cx))
+        .collect::<Vec<_>>();
+
+    Ok(future::join_all(judge_tasks).await)
 }
 
 fn list_all_examples() -> Result<Vec<PathBuf>> {
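`run_example` now fans the judge out across rounds with `future::join_all` and returns every round's `Result`, so a single failed judge call no longer discards the other rounds. A standalone sketch of that pattern, assuming the `futures` crate; `judge_once`, the score values, and the error text are stand-ins for the real model call:

```rust
use futures::future;

#[derive(Debug)]
struct JudgeOutput {
    score: u32,
}

// Stand-in for one judge call against the repository diff.
async fn judge_once(diff: &str, round: u32) -> Result<JudgeOutput, String> {
    if diff.is_empty() {
        return Err(format!("round {round}: empty diff"));
    }
    Ok(JudgeOutput { score: 4 })
}

// Run the judge `repetitions` times over the same diff, keeping each
// round's Result so callers can report successes and failures per round.
async fn judge_all(diff: &str, repetitions: u32) -> Vec<Result<JudgeOutput, String>> {
    let rounds = (0..repetitions).map(|round| judge_once(diff, round));
    future::join_all(rounds).await // preserves round order
}

fn main() {
    let results = futures::executor::block_on(judge_all("diff --git a/x b/x", 3));
    for (round, result) in results.iter().enumerate() {
        match result {
            Ok(output) => println!("round {round}: score {}", output.score),
            Err(err) => println!("round {round}: 💥 {err}"),
        }
    }
}
```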

crates/eval/src/example.rs

@@ -58,6 +58,8 @@ pub struct Example {
     pub criteria: String,
     /// Markdown output file to append to
     pub output_file: Option<Arc<Mutex<File>>>,
+    /// Path to the output run directory.
+    pub run_dir: PathBuf,
     /// Path to markdown output file
     pub output_file_path: PathBuf,
     /// Prefix used for logging that identifies this example
@@ -103,6 +105,7 @@ impl Example {
             base: toml::from_str(&fs::read_to_string(&base_path)?)?,
             prompt: fs::read_to_string(prompt_path.clone())?,
             criteria: fs::read_to_string(criteria_path.clone())?,
+            run_dir: run_dir.to_path_buf(),
             output_file: None,
             output_file_path,
             log_prefix: name,
@@ -425,6 +428,10 @@ impl Example {
             println!("{}Getting repository diff", this.log_prefix);
             let repository_diff = this.repository_diff().await?;
 
+            let repository_diff_path = this.run_dir.join(format!("{}.diff", this.name));
+            let mut repository_diff_output_file = File::create(&repository_diff_path)?;
+            writeln!(&mut repository_diff_output_file, "{}", &repository_diff).log_err();
+
             println!("{}Getting diagnostics", this.log_prefix);
             let diagnostics = cx
                 .update(move |cx| {
@@ -456,6 +463,7 @@ impl Example {
         &self,
         model: Arc<dyn LanguageModel>,
         repository_diff: String,
+        judge_repetitions: u32,
         cx: &AsyncApp,
     ) -> Result<JudgeOutput> {
         let judge_prompt = include_str!("judge_prompt.hbs");
@@ -483,14 +491,14 @@ impl Example {
 
         let response = send_language_model_request(model, request, cx).await?;
 
-        let output_file_ref = self.output_file();
-        let mut output_file = output_file_ref.lock().unwrap();
+        let judge_file_path = self.run_dir.join(format!(
+            "{}_judge_{}.md",
+            self.name, // This is the eval_name
+            judge_repetitions
+        ));
 
-        writeln!(&mut output_file, "\n\n").log_err();
-        writeln!(&mut output_file, "========================================").log_err();
-        writeln!(&mut output_file, "              JUDGE OUTPUT              ").log_err();
-        writeln!(&mut output_file, "========================================").log_err();
-        writeln!(&mut output_file, "\n{}", &response).log_err();
+        let mut judge_output_file = File::create(&judge_file_path)?;
+        writeln!(&mut judge_output_file, "{}", &response).log_err();
 
         parse_judge_output(&response)
     }