Agent Eval: Initial support for running examples repeatedly (#28844)

Michael Sloan created

This approach is not ideal, as it creates a separate worktree for each repetition.

Release Notes:

- N/A

Change summary

crates/eval/src/eval.rs    | 22 +++++++++++++++++-----
crates/eval/src/example.rs | 13 ++++++++-----
2 files changed, 25 insertions(+), 10 deletions(-)

Detailed changes

crates/eval/src/eval.rs 🔗

@@ -44,6 +44,10 @@ struct Args {
     model: String,
     #[arg(long, value_delimiter = ',')]
     languages: Option<Vec<String>>,
+    /// How many times to run each example. Note that this is currently not very efficient as N
+    /// worktrees will be created for the examples.
+    #[arg(long, default_value = "1")]
+    repetitions: u32,
     /// How many times to run the judge on each example run.
     #[arg(long, default_value = "3")]
     judge_repetitions: u32,
@@ -146,12 +150,20 @@ fn main() {
                     continue;
                 }
 
-                let name_len = example.name.len();
-                if name_len > max_name_width {
-                    max_name_width = example.name.len();
-                }
+                // TODO: This creates a worktree per repetition. Ideally these examples should
+                // either be run sequentially on the same worktree, or reuse worktrees when there
+                // are more examples to run than the concurrency limit.
+                for repetition_number in 0..args.repetitions {
+                    let mut example = example.clone();
+                    example.set_repetition_number(repetition_number);
+
+                    let name_len = example.name.len();
+                    if name_len > max_name_width {
+                        max_name_width = example.name.len();
+                    }
 
-                examples.push(example);
+                    examples.push(example);
+                }
             }
 
             println!("Skipped examples: {}\n", skipped.join(", "));

crates/eval/src/example.rs 🔗

@@ -94,11 +94,7 @@ impl Example {
         let base_path = dir_path.join("base.toml");
         let prompt_path = dir_path.join("prompt.md");
         let criteria_path = dir_path.join("criteria.md");
-
-        let output_file_path = run_dir.join(format!(
-            "{}.md",
-            dir_path.file_name().unwrap().to_str().unwrap()
-        ));
+        let output_file_path = run_dir.join(format!("{}.md", name));
 
         Ok(Example {
             name: name.clone(),
@@ -112,6 +108,13 @@ impl Example {
         })
     }
 
+    pub fn set_repetition_number(&mut self, repetition_number: u32) {
+        if repetition_number > 0 {
+            self.name = format!("{}-{}", self.name, repetition_number);
+            self.output_file_path = self.run_dir.join(format!("{}.md", self.name));
+        }
+    }
+
     pub fn set_log_prefix_style(&mut self, color: &str, name_width: usize) {
         self.log_prefix = format!(
             "{}{:<width$}\x1b[0m | ",