From 9a9f2e71cac70693eae959b7d5d5dde912bd610f Mon Sep 17 00:00:00 2001 From: Michael Sloan Date: Wed, 16 Apr 2025 00:35:55 -0600 Subject: [PATCH] Agent Eval: Initial support for running examples repeatedly (#28844) Not ideal as it creates a separate worktree for each repetition Release Notes: - N/A --- crates/eval/src/eval.rs | 22 +++++++++++++++++----- crates/eval/src/example.rs | 13 ++++++++----- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/crates/eval/src/eval.rs b/crates/eval/src/eval.rs index 1e1da15464c04b61add07f1b0c473ffeee48cd45..ca6b11be0a5e5256afdfa4684a0b0ac8cabbc25b 100644 --- a/crates/eval/src/eval.rs +++ b/crates/eval/src/eval.rs @@ -44,6 +44,10 @@ struct Args { model: String, #[arg(long, value_delimiter = ',')] languages: Option<Vec<String>>, + /// How many times to run each example. Note that this is currently not very efficient as N + /// worktrees will be created for the examples. + #[arg(long, default_value = "1")] + repetitions: u32, /// How many times to run the judge on each example run. #[arg(long, default_value = "3")] judge_repetitions: u32, @@ -146,12 +150,20 @@ fn main() { continue; } - let name_len = example.name.len(); - if name_len > max_name_width { - max_name_width = example.name.len(); - } + // TODO: This creates a worktree per repetition. Ideally these examples should + // either be run sequentially on the same worktree, or reuse worktrees when there + // are more examples to run than the concurrency limit. 
+ for repetition_number in 0..args.repetitions { + let mut example = example.clone(); + example.set_repetition_number(repetition_number); + + let name_len = example.name.len(); + if name_len > max_name_width { + max_name_width = example.name.len(); + } - examples.push(example); + examples.push(example); + } } println!("Skipped examples: {}\n", skipped.join(", ")); diff --git a/crates/eval/src/example.rs b/crates/eval/src/example.rs index e69b520fb41d5e70b1fc9f9b57b8d4023e97b3de..8e7f6fc00688a9f84df2bc04d3db474e3ad54c08 100644 --- a/crates/eval/src/example.rs +++ b/crates/eval/src/example.rs @@ -94,11 +94,7 @@ impl Example { let base_path = dir_path.join("base.toml"); let prompt_path = dir_path.join("prompt.md"); let criteria_path = dir_path.join("criteria.md"); - - let output_file_path = run_dir.join(format!( - "{}.md", - dir_path.file_name().unwrap().to_str().unwrap() - )); + let output_file_path = run_dir.join(format!("{}.md", name)); Ok(Example { name: name.clone(), @@ -112,6 +108,13 @@ impl Example { }) } + pub fn set_repetition_number(&mut self, repetition_number: u32) { + if repetition_number > 0 { + self.name = format!("{}-{}", self.name, repetition_number); + self.output_file_path = self.run_dir.join(format!("{}.md", self.name)); + } + } + pub fn set_log_prefix_style(&mut self, color: &str, name_width: usize) { self.log_prefix = format!( "{}{: