Agent eval: output paths to log files at the end (#28724)

Michael Sloan created

Release Notes:

- N/A

Change summary

crates/eval/src/eval.rs    | 27 +++++++++++++--------------
crates/eval/src/example.rs |  5 ++++-
2 files changed, 17 insertions(+), 15 deletions(-)

Detailed changes

crates/eval/src/eval.rs 🔗

@@ -110,13 +110,15 @@ fn main() {
                     continue;
                 }
 
-                examples.push((example_path, example));
+                println!("{}> Logging to {:?}", example.name, example.log_file_path);
+
+                examples.push(example);
             }
             let mut repo_urls = HashSet::new();
 
             let mut clone_tasks = Vec::new();
 
-            for (_, example) in examples.iter() {
+            for example in examples.iter() {
                 let repo_url = example.base.url.clone();
                 if repo_urls.insert(repo_url.clone()) {
                     let repo_path = repo_path_for_url(&repo_url);
@@ -149,25 +151,22 @@ fn main() {
 
             future::join_all(clone_tasks).await;
 
-            for (_, example) in examples.iter() {
+            for example in examples.iter() {
                 example.setup().await?;
             }
 
             let tasks = examples
                 .into_iter()
-                .map(|(example_path, example)| {
+                .map(|example| {
                     let app_state = app_state.clone();
                     let model = model.clone();
                     cx.spawn(async move |cx| {
-                        (
-                            example_path,
-                            run_example(example, model, app_state, cx).await,
-                        )
+                        (run_example(&example, model, app_state, cx).await, example)
                     })
                 })
                 .collect::<Vec<_>>();
 
-            let results: Vec<(PathBuf, Result<JudgeOutput>)> = future::join_all(tasks).await;
+            let results: Vec<(Result<JudgeOutput>, Example)> = future::join_all(tasks).await;
 
             println!("\n\n");
             println!("========================================");
@@ -177,11 +176,11 @@ fn main() {
 
             let mut judge_scores = Vec::new();
 
-            for (example_path, result) in results {
-                let example_name = example_path.file_name().unwrap().to_string_lossy();
+            for (result, example) in results {
+                println!("📜 {:<30}: {:?}", example.name, example.log_file_path);
                 match result {
                     Err(err) => {
-                        println!("💥 {:<30}: {:?}", example_name, err);
+                        println!("💥 {:<30}: {:?}", example.name, err);
                     }
                     Ok(judge_output) => {
                         const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
@@ -189,7 +188,7 @@ fn main() {
                         println!(
                             "{} {:<30}: {}",
                             SCORES[judge_output.score.min(5) as usize],
-                            example_name,
+                            example.name,
                             judge_output.score,
                         );
                         judge_scores.push(judge_output.score);
@@ -212,7 +211,7 @@ fn main() {
 }
 
 async fn run_example(
-    mut example: Example,
+    example: &Example,
     model: Arc<dyn LanguageModel>,
     app_state: Arc<AgentAppState>,
     cx: &mut AsyncApp,

crates/eval/src/example.rs 🔗

@@ -58,6 +58,8 @@ pub struct Example {
     pub criteria: String,
     /// Markdown log file to append to
     pub log_file: Arc<Mutex<File>>,
+    /// Path to markdown log file
+    pub log_file_path: PathBuf,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
@@ -102,6 +104,7 @@ impl Example {
             prompt: fs::read_to_string(prompt_path.clone())?,
             criteria: fs::read_to_string(criteria_path.clone())?,
             log_file,
+            log_file_path,
         })
     }
 
@@ -400,7 +403,7 @@ impl Example {
     }
 
     pub async fn judge(
-        &mut self,
+        &self,
         model: Arc<dyn LanguageModel>,
         repository_diff: String,
         cx: &AsyncApp,