eval: Improve readability with colors and alignment (#28761)

Agus Zubiaga created

![CleanShot 2025-04-15 at 10 35
39@2x](https://github.com/user-attachments/assets/495d96fb-fe2f-478b-a9d6-678c1184db9a)


Release Notes:

- N/A

Change summary

crates/eval/src/eval.rs    |  77 +++++++++++++++++++++++++----
crates/eval/src/example.rs | 102 ++++++++++++++++++++++-----------------
2 files changed, 123 insertions(+), 56 deletions(-)

Detailed changes

crates/eval/src/eval.rs 🔗

@@ -26,6 +26,7 @@ use settings::{Settings, SettingsStore};
 use std::collections::HashSet;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
+use std::usize;
 use util::ResultExt as _;
 
 pub const RUNS_DIR: &str = "./crates/eval/runs";
@@ -97,8 +98,27 @@ fn main() {
             std::fs::create_dir_all(&run_dir)?;
 
             let mut examples = Vec::new();
-            for example_path in example_paths {
-                let example = Example::load_from_directory(&example_path, &run_dir)?;
+
+            const COLORS: [&str; 12] = [
+                "\x1b[31m", // Red
+                "\x1b[32m", // Green
+                "\x1b[33m", // Yellow
+                "\x1b[34m", // Blue
+                "\x1b[35m", // Magenta
+                "\x1b[36m", // Cyan
+                "\x1b[91m", // Bright Red
+                "\x1b[92m", // Bright Green
+                "\x1b[93m", // Bright Yellow
+                "\x1b[94m", // Bright Blue
+                "\x1b[95m", // Bright Magenta
+                "\x1b[96m", // Bright Cyan
+            ];
+
+            let mut max_name_width = 0;
+            let mut skipped = Vec::new();
+
+            for example_path in &example_paths {
+                let example = Example::load_from_directory(example_path, &run_dir)?;
 
                 if !example
                     .base
@@ -106,25 +126,49 @@ fn main() {
                     .as_ref()
                     .map_or(false, |lang| languages.contains(lang))
                 {
-                    println!("Skipping {}", example.name);
+                    skipped.push(example.name);
                     continue;
                 }
 
-                println!("{}> Logging to {:?}", example.name, example.log_file_path);
+                let name_len = example.name.len();
+                if name_len > max_name_width {
+                    max_name_width = example.name.len();
+                }
 
                 examples.push(example);
             }
-            let mut repo_urls = HashSet::new();
 
+            println!("Skipped examples: {}\n", skipped.join(", "));
+
+            if examples.is_empty() {
+                eprintln!("Filter matched no examples");
+                return cx.update(|cx| cx.quit());
+            }
+
+            let mut repo_urls = HashSet::new();
             let mut clone_tasks = Vec::new();
 
-            for example in examples.iter() {
+            for (i, example) in examples.iter_mut().enumerate() {
+                let color = COLORS[i % COLORS.len()].to_string();
+                example.set_log_prefix_style(&color, max_name_width);
+
+                println!(
+                    "{}Logging to: {}",
+                    example.log_prefix,
+                    example.output_file_path.display()
+                );
+
                 let repo_url = example.base.url.clone();
                 if repo_urls.insert(repo_url.clone()) {
                     let repo_path = repo_path_for_url(&repo_url);
 
                     if !repo_path.join(".git").is_dir() {
-                        println!("Cloning: {}", repo_url);
+                        println!(
+                            "{:<width$}  < {}",
+                            "↓ Cloning",
+                            repo_url,
+                            width = max_name_width
+                        );
 
                         let git_task = cx.spawn(async move |_cx| {
                             std::fs::create_dir_all(&repo_path)?;
@@ -134,7 +178,12 @@ fn main() {
 
                         clone_tasks.push(git_task);
                     } else {
-                        println!("Already cloned: {}", repo_url);
+                        println!(
+                            "{:<width$}  < {}",
+                            "✔︎ Already cloned",
+                            repo_url,
+                            width = max_name_width
+                        );
 
                         let actual_origin =
                             run_git(&repo_path, &["remote", "get-url", "origin"]).await?;
@@ -177,23 +226,27 @@ fn main() {
             let mut judge_scores = Vec::new();
 
             for (result, example) in results {
-                println!("📜 {:<30}: {:?}", example.name, example.log_file_path);
                 match result {
                     Err(err) => {
-                        println!("💥 {:<30}: {:?}", example.name, err);
+                        println!("💥 {}{:?}", example.log_prefix, err);
                     }
                     Ok(judge_output) => {
                         const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
 
                         println!(
-                            "{} {:<30}: {}",
+                            "{} {}{}",
                             SCORES[judge_output.score.min(5) as usize],
-                            example.name,
+                            example.log_prefix,
                             judge_output.score,
                         );
                         judge_scores.push(judge_output.score);
                     }
                 }
+                println!(
+                    "{}    > {}",
+                    " ".repeat(max_name_width),
+                    example.output_file_path.display()
+                );
             }
 
             let score_count = judge_scores.len();

crates/eval/src/example.rs 🔗

@@ -56,10 +56,12 @@ pub struct Example {
     pub prompt: String,
     /// Content of `criteria.md`
     pub criteria: String,
-    /// Markdown log file to append to
-    pub log_file: Arc<Mutex<File>>,
-    /// Path to markdown log file
-    pub log_file_path: PathBuf,
+    /// Markdown output file to append to
+    pub output_file: Arc<Mutex<File>>,
+    /// Path to markdown output file
+    pub output_file_path: PathBuf,
+    /// Prefix used for logging that identifies this example
+    pub log_prefix: String,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
@@ -86,28 +88,41 @@ pub struct JudgeOutput {
 impl Example {
     /// Load an example from a directory containing base.toml, prompt.md, and criteria.md
     pub fn load_from_directory(dir_path: &Path, run_dir: &Path) -> Result<Self> {
-        let name = dir_path.file_name().unwrap().to_string_lossy().to_string();
+        let name = Self::name_from_path(dir_path);
         let base_path = dir_path.join("base.toml");
         let prompt_path = dir_path.join("prompt.md");
         let criteria_path = dir_path.join("criteria.md");
 
-        let log_file_path = run_dir.join(format!(
+        let output_file_path = run_dir.join(format!(
             "{}.md",
             dir_path.file_name().unwrap().to_str().unwrap()
         ));
-        let log_file = Arc::new(Mutex::new(File::create(&log_file_path).unwrap()));
-        println!("{}> Logging to {:?}", name, log_file_path);
+        let output_file = Arc::new(Mutex::new(File::create(&output_file_path).unwrap()));
 
         Ok(Example {
-            name,
+            name: name.clone(),
             base: toml::from_str(&fs::read_to_string(&base_path)?)?,
             prompt: fs::read_to_string(prompt_path.clone())?,
             criteria: fs::read_to_string(criteria_path.clone())?,
-            log_file,
-            log_file_path,
+            output_file,
+            output_file_path,
+            log_prefix: name,
         })
     }
 
+    pub fn set_log_prefix_style(&mut self, color: &str, name_width: usize) {
+        self.log_prefix = format!(
+            "{}{:<width$}\x1b[0m | ",
+            color,
+            self.name,
+            width = name_width
+        );
+    }
+
+    pub fn name_from_path(path: &Path) -> String {
+        path.file_name().unwrap().to_string_lossy().to_string()
+    }
+
     pub fn worktree_path(&self) -> PathBuf {
         Path::new(WORKTREES_DIR)
             .canonicalize()
@@ -120,7 +135,7 @@ impl Example {
     pub async fn setup(&self) -> Result<()> {
         let repo_path = repo_path_for_url(&self.base.url);
 
-        println!("{}> Fetching", self.name);
+        println!("{}Fetching", self.log_prefix);
 
         run_git(
             &repo_path,
@@ -131,7 +146,7 @@ impl Example {
         let worktree_path = self.worktree_path();
 
         if worktree_path.is_dir() {
-            println!("{}> Resetting existing worktree", self.name);
+            println!("{}Resetting existing worktree", self.log_prefix);
 
             // TODO: consider including "-x" to remove ignored files. The downside of this is that
             // it will also remove build artifacts, and so prevent incremental reuse there.
@@ -139,7 +154,7 @@ impl Example {
             run_git(&worktree_path, &["reset", "--hard", "HEAD"]).await?;
             run_git(&worktree_path, &["checkout", &self.base.revision]).await?;
         } else {
-            println!("{}> Creating worktree", self.name);
+            println!("{}Creating worktree", self.log_prefix);
 
             let worktree_path_string = worktree_path.to_string_lossy().to_string();
 
@@ -235,7 +250,7 @@ impl Example {
 
                 // TODO: remove this once the diagnostics tool waits for new diagnostics
                 cx.background_executor().timer(Duration::new(5, 0)).await;
-                wait_for_lang_server(&lsp_store, this.name.clone(), cx).await?;
+                wait_for_lang_server(&lsp_store, this.log_prefix.clone(), cx).await?;
 
                 lsp_store.update(cx, |lsp_store, cx| {
                     lsp_open_handle.update(cx, |buffer, cx| {
@@ -272,11 +287,11 @@ impl Example {
                 thread_store.update(cx, |thread_store, cx| thread_store.create_thread(cx))?;
 
             {
-                let mut log_file = this.log_file.lock().unwrap();
-                writeln!(&mut log_file, "👤 USER:").log_err();
-                writeln!(&mut log_file, "{}", this.prompt).log_err();
-                writeln!(&mut log_file, "🤖 ASSISTANT:").log_err();
-                log_file.flush().log_err();
+                let mut output_file = this.output_file.lock().unwrap();
+                writeln!(&mut output_file, "👤 USER:").log_err();
+                writeln!(&mut output_file, "{}", this.prompt).log_err();
+                writeln!(&mut output_file, "🤖 ASSISTANT:").log_err();
+                output_file.flush().log_err();
             }
 
             let tool_use_counts: Arc<Mutex<HashMap<Arc<str>, u32>>> =
@@ -289,8 +304,8 @@ impl Example {
             });
 
             let event_handler_task = cx.spawn({
-                let log_file = this.log_file.clone();
-                let name = this.name.clone();
+                let output_file = this.output_file.clone();
+                let log_prefix = this.log_prefix.clone();
                 let tool_use_counts = tool_use_counts.clone();
                 let thread = thread.downgrade();
                 async move |cx| {
@@ -305,7 +320,7 @@ impl Example {
                             return Err(anyhow!("ThreadEvent channel ended early"));
                         };
 
-                        let mut log_file = log_file.lock().unwrap();
+                        let mut output_file = output_file.lock().unwrap();
 
                         match event {
                             ThreadEvent::Stopped(reason) => match reason {
@@ -324,15 +339,15 @@ impl Example {
                                 break Err(anyhow!(thread_error.clone()));
                             }
                             ThreadEvent::StreamedAssistantText(_, chunk) => {
-                                write!(&mut log_file, "{}", chunk).log_err();
+                                write!(&mut output_file, "{}", chunk).log_err();
                             }
                             ThreadEvent::StreamedAssistantThinking(_, chunk) => {
-                                write!(&mut log_file, "{}", chunk).log_err();
+                                write!(&mut output_file, "{}", chunk).log_err();
                             }
                             ThreadEvent::UsePendingTools { tool_uses } => {
-                                writeln!(&mut log_file, "\n\nUSING TOOLS:").log_err();
+                                writeln!(&mut output_file, "\n\nUSING TOOLS:").log_err();
                                 for tool_use in tool_uses {
-                                    writeln!(&mut log_file, "{}: {}", tool_use.name, tool_use.input)
+                                    writeln!(&mut output_file, "{}: {}", tool_use.name, tool_use.input)
                                         .log_err();
                                 }
                             }
@@ -343,12 +358,12 @@ impl Example {
                             } => {
                                 if let Some(tool_use) = pending_tool_use {
                                     let message = format!("TOOL FINISHED: {}", tool_use.name);
-                                    println!("{name}> {message}");
-                                    writeln!(&mut log_file, "\n{}", message).log_err();
+                                    println!("{}{message}", log_prefix);
+                                    writeln!(&mut output_file, "\n{}", message).log_err();
                                 }
                                 thread.update(cx, |thread, _cx| {
                                     if let Some(tool_result) = thread.tool_result(&tool_use_id) {
-                                        writeln!(&mut log_file, "\n{}\n", tool_result.content).log_err();
+                                        writeln!(&mut output_file, "\n{}\n", tool_result.content).log_err();
                                         let mut tool_use_counts = tool_use_counts.lock().unwrap();
                                         *tool_use_counts
                                             .entry(tool_result.tool_name.clone())
@@ -359,7 +374,7 @@ impl Example {
                             _ => {}
                         }
 
-                        log_file.flush().log_err();
+                        output_file.flush().log_err();
                     }
                 }
             });
@@ -373,7 +388,7 @@ impl Example {
             event_handler_task.await?;
 
             if let Some((_, lsp_store)) = lsp_open_handle_and_store.as_ref() {
-                wait_for_lang_server(lsp_store, this.name.clone(), cx).await?;
+                wait_for_lang_server(lsp_store, this.log_prefix.clone(), cx).await?;
             }
 
             let repository_diff = this.repository_diff().await?;
@@ -433,13 +448,13 @@ impl Example {
 
         let response = send_language_model_request(model, request, cx).await?;
 
-        let mut log_file = self.log_file.lock().unwrap();
+        let mut output_file = self.output_file.lock().unwrap();
 
-        writeln!(&mut log_file, "\n\n").log_err();
-        writeln!(&mut log_file, "========================================").log_err();
-        writeln!(&mut log_file, "              JUDGE OUTPUT              ").log_err();
-        writeln!(&mut log_file, "========================================").log_err();
-        writeln!(&mut log_file, "\n{}", &response).log_err();
+        writeln!(&mut output_file, "\n\n").log_err();
+        writeln!(&mut output_file, "========================================").log_err();
+        writeln!(&mut output_file, "              JUDGE OUTPUT              ").log_err();
+        writeln!(&mut output_file, "========================================").log_err();
+        writeln!(&mut output_file, "\n{}", &response).log_err();
 
         parse_judge_output(&response)
     }
@@ -453,7 +468,7 @@ impl Example {
 
 fn wait_for_lang_server(
     lsp_store: &Entity<LspStore>,
-    name: String,
+    log_prefix: String,
     cx: &mut AsyncApp,
 ) -> Task<Result<()>> {
     if cx
@@ -464,13 +479,13 @@ fn wait_for_lang_server(
         return Task::ready(anyhow::Ok(()));
     }
 
-    println!("{}> ⏵ Waiting for language server", name);
+    println!("{}⏵ Waiting for language server", log_prefix);
 
     let (mut tx, mut rx) = mpsc::channel(1);
 
     let subscription =
         cx.subscribe(&lsp_store, {
-            let name = name.clone();
+            let log_prefix = log_prefix.clone();
             move |lsp_store, event, cx| {
                 match event {
                     project::LspStoreEvent::LanguageServerUpdate {
@@ -482,7 +497,7 @@ fn wait_for_lang_server(
                                 },
                             ),
                         ..
-                    } => println!("{name}> ⟲ {message}"),
+                    } => println!("{}⟲ {message}", log_prefix),
                     _ => {}
                 }
 
@@ -496,7 +511,7 @@ fn wait_for_lang_server(
         let timeout = cx.background_executor().timer(Duration::new(60 * 5, 0));
         let result = futures::select! {
             _ = rx.next() => {
-                println!("{}> ⚑ Language server idle", name);
+                println!("{}⚑ Language server idle", log_prefix);
                 anyhow::Ok(())
             },
             _ = timeout.fuse() => {
@@ -623,7 +638,6 @@ pub async fn send_language_model_request(
             while let Some(chunk_result) = stream.stream.next().await {
                 match chunk_result {
                     Ok(chunk_str) => {
-                        print!("{}", &chunk_str);
                         full_response.push_str(&chunk_str);
                     }
                     Err(err) => {