Start tracking tool failure rates in eval (#29122)

Antonio Scandurra created 9 months ago

This pull request makes the eval print every tool that was used, along
with its failure rate. The goal should be to minimize those failure rates.

@tmickleydoyle: this also changes the telemetry event to report
`tool_metrics` (per-tool `use_counts` and `failure_counts`) instead of
`tool_use_counts`. I'd love to be able to plot failure rates by tool and,
hopefully, see those percentages go down. Can we do that with the data
this pull request tracks?

Release Notes:

- N/A

Change summary

crates/eval/src/eval.rs         | 124 +++++++++++++++++++---------------
crates/eval/src/example.rs      |  20 ++---
crates/eval/src/tool_metrics.rs | 102 ++++++++++++++++++++++++++++
3 files changed, 178 insertions(+), 68 deletions(-)

Detailed changes

crates/eval/src/eval.rs 🔗

@@ -1,13 +1,15 @@
 mod example;
 mod ids;
+mod tool_metrics;
 
-use client::{Client, ProxySettings, UserStore};
 pub(crate) use example::*;
-use telemetry;
+pub(crate) use tool_metrics::*;
 
 use ::fs::RealFs;
 use anyhow::{Result, anyhow};
 use clap::Parser;
+use client::{Client, ProxySettings, UserStore};
+use collections::HashSet;
 use extension::ExtensionHostProxy;
 use futures::{StreamExt, future};
 use gpui::http_client::{Uri, read_proxy_from_env};
@@ -22,7 +24,6 @@ use prompt_store::PromptBuilder;
 use release_channel::AppVersion;
 use reqwest_client::ReqwestClient;
 use settings::{Settings, SettingsStore};
-use std::collections::HashSet;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use std::usize;
@@ -92,6 +93,8 @@ fn main() {
             .telemetry()
             .start(system_id, installation_id, session_id, cx);
 
+        let mut cumulative_tool_metrics = ToolMetrics::default();
+
         let model_registry = LanguageModelRegistry::read_global(cx);
         let model = find_model("claude-3-7-sonnet-latest", model_registry, cx).unwrap();
         let model_provider_id = model.provider_id();
@@ -177,7 +180,7 @@ fn main() {
                 return cx.update(|cx| cx.quit());
             }
 
-            let mut repo_urls = HashSet::new();
+            let mut repo_urls = HashSet::default();
             let mut clone_tasks = Vec::new();
 
             for (i, example) in examples.iter_mut().enumerate() {
@@ -244,9 +247,24 @@ fn main() {
                 let model = model.clone();
                 let example = example.clone();
                 cx.spawn(async move |cx| {
-                    let result =
-                        run_example(&example, model, app_state, judge_repetitions, cx).await;
-                    (result, example)
+                    let result = async {
+                        let run_output = cx
+                            .update(|cx| example.run(model.clone(), app_state.clone(), cx))?
+                            .await?;
+                        let judge_tasks = (0..judge_repetitions).map(|round| {
+                            run_judge_repetition(
+                                example.clone(),
+                                model.clone(),
+                                &run_output,
+                                round,
+                                cx,
+                            )
+                        });
+                        let judge_outputs = future::join_all(judge_tasks).await;
+                        anyhow::Ok((run_output, judge_outputs))
+                    }
+                    .await;
+                    (example, result)
                 })
             });
 
@@ -256,52 +274,58 @@ fn main() {
                 .await;
 
             println!("\n\n");
-            println!("========================================");
-            println!("              EVAL RESULTS              ");
-            println!("========================================");
-            println!("");
+            print_header("EVAL RESULTS");
 
             let mut diff_scores = Vec::new();
             let mut thread_scores = Vec::new();
             let mut error_count = 0;
 
-            for (result, example) in results {
+            for (example, result) in results {
+                print_header(&example.name);
+
                 match result {
                     Err(err) => {
                         println!("💥 {}{:?}", example.log_prefix, err);
                         error_count += 1;
                     }
-                    Ok(judge_results) => {
-                        for judge_result in judge_results {
+                    Ok((run_output, judge_results)) => {
+                        cumulative_tool_metrics.merge(&run_output.tool_metrics);
+
+                        println!("┌───────┬──────┬────────┐");
+                        println!("│ Judge │ Diff │ Thread │");
+                        println!("├───────┼──────┼────────┤");
+
+                        for (i, judge_result) in judge_results.iter().enumerate() {
                             match judge_result {
                                 Ok(judge_output) => {
-                                    const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
-                                    let diff_score: u32 = judge_output.diff.score;
-                                    let score_index = (diff_score.min(5)) as usize;
+                                    let diff_score = judge_output.diff.score;
+                                    diff_scores.push(diff_score);
+
+                                    let thread_display = if let Some(thread) = &judge_output.thread
+                                    {
+                                        let thread_score = thread.score;
+                                        thread_scores.push(thread_score);
+                                        format!("{}", thread_score)
+                                    } else {
+                                        "N/A".to_string()
+                                    };
 
                                     println!(
-                                        "{} {}{} (Diff)",
-                                        SCORES[score_index],
-                                        example.log_prefix,
-                                        judge_output.diff.score,
+                                        "|{:^7}│{:^6}│{:^8}│",
+                                        i + 1,
+                                        diff_score,
+                                        thread_display
                                     );
-                                    diff_scores.push(judge_output.diff.score);
-
-                                    if let Some(thread) = judge_output.thread {
-                                        let process_score: u32 = thread.score;
-                                        let score_index = (process_score.min(5)) as usize;
-                                        println!(
-                                            "{} {}{} (Thread)",
-                                            SCORES[score_index], example.log_prefix, thread.score,
-                                        );
-                                        thread_scores.push(thread.score);
-                                    }
                                 }
                                 Err(err) => {
-                                    println!("💥 {}{:?}", example.log_prefix, err);
+                                    println!("|{:^7}│{:^6}│{:^8}│{:?}", i + 1, "N/A", "N/A", err);
                                 }
                             }
                         }
+
+                        println!("└───────┴──────┴────────┘");
+
+                        println!("{}", run_output.tool_metrics);
                     }
                 }
                 println!(
@@ -341,6 +365,9 @@ fn main() {
                 }
             }
 
+            print_header("CUMULATIVE TOOL METRICS");
+            println!("{}", cumulative_tool_metrics);
+
             std::thread::sleep(std::time::Duration::from_secs(2));
 
             app_state.client.telemetry().flush_events();
@@ -351,27 +378,6 @@ fn main() {
     });
 }
 
-async fn run_example(
-    example: &Example,
-    model: Arc<dyn LanguageModel>,
-    app_state: Arc<AgentAppState>,
-    judge_repetitions: u32,
-    cx: &mut AsyncApp,
-) -> Result<Vec<Result<JudgeOutput>>> {
-    let run_output = cx
-        .update(|cx| example.run(model.clone(), app_state.clone(), cx))?
-        .await?;
-
-    let judge_tasks = (0..judge_repetitions)
-        .map(|round| run_judge_repetition(example.clone(), model.clone(), &run_output, round, cx));
-
-    let results = future::join_all(judge_tasks).await;
-
-    app_state.client.telemetry().flush_events();
-
-    Ok(results)
-}
-
 fn list_all_examples() -> Result<Vec<PathBuf>> {
     let path = std::fs::canonicalize(EXAMPLES_DIR).unwrap();
     let entries = std::fs::read_dir(path).unwrap();
@@ -566,7 +572,7 @@ async fn run_judge_repetition(
                 diff_analysis = judge_output.diff.analysis,
                 thread_score = thread.score,
                 thread_analysis = thread.analysis,
-                tool_use_counts = run_output.tool_use_counts,
+                tool_metrics = run_output.tool_metrics,
                 response_count = run_output.response_count,
                 token_usage = run_output.token_usage,
                 model = model.telemetry_id(),
@@ -585,7 +591,7 @@ async fn run_judge_repetition(
                 round = round,
                 diff_score = judge_output.diff.score,
                 diff_analysis = judge_output.diff.analysis,
-                tool_use_counts = run_output.tool_use_counts,
+                tool_metrics = run_output.tool_metrics,
                 response_count = run_output.response_count,
                 token_usage = run_output.token_usage,
                 model = model.telemetry_id(),
@@ -601,3 +607,9 @@ async fn run_judge_repetition(
 
     judge_result
 }
+
+fn print_header(header: &str) {
+    println!("\n========================================");
+    println!("{:^40}", header);
+    println!("========================================\n");
+}

crates/eval/src/example.rs 🔗

@@ -1,8 +1,8 @@
+use crate::{AgentAppState, ToolMetrics};
 use agent::{ThreadEvent, ThreadStore};
 use anyhow::{Context as _, Result, anyhow};
 use assistant_tool::ToolWorkingSet;
 use client::proto::LspWorkProgress;
-use collections::HashMap;
 use dap::DapRegistry;
 use futures::channel::mpsc;
 use futures::{FutureExt, StreamExt as _, select_biased};
@@ -32,8 +32,6 @@ use util::command::new_smol_command;
 use util::markdown::MarkdownString;
 use util::serde::default_true;
 
-use crate::AgentAppState;
-
 pub const EXAMPLES_DIR: &str = "./crates/eval/examples";
 pub const REPOS_DIR: &str = "./crates/eval/repos";
 pub const WORKTREES_DIR: &str = "./crates/eval/worktrees";
@@ -88,7 +86,7 @@ pub struct RunOutput {
     pub diagnostics_after: Option<String>,
     pub response_count: usize,
     pub token_usage: TokenUsage,
-    pub tool_use_counts: HashMap<Arc<str>, u32>,
+    pub tool_metrics: ToolMetrics,
     pub last_request: LanguageModelRequest,
 }
 
@@ -351,8 +349,7 @@ impl Example {
                 });
             })?;
 
-            let tool_use_counts: Arc<Mutex<HashMap<Arc<str>, u32>>> =
-                Mutex::new(HashMap::default()).into();
+            let tool_metrics = Arc::new(Mutex::new(ToolMetrics::default()));
 
             let (thread_event_tx, mut thread_event_rx) = mpsc::unbounded();
 
@@ -362,7 +359,7 @@ impl Example {
 
             let event_handler_task = cx.spawn({
                 let log_prefix = this.log_prefix.clone();
-                let tool_use_counts = tool_use_counts.clone();
+                let tool_metrics = tool_metrics.clone();
                 let thread = thread.downgrade();
                 async move |cx| {
                     loop {
@@ -405,6 +402,7 @@ impl Example {
                             } => {
                                 thread.update(cx, |thread, _cx| {
                                     if let Some(tool_use) = pending_tool_use {
+                                        let mut tool_metrics = tool_metrics.lock().unwrap();
                                         if let Some(tool_result) = thread.tool_result(&tool_use_id) {
                                             let message = if tool_result.is_error {
                                                 format!("TOOL FAILED: {}", tool_use.name)
@@ -412,13 +410,11 @@ impl Example {
                                                 format!("TOOL FINISHED: {}", tool_use.name)
                                             };
                                             println!("{log_prefix}{message}");
-                                            let mut tool_use_counts = tool_use_counts.lock().unwrap();
-                                            *tool_use_counts
-                                                .entry(tool_result.tool_name.clone())
-                                                .or_insert(0) += 1;
+                                            tool_metrics.insert(tool_result.tool_name.clone(), !tool_result.is_error);
                                         } else {
                                             let message = format!("TOOL FINISHED WITHOUT RESULT: {}", tool_use.name);
                                             println!("{log_prefix}{message}");
+                                            tool_metrics.insert(tool_use.name.clone(), true);
                                         }
                                     }
                                 })?;
@@ -501,7 +497,7 @@ impl Example {
                     diagnostics_after,
                     response_count,
                     token_usage: thread.cumulative_token_usage(),
-                    tool_use_counts: tool_use_counts.lock().unwrap().clone(),
+                    tool_metrics: tool_metrics.lock().unwrap().clone(),
                     last_request,
                 }
             })

crates/eval/src/tool_metrics.rs 🔗

@@ -0,0 +1,102 @@
+use collections::HashMap;
+use serde::{Deserialize, Serialize};
+use std::{fmt::Display, sync::Arc};
+
+#[derive(Debug, Default, Clone, Serialize, Deserialize)]
+pub struct ToolMetrics {
+    pub use_counts: HashMap<Arc<str>, u32>,
+    pub failure_counts: HashMap<Arc<str>, u32>,
+}
+
+impl ToolMetrics {
+    pub fn insert(&mut self, tool_name: Arc<str>, succeeded: bool) {
+        *self.use_counts.entry(tool_name.clone()).or_insert(0) += 1;
+        if !succeeded {
+            *self.failure_counts.entry(tool_name).or_insert(0) += 1;
+        }
+    }
+
+    pub fn merge(&mut self, other: &ToolMetrics) {
+        for (tool_name, use_count) in &other.use_counts {
+            *self.use_counts.entry(tool_name.clone()).or_insert(0) += use_count;
+        }
+        for (tool_name, failure_count) in &other.failure_counts {
+            *self.failure_counts.entry(tool_name.clone()).or_insert(0) += failure_count;
+        }
+    }
+}
+
+impl Display for ToolMetrics {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut failure_rates: Vec<(Arc<str>, f64)> = Vec::new();
+
+        for (tool_name, use_count) in &self.use_counts {
+            let failure_count = self.failure_counts.get(tool_name).cloned().unwrap_or(0);
+            if *use_count > 0 {
+                let failure_rate = failure_count as f64 / *use_count as f64;
+                failure_rates.push((tool_name.clone(), failure_rate));
+            }
+        }
+
+        // Sort by failure rate descending
+        failure_rates.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
+
+        // Table dimensions
+        let tool_width = 30;
+        let count_width = 10;
+        let rate_width = 10;
+
+        // Write table top border
+        writeln!(
+            f,
+            "┌{}┬{}┬{}┬{}┐",
+            "─".repeat(tool_width),
+            "─".repeat(count_width),
+            "─".repeat(count_width),
+            "─".repeat(rate_width)
+        )?;
+
+        // Write header row
+        writeln!(
+            f,
+            "│{:^30}│{:^10}│{:^10}│{:^10}│",
+            "Tool", "Uses", "Failures", "Rate"
+        )?;
+
+        // Write header-data separator
+        writeln!(
+            f,
+            "├{}┼{}┼{}┼{}┤",
+            "─".repeat(tool_width),
+            "─".repeat(count_width),
+            "─".repeat(count_width),
+            "─".repeat(rate_width)
+        )?;
+
+        // Write data rows
+        for (tool_name, failure_rate) in failure_rates {
+            let use_count = self.use_counts.get(&tool_name).cloned().unwrap_or(0);
+            let failure_count = self.failure_counts.get(&tool_name).cloned().unwrap_or(0);
+            writeln!(
+                f,
+                "│{:^30}│{:^10}│{:^10}│{:^10}│",
+                tool_name,
+                use_count,
+                failure_count,
+                format!("{}%", (failure_rate * 100.0).round())
+            )?;
+        }
+
+        // Write table bottom border
+        writeln!(
+            f,
+            "└{}┴{}┴{}┴{}┘",
+            "─".repeat(tool_width),
+            "─".repeat(count_width),
+            "─".repeat(count_width),
+            "─".repeat(rate_width)
+        )?;
+
+        Ok(())
+    }
+}