diff --git a/crates/eval/src/eval.rs b/crates/eval/src/eval.rs index 90d11f616b612d5b5eb6f7f83f4277b718bbd6f6..fca5f1d6044f14d7559808c26e2d650c7f7b815c 100644 --- a/crates/eval/src/eval.rs +++ b/crates/eval/src/eval.rs @@ -1,13 +1,15 @@ mod example; mod ids; +mod tool_metrics; -use client::{Client, ProxySettings, UserStore}; pub(crate) use example::*; -use telemetry; +pub(crate) use tool_metrics::*; use ::fs::RealFs; use anyhow::{Result, anyhow}; use clap::Parser; +use client::{Client, ProxySettings, UserStore}; +use collections::HashSet; use extension::ExtensionHostProxy; use futures::{StreamExt, future}; use gpui::http_client::{Uri, read_proxy_from_env}; @@ -22,7 +24,6 @@ use prompt_store::PromptBuilder; use release_channel::AppVersion; use reqwest_client::ReqwestClient; use settings::{Settings, SettingsStore}; -use std::collections::HashSet; use std::path::{Path, PathBuf}; use std::sync::Arc; use std::usize; @@ -92,6 +93,8 @@ fn main() { .telemetry() .start(system_id, installation_id, session_id, cx); + let mut cumulative_tool_metrics = ToolMetrics::default(); + let model_registry = LanguageModelRegistry::read_global(cx); let model = find_model("claude-3-7-sonnet-latest", model_registry, cx).unwrap(); let model_provider_id = model.provider_id(); @@ -177,7 +180,7 @@ fn main() { return cx.update(|cx| cx.quit()); } - let mut repo_urls = HashSet::new(); + let mut repo_urls = HashSet::default(); let mut clone_tasks = Vec::new(); for (i, example) in examples.iter_mut().enumerate() { @@ -244,9 +247,24 @@ fn main() { let model = model.clone(); let example = example.clone(); cx.spawn(async move |cx| { - let result = - run_example(&example, model, app_state, judge_repetitions, cx).await; - (result, example) + let result = async { + let run_output = cx + .update(|cx| example.run(model.clone(), app_state.clone(), cx))? + .await?; + let judge_tasks = (0..judge_repetitions).map(|round| { + run_judge_repetition( + example.clone(), + model.clone(), + &run_output, + round, + cx, + ) + }); + let judge_outputs = future::join_all(judge_tasks).await; + anyhow::Ok((run_output, judge_outputs)) + } + .await; + (example, result) }) }); @@ -256,52 +274,58 @@ fn main() { .await; println!("\n\n"); - println!("========================================"); - println!(" EVAL RESULTS "); - println!("========================================"); - println!(""); + print_header("EVAL RESULTS"); let mut diff_scores = Vec::new(); let mut thread_scores = Vec::new(); let mut error_count = 0; - for (result, example) in results { + for (example, result) in results { + print_header(&example.name); + match result { Err(err) => { println!("💥 {}{:?}", example.log_prefix, err); error_count += 1; } - Ok(judge_results) => { - for judge_result in judge_results { + Ok((run_output, judge_results)) => { + cumulative_tool_metrics.merge(&run_output.tool_metrics); + + println!("┌───────┬──────┬────────┐"); + println!("│ Judge │ Diff │ Thread │"); + println!("├───────┼──────┼────────┤"); + + for (i, judge_result) in judge_results.iter().enumerate() { match judge_result { Ok(judge_output) => { - const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"]; - let diff_score: u32 = judge_output.diff.score; - let score_index = (diff_score.min(5)) as usize; + let diff_score = judge_output.diff.score; + diff_scores.push(diff_score); + + let thread_display = if let Some(thread) = &judge_output.thread + { + let thread_score = thread.score; + thread_scores.push(thread_score); + format!("{}", thread_score) + } else { + "N/A".to_string() + }; println!( - "{} {}{} (Diff)", - SCORES[score_index], - example.log_prefix, - judge_output.diff.score, + "|{:^7}│{:^6}│{:^8}│", + i + 1, + diff_score, + thread_display ); - diff_scores.push(judge_output.diff.score); - - if let Some(thread) = judge_output.thread { - let process_score: u32 = thread.score; - let score_index = (process_score.min(5)) as usize; - println!( - "{} {}{} (Thread)", - SCORES[score_index], example.log_prefix, thread.score, - ); - thread_scores.push(thread.score); - } } Err(err) => { - println!("💥 {}{:?}", example.log_prefix, err); + println!("|{:^7}│{:^6}│{:^8}│{:?}", i + 1, "N/A", "N/A", err); } } } + + println!("└───────┴──────┴────────┘"); + + println!("{}", run_output.tool_metrics); } } println!( @@ -341,6 +365,9 @@ fn main() { } } + print_header("CUMULATIVE TOOL METRICS"); + println!("{}", cumulative_tool_metrics); + std::thread::sleep(std::time::Duration::from_secs(2)); app_state.client.telemetry().flush_events(); @@ -351,27 +378,6 @@ fn main() { }); } -async fn run_example( - example: &Example, - model: Arc, - app_state: Arc, - judge_repetitions: u32, - cx: &mut AsyncApp, -) -> Result>> { - let run_output = cx - .update(|cx| example.run(model.clone(), app_state.clone(), cx))? - .await?; - - let judge_tasks = (0..judge_repetitions) - .map(|round| run_judge_repetition(example.clone(), model.clone(), &run_output, round, cx)); - - let results = future::join_all(judge_tasks).await; - - app_state.client.telemetry().flush_events(); - - Ok(results) -} - fn list_all_examples() -> Result> { let path = std::fs::canonicalize(EXAMPLES_DIR).unwrap(); let entries = std::fs::read_dir(path).unwrap(); @@ -566,7 +572,7 @@ async fn run_judge_repetition( diff_analysis = judge_output.diff.analysis, thread_score = thread.score, thread_analysis = thread.analysis, - tool_use_counts = run_output.tool_use_counts, + tool_metrics = run_output.tool_metrics, response_count = run_output.response_count, token_usage = run_output.token_usage, model = model.telemetry_id(), @@ -585,7 +591,7 @@ async fn run_judge_repetition( round = round, diff_score = judge_output.diff.score, diff_analysis = judge_output.diff.analysis, - tool_use_counts = run_output.tool_use_counts, + tool_metrics = run_output.tool_metrics, response_count = run_output.response_count, token_usage = run_output.token_usage, model = model.telemetry_id(), @@ -601,3 +607,9 @@ async fn run_judge_repetition( judge_result } + +fn print_header(header: &str) { + println!("\n========================================"); + println!("{:^40}", header); + println!("========================================\n"); +} diff --git a/crates/eval/src/example.rs b/crates/eval/src/example.rs index 1f50885e8fbc1c04c245687d690dd0b1c071499a..5aff8522defe0fd6809b4f5f2c1e331e0b9f88bc 100644 --- a/crates/eval/src/example.rs +++ b/crates/eval/src/example.rs @@ -1,8 +1,8 @@ +use crate::{AgentAppState, ToolMetrics}; use agent::{ThreadEvent, ThreadStore}; use anyhow::{Context as _, Result, anyhow}; use assistant_tool::ToolWorkingSet; use client::proto::LspWorkProgress; -use collections::HashMap; use dap::DapRegistry; use futures::channel::mpsc; use futures::{FutureExt, StreamExt as _, select_biased}; @@ -32,8 +32,6 @@ use util::command::new_smol_command; use util::markdown::MarkdownString; use util::serde::default_true; -use crate::AgentAppState; - pub const EXAMPLES_DIR: &str = "./crates/eval/examples"; pub const REPOS_DIR: &str = "./crates/eval/repos"; pub const WORKTREES_DIR: &str = "./crates/eval/worktrees"; @@ -88,7 +86,7 @@ pub struct RunOutput { pub diagnostics_after: Option, pub response_count: usize, pub token_usage: TokenUsage, - pub tool_use_counts: HashMap, u32>, + pub tool_metrics: ToolMetrics, pub last_request: LanguageModelRequest, } @@ -351,8 +349,7 @@ impl Example { }); })?; - let tool_use_counts: Arc, u32>>> = - Mutex::new(HashMap::default()).into(); + let tool_metrics = Arc::new(Mutex::new(ToolMetrics::default())); let (thread_event_tx, mut thread_event_rx) = mpsc::unbounded(); @@ -362,7 +359,7 @@ impl Example { let event_handler_task = cx.spawn({ let log_prefix = this.log_prefix.clone(); - let tool_use_counts = tool_use_counts.clone(); + let tool_metrics = tool_metrics.clone(); let thread = thread.downgrade(); async move |cx| { loop { @@ -405,6 +402,7 @@ impl Example { } => { thread.update(cx, |thread, _cx| { if let Some(tool_use) = pending_tool_use { + let mut tool_metrics = tool_metrics.lock().unwrap(); if let Some(tool_result) = thread.tool_result(&tool_use_id) { let message = if tool_result.is_error { format!("TOOL FAILED: {}", tool_use.name) @@ -412,13 +410,11 @@ impl Example { format!("TOOL FINISHED: {}", tool_use.name) }; println!("{log_prefix}{message}"); - let mut tool_use_counts = tool_use_counts.lock().unwrap(); - *tool_use_counts - .entry(tool_result.tool_name.clone()) - .or_insert(0) += 1; + tool_metrics.insert(tool_result.tool_name.clone(), !tool_result.is_error); } else { let message = format!("TOOL FINISHED WITHOUT RESULT: {}", tool_use.name); println!("{log_prefix}{message}"); + tool_metrics.insert(tool_use.name.clone(), true); } } })?; @@ -501,7 +497,7 @@ impl Example { diagnostics_after, response_count, token_usage: thread.cumulative_token_usage(), - tool_use_counts: tool_use_counts.lock().unwrap().clone(), + tool_metrics: tool_metrics.lock().unwrap().clone(), last_request, } }) diff --git a/crates/eval/src/tool_metrics.rs b/crates/eval/src/tool_metrics.rs new file mode 100644 index 0000000000000000000000000000000000000000..e576cca822a2858e3499dfe4a86377d575562528 --- /dev/null +++ b/crates/eval/src/tool_metrics.rs @@ -0,0 +1,102 @@ +use collections::HashMap; +use serde::{Deserialize, Serialize}; +use std::{fmt::Display, sync::Arc}; + +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct ToolMetrics { + pub use_counts: HashMap, u32>, + pub failure_counts: HashMap, u32>, +} + +impl ToolMetrics { + pub fn insert(&mut self, tool_name: Arc, succeeded: bool) { + *self.use_counts.entry(tool_name.clone()).or_insert(0) += 1; + if !succeeded { + *self.failure_counts.entry(tool_name).or_insert(0) += 1; + } + } + + pub fn merge(&mut self, other: &ToolMetrics) { + for (tool_name, use_count) in &other.use_counts { + *self.use_counts.entry(tool_name.clone()).or_insert(0) += use_count; + } + for (tool_name, failure_count) in &other.failure_counts { + *self.failure_counts.entry(tool_name.clone()).or_insert(0) += failure_count; + } + } +} + +impl Display for ToolMetrics { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let mut failure_rates: Vec<(Arc, f64)> = Vec::new(); + + for (tool_name, use_count) in &self.use_counts { + let failure_count = self.failure_counts.get(tool_name).cloned().unwrap_or(0); + if *use_count > 0 { + let failure_rate = failure_count as f64 / *use_count as f64; + failure_rates.push((tool_name.clone(), failure_rate)); + } + } + + // Sort by failure rate descending + failure_rates.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); + + // Table dimensions + let tool_width = 30; + let count_width = 10; + let rate_width = 10; + + // Write table top border + writeln!( + f, + "┌{}┬{}┬{}┬{}┐", + "─".repeat(tool_width), + "─".repeat(count_width), + "─".repeat(count_width), + "─".repeat(rate_width) + )?; + + // Write header row + writeln!( + f, + "│{:^30}│{:^10}│{:^10}│{:^10}│", + "Tool", "Uses", "Failures", "Rate" + )?; + + // Write header-data separator + writeln!( + f, + "├{}┼{}┼{}┼{}┤", + "─".repeat(tool_width), + "─".repeat(count_width), + "─".repeat(count_width), + "─".repeat(rate_width) + )?; + + // Write data rows + for (tool_name, failure_rate) in failure_rates { + let use_count = self.use_counts.get(&tool_name).cloned().unwrap_or(0); + let failure_count = self.failure_counts.get(&tool_name).cloned().unwrap_or(0); + writeln!( + f, + "│{:^30}│{:^10}│{:^10}│{:^10}│", + tool_name, + use_count, + failure_count, + format!("{}%", (failure_rate * 100.0).round()) + )?; + } + + // Write table bottom border + writeln!( + f, + "└{}┴{}┴{}┴{}┘", + "─".repeat(tool_width), + "─".repeat(count_width), + "─".repeat(count_width), + "─".repeat(rate_width) + )?; + + Ok(()) + } +}