@@ -1,13 +1,15 @@
mod example;
mod ids;
+mod tool_metrics;
-use client::{Client, ProxySettings, UserStore};
pub(crate) use example::*;
-use telemetry;
+pub(crate) use tool_metrics::*;
use ::fs::RealFs;
use anyhow::{Result, anyhow};
use clap::Parser;
+use client::{Client, ProxySettings, UserStore};
+use collections::HashSet;
use extension::ExtensionHostProxy;
use futures::{StreamExt, future};
use gpui::http_client::{Uri, read_proxy_from_env};
@@ -22,7 +24,6 @@ use prompt_store::PromptBuilder;
use release_channel::AppVersion;
use reqwest_client::ReqwestClient;
use settings::{Settings, SettingsStore};
-use std::collections::HashSet;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::usize;
@@ -92,6 +93,8 @@ fn main() {
.telemetry()
.start(system_id, installation_id, session_id, cx);
+ let mut cumulative_tool_metrics = ToolMetrics::default();
+
let model_registry = LanguageModelRegistry::read_global(cx);
let model = find_model("claude-3-7-sonnet-latest", model_registry, cx).unwrap();
let model_provider_id = model.provider_id();
@@ -177,7 +180,7 @@ fn main() {
return cx.update(|cx| cx.quit());
}
- let mut repo_urls = HashSet::new();
+ let mut repo_urls = HashSet::default();
let mut clone_tasks = Vec::new();
for (i, example) in examples.iter_mut().enumerate() {
@@ -244,9 +247,24 @@ fn main() {
let model = model.clone();
let example = example.clone();
cx.spawn(async move |cx| {
- let result =
- run_example(&example, model, app_state, judge_repetitions, cx).await;
- (result, example)
+ let result = async {
+ let run_output = cx
+ .update(|cx| example.run(model.clone(), app_state.clone(), cx))?
+ .await?;
+ let judge_tasks = (0..judge_repetitions).map(|round| {
+ run_judge_repetition(
+ example.clone(),
+ model.clone(),
+ &run_output,
+ round,
+ cx,
+ )
+ });
+ let judge_outputs = future::join_all(judge_tasks).await;
+ anyhow::Ok((run_output, judge_outputs))
+ }
+ .await;
+ (example, result)
})
});
@@ -256,52 +274,58 @@ fn main() {
.await;
println!("\n\n");
- println!("========================================");
- println!(" EVAL RESULTS ");
- println!("========================================");
- println!("");
+ print_header("EVAL RESULTS");
let mut diff_scores = Vec::new();
let mut thread_scores = Vec::new();
let mut error_count = 0;
- for (result, example) in results {
+ for (example, result) in results {
+ print_header(&example.name);
+
match result {
Err(err) => {
println!("💥 {}{:?}", example.log_prefix, err);
error_count += 1;
}
- Ok(judge_results) => {
- for judge_result in judge_results {
+ Ok((run_output, judge_results)) => {
+ cumulative_tool_metrics.merge(&run_output.tool_metrics);
+
+ println!("┌───────┬──────┬────────┐");
+ println!("│ Judge │ Diff │ Thread │");
+ println!("├───────┼──────┼────────┤");
+
+ for (i, judge_result) in judge_results.iter().enumerate() {
match judge_result {
Ok(judge_output) => {
- const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
- let diff_score: u32 = judge_output.diff.score;
- let score_index = (diff_score.min(5)) as usize;
+ let diff_score = judge_output.diff.score;
+ diff_scores.push(diff_score);
+
+ let thread_display = if let Some(thread) = &judge_output.thread
+ {
+ let thread_score = thread.score;
+ thread_scores.push(thread_score);
+ format!("{}", thread_score)
+ } else {
+ "N/A".to_string()
+ };
println!(
- "{} {}{} (Diff)",
- SCORES[score_index],
- example.log_prefix,
- judge_output.diff.score,
+ "|{:^7}│{:^6}│{:^8}│",
+ i + 1,
+ diff_score,
+ thread_display
);
- diff_scores.push(judge_output.diff.score);
-
- if let Some(thread) = judge_output.thread {
- let process_score: u32 = thread.score;
- let score_index = (process_score.min(5)) as usize;
- println!(
- "{} {}{} (Thread)",
- SCORES[score_index], example.log_prefix, thread.score,
- );
- thread_scores.push(thread.score);
- }
}
Err(err) => {
- println!("💥 {}{:?}", example.log_prefix, err);
+ println!("|{:^7}│{:^6}│{:^8}│{:?}", i + 1, "N/A", "N/A", err);
}
}
}
+
+ println!("└───────┴──────┴────────┘");
+
+ println!("{}", run_output.tool_metrics);
}
}
println!(
@@ -341,6 +365,9 @@ fn main() {
}
}
+ print_header("CUMULATIVE TOOL METRICS");
+ println!("{}", cumulative_tool_metrics);
+
std::thread::sleep(std::time::Duration::from_secs(2));
app_state.client.telemetry().flush_events();
@@ -351,27 +378,6 @@ fn main() {
});
}
-async fn run_example(
- example: &Example,
- model: Arc<dyn LanguageModel>,
- app_state: Arc<AgentAppState>,
- judge_repetitions: u32,
- cx: &mut AsyncApp,
-) -> Result<Vec<Result<JudgeOutput>>> {
- let run_output = cx
- .update(|cx| example.run(model.clone(), app_state.clone(), cx))?
- .await?;
-
- let judge_tasks = (0..judge_repetitions)
- .map(|round| run_judge_repetition(example.clone(), model.clone(), &run_output, round, cx));
-
- let results = future::join_all(judge_tasks).await;
-
- app_state.client.telemetry().flush_events();
-
- Ok(results)
-}
-
fn list_all_examples() -> Result<Vec<PathBuf>> {
let path = std::fs::canonicalize(EXAMPLES_DIR).unwrap();
let entries = std::fs::read_dir(path).unwrap();
@@ -566,7 +572,7 @@ async fn run_judge_repetition(
diff_analysis = judge_output.diff.analysis,
thread_score = thread.score,
thread_analysis = thread.analysis,
- tool_use_counts = run_output.tool_use_counts,
+ tool_metrics = run_output.tool_metrics,
response_count = run_output.response_count,
token_usage = run_output.token_usage,
model = model.telemetry_id(),
@@ -585,7 +591,7 @@ async fn run_judge_repetition(
round = round,
diff_score = judge_output.diff.score,
diff_analysis = judge_output.diff.analysis,
- tool_use_counts = run_output.tool_use_counts,
+ tool_metrics = run_output.tool_metrics,
response_count = run_output.response_count,
token_usage = run_output.token_usage,
model = model.telemetry_id(),
@@ -601,3 +607,9 @@ async fn run_judge_repetition(
judge_result
}
+
+/// Prints `header` centered in a 40-column field, framed above and below by a
+/// row of `=` characters, with one blank line before and after the banner.
+fn print_header(header: &str) {
+    let rule = "========================================";
+    println!("\n{rule}\n{header:^40}\n{rule}\n");
+}
@@ -1,8 +1,8 @@
+use crate::{AgentAppState, ToolMetrics};
use agent::{ThreadEvent, ThreadStore};
use anyhow::{Context as _, Result, anyhow};
use assistant_tool::ToolWorkingSet;
use client::proto::LspWorkProgress;
-use collections::HashMap;
use dap::DapRegistry;
use futures::channel::mpsc;
use futures::{FutureExt, StreamExt as _, select_biased};
@@ -32,8 +32,6 @@ use util::command::new_smol_command;
use util::markdown::MarkdownString;
use util::serde::default_true;
-use crate::AgentAppState;
-
pub const EXAMPLES_DIR: &str = "./crates/eval/examples";
pub const REPOS_DIR: &str = "./crates/eval/repos";
pub const WORKTREES_DIR: &str = "./crates/eval/worktrees";
@@ -88,7 +86,7 @@ pub struct RunOutput {
pub diagnostics_after: Option<String>,
pub response_count: usize,
pub token_usage: TokenUsage,
- pub tool_use_counts: HashMap<Arc<str>, u32>,
+ pub tool_metrics: ToolMetrics,
pub last_request: LanguageModelRequest,
}
@@ -351,8 +349,7 @@ impl Example {
});
})?;
- let tool_use_counts: Arc<Mutex<HashMap<Arc<str>, u32>>> =
- Mutex::new(HashMap::default()).into();
+ let tool_metrics = Arc::new(Mutex::new(ToolMetrics::default()));
let (thread_event_tx, mut thread_event_rx) = mpsc::unbounded();
@@ -362,7 +359,7 @@ impl Example {
let event_handler_task = cx.spawn({
let log_prefix = this.log_prefix.clone();
- let tool_use_counts = tool_use_counts.clone();
+ let tool_metrics = tool_metrics.clone();
let thread = thread.downgrade();
async move |cx| {
loop {
@@ -405,6 +402,7 @@ impl Example {
} => {
thread.update(cx, |thread, _cx| {
if let Some(tool_use) = pending_tool_use {
+ let mut tool_metrics = tool_metrics.lock().unwrap();
if let Some(tool_result) = thread.tool_result(&tool_use_id) {
let message = if tool_result.is_error {
format!("TOOL FAILED: {}", tool_use.name)
@@ -412,13 +410,11 @@ impl Example {
format!("TOOL FINISHED: {}", tool_use.name)
};
println!("{log_prefix}{message}");
- let mut tool_use_counts = tool_use_counts.lock().unwrap();
- *tool_use_counts
- .entry(tool_result.tool_name.clone())
- .or_insert(0) += 1;
+ tool_metrics.insert(tool_result.tool_name.clone(), !tool_result.is_error);
} else {
let message = format!("TOOL FINISHED WITHOUT RESULT: {}", tool_use.name);
println!("{log_prefix}{message}");
+ tool_metrics.insert(tool_use.name.clone(), true);
}
}
})?;
@@ -501,7 +497,7 @@ impl Example {
diagnostics_after,
response_count,
token_usage: thread.cumulative_token_usage(),
- tool_use_counts: tool_use_counts.lock().unwrap().clone(),
+ tool_metrics: tool_metrics.lock().unwrap().clone(),
last_request,
}
})
@@ -0,0 +1,102 @@
+use collections::HashMap;
+use serde::{Deserialize, Serialize};
+use std::{fmt::Display, sync::Arc};
+
+/// Per-tool invocation statistics, keyed by tool name.
+///
+/// Populated one call at a time through `insert` and combined across runs
+/// through `merge`; the `Display` impl renders the counts as a table.
+#[derive(Debug, Default, Clone, Serialize, Deserialize)]
+pub struct ToolMetrics {
+    /// Number of recorded invocations of each tool.
+    pub use_counts: HashMap<Arc<str>, u32>,
+    /// Number of recorded invocations of each tool that did not succeed.
+    /// `insert` only creates an entry here on a failure, so a tool may be
+    /// present in `use_counts` and absent from this map.
+    pub failure_counts: HashMap<Arc<str>, u32>,
+}
+
+impl ToolMetrics {
+    /// Records a single invocation of `tool_name`.
+    ///
+    /// The tool's use count always grows by one; its failure count grows by
+    /// one only when `succeeded` is `false`.
+    pub fn insert(&mut self, tool_name: Arc<str>, succeeded: bool) {
+        *self.use_counts.entry(Arc::clone(&tool_name)).or_default() += 1;
+        if succeeded {
+            return;
+        }
+        *self.failure_counts.entry(tool_name).or_default() += 1;
+    }
+
+    /// Adds every use count and failure count found in `other` to the entry
+    /// of the same name in `self`, creating entries for tools that `self`
+    /// has not recorded yet. `other` is left untouched.
+    pub fn merge(&mut self, other: &ToolMetrics) {
+        // Same accumulation for both maps: use counts first, then failures.
+        let add_all = |into: &mut HashMap<Arc<str>, u32>, from: &HashMap<Arc<str>, u32>| {
+            for (name, count) in from {
+                *into.entry(Arc::clone(name)).or_default() += *count;
+            }
+        };
+        add_all(&mut self.use_counts, &other.use_counts);
+        add_all(&mut self.failure_counts, &other.failure_counts);
+    }
+}
+
+/// Formats the metrics as a box-drawn table with one row per tool that was
+/// used at least once, showing its use count, failure count and failure
+/// percentage (rounded to a whole number).
+///
+/// Rows are ordered by failure rate, highest first; tools with the same rate
+/// are ordered by name so the output is reproducible.
+impl Display for ToolMetrics {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Column widths are defined once; borders, header and data rows all
+        // derive from them so the table cannot drift out of alignment.
+        let tool_w: usize = 30;
+        let count_w: usize = 10;
+        let rate_w: usize = 10;
+
+        // (name, uses, failures, failure rate) for every tool with at least
+        // one use. A tool missing from `failure_counts` has zero failures.
+        let mut rows: Vec<(&Arc<str>, u32, u32, f64)> = self
+            .use_counts
+            .iter()
+            .filter(|&(_, &uses)| uses > 0)
+            .map(|(name, &uses)| {
+                let failures = self.failure_counts.get(name).copied().unwrap_or(0);
+                (name, uses, failures, f64::from(failures) / f64::from(uses))
+            })
+            .collect();
+
+        // Highest failure rate first. The name tie-break keeps the order
+        // independent of hash-map iteration order.
+        rows.sort_by(|a, b| b.3.total_cmp(&a.3).then_with(|| a.0.cmp(b.0)));
+
+        let tool_rule = "─".repeat(tool_w);
+        let count_rule = "─".repeat(count_w);
+        let rate_rule = "─".repeat(rate_w);
+
+        // Top border
+        writeln!(f, "┌{tool_rule}┬{count_rule}┬{count_rule}┬{rate_rule}┐")?;
+
+        // Header row
+        writeln!(
+            f,
+            "│{:^tool_w$}│{:^count_w$}│{:^count_w$}│{:^rate_w$}│",
+            "Tool", "Uses", "Failures", "Rate"
+        )?;
+
+        // Header/data separator
+        writeln!(f, "├{tool_rule}┼{count_rule}┼{count_rule}┼{rate_rule}┤")?;
+
+        // Data rows
+        for (name, uses, failures, rate) in rows {
+            let rate = format!("{}%", (rate * 100.0).round());
+            writeln!(
+                f,
+                "│{name:^tool_w$}│{uses:^count_w$}│{failures:^count_w$}│{rate:^rate_w$}│"
+            )?;
+        }
+
+        // Bottom border
+        writeln!(f, "└{tool_rule}┴{count_rule}┴{count_rule}┴{rate_rule}┘")?;
+
+        Ok(())
+    }
+}