mod assertions;
mod example;
mod examples;
mod explorer;
mod ids;
mod instance;
mod tool_metrics;

use assertions::{AssertionsReport, display_error_row};
use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git};
pub(crate) use tool_metrics::*;

use ::fs::RealFs;
use anyhow::anyhow;
use clap::Parser;
use client::{Client, ProxySettings, UserStore};
use collections::{HashMap, HashSet};
use extension::ExtensionHostProxy;
use futures::future;
use gpui::http_client::read_proxy_from_env;
use gpui::{App, AppContext, Application, AsyncApp, Entity, SemanticVersion, UpdateGlobal};
use gpui_tokio::Tokio;
use language::LanguageRegistry;
use language_model::{ConfiguredModel, LanguageModel, LanguageModelRegistry};
use node_runtime::{NodeBinaryOptions, NodeRuntime};
use project::Project;
use project::project_settings::ProjectSettings;
use prompt_store::PromptBuilder;
use release_channel::AppVersion;
use reqwest_client::ReqwestClient;
use settings::{Settings, SettingsStore};
use std::cell::RefCell;
use std::collections::VecDeque;
use std::env;
use std::path::{Path, PathBuf};
use std::rc::Rc;
use std::sync::{Arc, LazyLock};
use util::ResultExt as _;

static CARGO_MANIFEST_DIR: LazyLock<PathBuf> =
    LazyLock::new(|| PathBuf::from(env!("CARGO_MANIFEST_DIR")));

#[derive(Parser, Debug)]
#[command(name = "eval", disable_version_flag = true)]
struct Args {
    /// Runs all examples and threads that contain these substrings. If unspecified, all examples and threads are run.
    #[arg(value_name = "EXAMPLE_SUBSTRING")]
    filter: Vec<String>,
    /// ID of model to use.
    #[arg(long, default_value = "claude-3-7-sonnet-latest")]
    model: String,
    /// Model provider to use.
    #[arg(long, default_value = "anthropic")]
    provider: String,
    /// File extensions of the languages for which a language server is available.
    #[arg(long, value_delimiter = ',', default_value = "rs,ts,py")]
    languages: Vec<String>,
    /// How many times to run each example.
    #[arg(long, default_value = "8")]
    repetitions: usize,
    /// Maximum number of examples to run concurrently.
    #[arg(long, default_value = "4")]
    concurrency: usize,
}

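// Entry point: parses CLI arguments, boots a headless GPUI application, clones the example
// repositories, runs each example against the configured model with bounded concurrency,
// judges the results, and prints a report.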
fn main() {
    dotenv::from_filename(CARGO_MANIFEST_DIR.join(".env")).ok();

    env_logger::init();

    let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
    let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
    let session_id = uuid::Uuid::new_v4().to_string();
    let run_timestamp = chrono::Local::now().format("%Y-%m-%d_%H-%M-%S");
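    // Use the GitHub Actions run ID when running in CI; otherwise derive a local run ID
    // from the timestamp.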
    let run_id = match env::var("GITHUB_RUN_ID") {
        Ok(run_id) => format!("github/{}", run_id),
        Err(_) => format!("local/{}", run_timestamp),
    };

    let root_dir = Path::new(std::env!("CARGO_MANIFEST_DIR"))
        .parent()
        .unwrap()
        .parent()
        .unwrap()
        .canonicalize()
        .unwrap();
    let eval_crate_dir = root_dir.join("crates").join("eval");
    let repos_dir = eval_crate_dir.join("repos");
    let worktrees_dir = eval_crate_dir.join("worktrees");
    let examples_dir = eval_crate_dir.join("src").join("examples");
    let run_dir = eval_crate_dir
        .join("runs")
        .join(format!("{}", run_timestamp));
    std::fs::create_dir_all(&run_dir).unwrap();
    std::fs::create_dir_all(&repos_dir).unwrap();
    std::fs::create_dir_all(&worktrees_dir).unwrap();
    std::fs::create_dir_all(&examples_dir).unwrap();
    std::fs::create_dir_all(&paths::config_dir()).unwrap();

    let zed_commit_sha = commit_sha_for_path(&root_dir);
    let zed_branch_name = git_branch_for_path(&root_dir);
    let args = Args::parse();
    let languages: HashSet<String> = args.languages.into_iter().collect();

    let http_client = Arc::new(ReqwestClient::new());
    let app = Application::headless().with_http_client(http_client.clone());
    let all_threads = examples::all(&examples_dir);

    app.run(move |cx| {
        let app_state = init(cx);

        let telemetry = app_state.client.telemetry();
        telemetry.start(system_id, installation_id, session_id, cx);

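        // Telemetry is opt-in: it requires ZED_EVAL_TELEMETRY=1 and a configured checksum seed.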
        let enable_telemetry = env::var("ZED_EVAL_TELEMETRY").map_or(false, |value| value == "1")
            && telemetry.has_checksum_seed();
        if enable_telemetry {
            println!("Telemetry enabled");
            telemetry::event!(
                "Agent Eval Started",
                zed_commit_sha = zed_commit_sha,
                zed_branch_name = zed_branch_name,
                run_id = run_id,
            );
        }

        let mut cumulative_tool_metrics = ToolMetrics::default();

        let model_registry = LanguageModelRegistry::read_global(cx);
        let model = find_model(&args.provider, &args.model, model_registry, cx).unwrap();
        let model_provider_id = model.provider_id();
        let model_provider = model_registry.provider(&model_provider_id).unwrap();

        LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
            registry.set_default_model(
                Some(ConfiguredModel {
                    provider: model_provider.clone(),
                    model: model.clone(),
                }),
                cx,
            );
        });

        let authenticate_task = model_provider.authenticate(cx);

        cx.spawn(async move |cx| {
            authenticate_task.await.unwrap();

            let mut examples = Vec::new();

            const COLORS: [&str; 12] = [
                "\x1b[31m", // Red
                "\x1b[32m", // Green
                "\x1b[33m", // Yellow
                "\x1b[34m", // Blue
                "\x1b[35m", // Magenta
                "\x1b[36m", // Cyan
                "\x1b[91m", // Bright Red
                "\x1b[92m", // Bright Green
                "\x1b[93m", // Bright Yellow
                "\x1b[94m", // Bright Blue
                "\x1b[95m", // Bright Magenta
                "\x1b[96m", // Bright Cyan
            ];

            let mut skipped = Vec::new();

            for thread in all_threads {
                let meta = thread.meta();
                if !args.filter.is_empty() && !args.filter.iter().any(|sub| meta.name.contains(sub))
                {
                    skipped.push(meta.name);
                    continue;
                }

                if let Some(language) = meta.language_server {
                    if !languages.contains(&language.file_extension) {
                        panic!(
                            "Eval for {:?} could not be run because no language server was found for extension {:?}",
                            meta.name,
                            language.file_extension
                        );
                    }
                }

                // TODO: This creates a worktree per repetition. Ideally these examples should
                // either be run sequentially on the same worktree, or reuse worktrees when there
                // are more examples to run than the concurrency limit.
                for repetition_number in 0..args.repetitions {
                    let example_instance = ExampleInstance::new(
                        thread.clone(),
                        &repos_dir,
                        &run_dir,
                        &worktrees_dir,
                        repetition_number,
                    );

                    examples.push(example_instance);
                }
            }

            if !skipped.is_empty() {
                println!("Skipped threads: {}", skipped.join(", "));
            }

            if examples.is_empty() {
                eprintln!("Filter matched no examples");
                return cx.update(|cx| cx.quit());
            }

            let mut repo_urls = HashSet::default();
            let mut clone_tasks = Vec::new();

            let max_name_width = examples
                .iter()
                .map(|e| e.worktree_name().len())
                .max()
                .unwrap_or(0);

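            // Give each example a color-coded log prefix and schedule a one-time
            // `git init` + `remote add` for every unique repository URL.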
            for (i, example_instance) in examples.iter_mut().enumerate() {
                let color = COLORS[i % COLORS.len()].to_string();
                example_instance.set_log_prefix_style(&color, max_name_width);

                println!(
                    "{}Logging to: {}",
                    example_instance.log_prefix,
                    example_instance.run_directory.display()
                );

                let repo_url = example_instance.repo_url();
                if repo_urls.insert(repo_url.clone()) {
                    let repo_path = example_instance.repo_path.clone();

                    if !repo_path.join(".git").is_dir() {
                        println!(
                            "{:<width$} < {}",
                            "↓ Cloning",
                            repo_url,
                            width = max_name_width
                        );

                        let git_task = cx.spawn(async move |_cx| {
                            std::fs::create_dir_all(&repo_path)?;
                            run_git(&repo_path, &["init"]).await?;
                            run_git(&repo_path, &["remote", "add", "origin", &repo_url]).await
                        });

                        clone_tasks.push(git_task);
                    } else {
                        println!(
                            "{:<width$} < {}",
                            "✔︎ Already cloned",
                            repo_url,
                            width = max_name_width
                        );

                        let actual_origin =
                            run_git(&repo_path, &["remote", "get-url", "origin"]).await?;
                        if actual_origin != repo_url {
                            return Err(anyhow!(
                                "remote origin {} does not match expected origin {}",
                                actual_origin,
                                repo_url,
                            ));
                        }
                    }
                }
            }

            future::join_all(clone_tasks).await;

            for example_instance in examples.iter_mut() {
                example_instance.fetch().await?;
            }

            let examples = Rc::new(RefCell::new(VecDeque::from(examples)));
            let results_by_example_name = Rc::new(RefCell::new(HashMap::default()));

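            // Spawn `concurrency` workers; each repeatedly pops an example off the shared
            // queue, runs it, judges the output, and records the result by example name.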
            future::join_all((0..args.concurrency).map(|_| {
                let app_state = app_state.clone();
                let model = model.clone();
                let zed_commit_sha = zed_commit_sha.clone();
                let zed_branch_name = zed_branch_name.clone();
                let run_id = run_id.clone();
                let examples = examples.clone();
                let results = results_by_example_name.clone();
                cx.spawn(async move |cx| {
                    loop {
                        let Some(mut example) = examples.borrow_mut().pop_front() else {
                            break;
                        };
                        let result = async {
                            example.setup().await?;
                            let run_output = cx
                                .update(|cx| example.run(model.clone(), app_state.clone(), cx))?
                                .await?;
                            let judge_output = judge_example(
                                example.clone(),
                                model.clone(),
                                &zed_commit_sha,
                                &zed_branch_name,
                                &run_id,
                                &run_output,
                                enable_telemetry,
                                cx,
                            )
                            .await;
                            anyhow::Ok((run_output, judge_output))
                        }
                        .await;
                        results
                            .borrow_mut()
                            .entry(example.name.clone())
                            .or_insert(Vec::new())
                            .push((example.clone(), result));
                    }
                })
            }))
            .await;

            print_report(
                &mut results_by_example_name.borrow_mut(),
                &mut cumulative_tool_metrics,
                &run_dir,
            )?;

            app_state.client.telemetry().flush_events().await;

            cx.update(|cx| cx.quit())
        })
        .detach_and_log_err(cx);
    });
}

/// Subset of `workspace::AppState` needed by `HeadlessAssistant`, with additional fields.
pub struct AgentAppState {
    pub languages: Arc<LanguageRegistry>,
    pub client: Arc<Client>,
    pub user_store: Entity<UserStore>,
    pub fs: Arc<dyn fs::Fs>,
    pub node_runtime: NodeRuntime,

    // Additional fields not present in `workspace::AppState`.
    pub prompt_builder: Arc<PromptBuilder>,
}

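/// Builds the headless app state needed to run evals: settings, HTTP client, Zed client,
/// file system, language registry, Node runtime, prompt builder, and the agent itself.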
pub fn init(cx: &mut App) -> Arc<AgentAppState> {
    release_channel::init(SemanticVersion::default(), cx);
    gpui_tokio::init(cx);

    let mut settings_store = SettingsStore::new(cx);
    settings_store
        .set_default_settings(settings::default_settings().as_ref(), cx)
        .unwrap();
    cx.set_global(settings_store);
    client::init_settings(cx);

    // Set User-Agent so we can download language servers from GitHub
    let user_agent = format!(
        "Zed/{} ({}; {})",
        AppVersion::global(cx),
        std::env::consts::OS,
        std::env::consts::ARCH
    );
    let proxy_str = ProxySettings::get_global(cx).proxy.to_owned();
    let proxy_url = proxy_str
        .as_ref()
        .and_then(|input| input.parse().ok())
        .or_else(read_proxy_from_env);
    let http = {
        let _guard = Tokio::handle(cx).enter();

        ReqwestClient::proxy_and_user_agent(proxy_url, &user_agent)
            .expect("could not start HTTP client")
    };
    cx.set_http_client(Arc::new(http));

    Project::init_settings(cx);

    let client = Client::production(cx);
    cx.set_http_client(client.http_client());

    let git_binary_path = None;
    let fs = Arc::new(RealFs::new(
        git_binary_path,
        cx.background_executor().clone(),
    ));

    let mut languages = LanguageRegistry::new(cx.background_executor().clone());
    languages.set_language_server_download_dir(paths::languages_dir().clone());
    let languages = Arc::new(languages);

    let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

    extension::init(cx);

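    // Recompute Node binary options whenever settings change and feed them to the Node
    // runtime through a watch channel.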
    let (tx, rx) = async_watch::channel(None);
    cx.observe_global::<SettingsStore>(move |cx| {
        let settings = &ProjectSettings::get_global(cx).node;
        let options = NodeBinaryOptions {
            allow_path_lookup: !settings.ignore_system_version,
            allow_binary_download: true,
            use_paths: settings.path.as_ref().map(|node_path| {
                let node_path = PathBuf::from(shellexpand::tilde(node_path).as_ref());
                let npm_path = settings
                    .npm_path
                    .as_ref()
                    .map(|path| PathBuf::from(shellexpand::tilde(&path).as_ref()));
                (
                    node_path.clone(),
                    npm_path.unwrap_or_else(|| {
                        let base_path = PathBuf::new();
                        node_path.parent().unwrap_or(&base_path).join("npm")
                    }),
                )
            }),
        };
        tx.send(Some(options)).log_err();
    })
    .detach();
    let node_runtime = NodeRuntime::new(client.http_client(), None, rx);

    let extension_host_proxy = ExtensionHostProxy::global(cx);

    language::init(cx);
    language_extension::init(extension_host_proxy.clone(), languages.clone());
    language_model::init(client.clone(), cx);
    language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
    languages::init(languages.clone(), node_runtime.clone(), cx);
    prompt_store::init(cx);
    let stdout_is_a_pty = false;
    let prompt_builder = PromptBuilder::load(fs.clone(), stdout_is_a_pty, cx);
    agent::init(
        fs.clone(),
        client.clone(),
        prompt_builder.clone(),
        languages.clone(),
        cx,
    );
    assistant_tools::init(client.http_client(), cx);

    SettingsStore::update_global(cx, |store, cx| {
        store.set_user_settings(include_str!("../runner_settings.json"), cx)
    })
    .unwrap();

    Arc::new(AgentAppState {
        languages,
        client,
        user_store,
        fs,
        node_runtime,
        prompt_builder,
    })
}

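/// Looks up a model by provider ID and model ID in the registry, returning an error when
/// the model is unavailable or the model ID is ambiguous across providers.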
pub fn find_model(
    provider_id: &str,
    model_id: &str,
    model_registry: &LanguageModelRegistry,
    cx: &App,
) -> anyhow::Result<Arc<dyn LanguageModel>> {
    let matching_models = model_registry
        .available_models(cx)
        .filter(|model| model.id().0 == model_id && model.provider_id().0 == provider_id)
        .collect::<Vec<_>>();

    match matching_models.as_slice() {
        [model] => Ok(model.clone()),
        [] => Err(anyhow!(
            "No language model with ID {}/{} was available. Available models: {}",
            provider_id,
            model_id,
            model_registry
                .available_models(cx)
                .map(|model| format!("{}/{}", model.provider_id().0, model.id().0))
                .collect::<Vec<_>>()
                .join(", ")
        )),
        _ => Err(anyhow!(
            "Multiple language models with ID {} available - use `--provider` to choose one of: {:?}",
            model_id,
            matching_models
                .iter()
                .map(|model| model.provider_id().0)
                .collect::<Vec<_>>()
        )),
    }
}

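/// Returns the SHA of the commit currently checked out at `repo_path`.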
pub fn commit_sha_for_path(repo_path: &Path) -> String {
    futures::executor::block_on(run_git(repo_path, &["rev-parse", "HEAD"])).unwrap()
}

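/// Returns the current branch name, preferring `GITHUB_REF_NAME` in CI and falling back
/// to `git rev-parse --abbrev-ref HEAD` (or "unknown" if that fails).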
pub fn git_branch_for_path(repo_path: &Path) -> String {
    match std::env::var("GITHUB_REF_NAME") {
        Ok(branch) => branch,
        Err(_) => {
            futures::executor::block_on(run_git(repo_path, &["rev-parse", "--abbrev-ref", "HEAD"]))
                .unwrap_or_else(|_| "unknown".to_string())
        }
    }
}

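/// Runs the judge model over an example's output and, when telemetry is enabled, emits an
/// "Agent Example Evaluated" event with the scores and metrics.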
async fn judge_example(
    example: ExampleInstance,
    model: Arc<dyn LanguageModel>,
    zed_commit_sha: &str,
    zed_branch_name: &str,
    run_id: &str,
    run_output: &RunOutput,
    enable_telemetry: bool,
    cx: &AsyncApp,
) -> JudgeOutput {
    let judge_output = example.judge(model.clone(), &run_output, cx).await;

    if enable_telemetry {
        telemetry::event!(
            "Agent Example Evaluated",
            zed_commit_sha = zed_commit_sha,
            zed_branch_name = zed_branch_name,
            run_id = run_id,
            example_name = example.name.clone(),
            example_repetition = example.repetition,
            diff_evaluation = judge_output.diff.clone(),
            thread_evaluation = judge_output.thread.clone(),
            tool_metrics = run_output.tool_metrics,
            response_count = run_output.response_count,
            token_usage = run_output.token_usage,
            model = model.telemetry_id(),
            model_provider = model.provider_id().to_string(),
            repository_url = example.repo_url(),
            repository_revision = example.revision(),
            diagnostic_summary_before = run_output.diagnostic_summary_before,
            diagnostic_summary_after = run_output.diagnostic_summary_after,
            diagnostics_before = run_output.diagnostics_before,
            diagnostics_after = run_output.diagnostics_after,
        );
    }

    judge_output
}

const HEADER_WIDTH: usize = 65;

fn print_h1(header: &str) {
    println!("\n\n{:=^HEADER_WIDTH$}", "");
    println!("{:^HEADER_WIDTH$}", header);
    println!("{:=^HEADER_WIDTH$}\n", "");
}

fn print_h2(header: &str) {
    println!("\n{:-^HEADER_WIDTH$}", "");
    println!("{:^HEADER_WIDTH$}", header);
    println!("{:-^HEADER_WIDTH$}\n", "");
}

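/// Prints per-example assertion tables and tool metrics, aggregate scores across examples,
/// and writes an `overview.html` explorer page for the run.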
fn print_report(
    results_by_example_name: &mut HashMap<
        String,
        Vec<(ExampleInstance, anyhow::Result<(RunOutput, JudgeOutput)>)>,
    >,
    cumulative_tool_metrics: &mut ToolMetrics,
    run_dir: &Path,
) -> anyhow::Result<()> {
    print_h1("EVAL RESULTS");

    let mut diff_scores = Vec::new();
    let mut thread_scores = Vec::new();
    let mut programmatic_scores = Vec::new();
    let mut error_count = 0;

    for (example_name, results) in results_by_example_name.iter_mut() {
        print_h2(example_name);

        results.sort_unstable_by_key(|(example, _)| example.repetition);
        let mut example_cumulative_tool_metrics = ToolMetrics::default();

        let mut table_rows = String::new();

        for (example, result) in results.iter() {
            match result {
                Err(err) => {
                    display_error_row(&mut table_rows, example.repetition, err.to_string())?;
                    error_count += 1;
                    programmatic_scores.push(0.0);
                    diff_scores.push(0.0);
                    thread_scores.push(0.0);
                }
                Ok((run_output, judge_output)) => {
                    cumulative_tool_metrics.merge(&run_output.tool_metrics);
                    example_cumulative_tool_metrics.merge(&run_output.tool_metrics);

                    if run_output.programmatic_assertions.total_count() > 0 {
                        for assertion in &run_output.programmatic_assertions.ran {
                            assertions::display_table_row(
                                &mut table_rows,
                                example.repetition,
                                assertion,
                            )?;
                        }

                        programmatic_scores
                            .push(run_output.programmatic_assertions.passed_percentage())
                    }

                    if !judge_output.diff.is_empty() {
                        diff_scores.push(judge_output.diff.passed_percentage());

                        for assertion in &judge_output.diff.ran {
                            assertions::display_table_row(
                                &mut table_rows,
                                example.repetition,
                                assertion,
                            )?;
                        }
                    }

                    if !judge_output.thread.is_empty() {
                        thread_scores.push(judge_output.thread.passed_percentage());

                        for assertion in &judge_output.thread.ran {
                            assertions::display_table_row(
                                &mut table_rows,
                                example.repetition,
                                assertion,
                            )?;
                        }
                    }
                }
            }
        }

        let mut all_asserts = Vec::new();

        if !table_rows.is_empty() {
            assertions::print_table_header();
            print!("{}", table_rows);

            assertions::print_table_divider();

            for (example, result) in results.iter() {
                if let Ok((run_output, judge_output)) = result {
                    let asserts = [
                        run_output.programmatic_assertions.clone(),
                        judge_output.diff.clone(),
                        judge_output.thread.clone(),
                    ];
                    all_asserts.extend_from_slice(&asserts);
                    assertions::print_table_round_summary(
                        &example.repetition.to_string(),
                        asserts.iter(),
                    )
                } else if let Err(err) = result {
                    let assert = AssertionsReport::error(err.to_string());
                    all_asserts.push(assert.clone());
                    assertions::print_table_round_summary(
                        &example.repetition.to_string(),
                        [assert].iter(),
                    )
                }
            }

            assertions::print_table_divider();

            assertions::print_table_round_summary("avg", all_asserts.iter());

            assertions::print_table_footer();
        }

        if !example_cumulative_tool_metrics.is_empty() {
            println!("{}", &example_cumulative_tool_metrics);
        }
    }

    if results_by_example_name.len() > 1 {
        print_h1("AGGREGATE");

        if error_count > 0 {
            println!("\n{error_count} examples failed to run!");
        }

        let programmatic_score_count = programmatic_scores.len();
        if programmatic_score_count > 0 {
            let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
                / (programmatic_score_count as f32))
                .floor();
            println!("Average programmatic score: {average_programmatic_score}%");
        }

        let diff_score_count = diff_scores.len();
        if diff_score_count > 0 {
            let average_diff_score =
                (diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
            println!("Average diff score: {average_diff_score}%");
        }

        let thread_score_count = thread_scores.len();

        if thread_score_count > 0 {
            let average_thread_score =
                (thread_scores.into_iter().sum::<f32>() / (thread_score_count as f32)).floor();
            println!("Average thread score: {average_thread_score}%");
        }

        println!();

        print_h2("CUMULATIVE TOOL METRICS");
        println!("{}", cumulative_tool_metrics);
    }

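    // Collect each example's message log (relative to the current directory when possible)
    // and render all of them into a single HTML explorer page.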
    let explorer_output_path = run_dir.join("overview.html");
    let mut json_paths: Vec<PathBuf> = results_by_example_name
        .values()
        .flat_map(|results| {
            results.iter().map(|(example, _)| {
                let absolute_path = run_dir.join(example.run_directory.join("last.messages.json"));
                let cwd = std::env::current_dir().expect("Can't get current dir");
                pathdiff::diff_paths(&absolute_path, cwd).unwrap_or_else(|| absolute_path.clone())
            })
        })
        .collect::<Vec<_>>();
    json_paths.sort();
    if let Err(err) = explorer::generate_explorer_html(&json_paths, &explorer_output_path) {
        eprintln!("Failed to generate explorer HTML: {}", err);
    }

    Ok(())
}