1mod assertions;
2mod example;
3mod examples;
4mod explorer;
5mod ids;
6mod instance;
7mod tool_metrics;
8
9use assertions::{AssertionsReport, display_error_row};
10use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git};
11pub(crate) use tool_metrics::*;
12
13use ::fs::RealFs;
14use clap::Parser;
15use client::{Client, ProxySettings, UserStore};
16use collections::{HashMap, HashSet};
17use extension::ExtensionHostProxy;
18use futures::future;
19use gpui::http_client::read_proxy_from_env;
20use gpui::{App, AppContext, Application, AsyncApp, Entity, SemanticVersion, UpdateGlobal};
21use gpui_tokio::Tokio;
22use language::LanguageRegistry;
23use language_model::{ConfiguredModel, LanguageModel, LanguageModelRegistry};
24use node_runtime::{NodeBinaryOptions, NodeRuntime};
25use project::Project;
26use project::project_settings::ProjectSettings;
27use prompt_store::PromptBuilder;
28use release_channel::AppVersion;
29use reqwest_client::ReqwestClient;
30use settings::{Settings, SettingsStore};
31use std::cell::RefCell;
32use std::collections::VecDeque;
33use std::env;
34use std::path::{Path, PathBuf};
35use std::rc::Rc;
36use std::sync::{Arc, LazyLock};
37use util::ResultExt as _;
38
39static CARGO_MANIFEST_DIR: LazyLock<PathBuf> =
40 LazyLock::new(|| PathBuf::from(env!("CARGO_MANIFEST_DIR")));
41
42#[derive(Parser, Debug)]
43#[command(name = "eval", disable_version_flag = true)]
44struct Args {
45 /// Runs all examples and threads that contain these substrings. If unspecified, all examples and threads are run.
46 #[arg(value_name = "EXAMPLE_SUBSTRING")]
47 filter: Vec<String>,
48 /// ID of model to use.
49 #[arg(long, default_value = "claude-3-7-sonnet-latest")]
50 model: String,
51 /// Model provider to use.
52 #[arg(long, default_value = "anthropic")]
53 provider: String,
54 #[arg(long, value_delimiter = ',', default_value = "rs,ts,py")]
55 languages: Vec<String>,
56 /// How many times to run each example.
57 #[arg(long, default_value = "8")]
58 repetitions: usize,
59 /// Maximum number of examples to run concurrently.
60 #[arg(long, default_value = "4")]
61 concurrency: usize,
62}
63
64fn main() {
65 dotenv::from_filename(CARGO_MANIFEST_DIR.join(".env")).ok();
66
67 env_logger::init();
68
69 let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
70 let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
71 let session_id = uuid::Uuid::new_v4().to_string();
72 let run_timestamp = chrono::Local::now().format("%Y-%m-%d_%H-%M-%S");
73 let run_id = match env::var("GITHUB_RUN_ID") {
74 Ok(run_id) => format!("github/{}", run_id),
75 Err(_) => format!("local/{}", run_timestamp),
76 };
77
78 let root_dir = Path::new(std::env!("CARGO_MANIFEST_DIR"))
79 .parent()
80 .unwrap()
81 .parent()
82 .unwrap()
83 .canonicalize()
84 .unwrap();
85 let eval_crate_dir = root_dir.join("crates").join("eval");
86 let repos_dir = eval_crate_dir.join("repos");
87 let worktrees_dir = eval_crate_dir.join("worktrees");
88 let examples_dir = eval_crate_dir.join("src").join("examples");
89 let run_dir = eval_crate_dir
90 .join("runs")
91 .join(format!("{}", run_timestamp));
92 std::fs::create_dir_all(&run_dir).unwrap();
93 std::fs::create_dir_all(&repos_dir).unwrap();
94 std::fs::create_dir_all(&worktrees_dir).unwrap();
95 std::fs::create_dir_all(&examples_dir).unwrap();
96 std::fs::create_dir_all(&paths::config_dir()).unwrap();
97
98 let zed_commit_sha = commit_sha_for_path(&root_dir);
99 let zed_branch_name = git_branch_for_path(&root_dir);
100 let args = Args::parse();
101 let languages: HashSet<String> = args.languages.into_iter().collect();
102
103 let http_client = Arc::new(ReqwestClient::new());
104 let app = Application::headless().with_http_client(http_client.clone());
105 let all_threads = examples::all(&examples_dir);
106
107 app.run(move |cx| {
108 let app_state = init(cx);
109
110 let telemetry = app_state.client.telemetry();
111 telemetry.start(system_id, installation_id, session_id, cx);
112
113 let enable_telemetry = env::var("ZED_EVAL_TELEMETRY").map_or(false, |value| value == "1")
114 && telemetry.has_checksum_seed();
115 if enable_telemetry {
116 println!("Telemetry enabled");
117 telemetry::event!(
118 "Agent Eval Started",
119 zed_commit_sha = zed_commit_sha,
120 zed_branch_name = zed_branch_name,
121 run_id = run_id,
122 );
123 }
124
125 let mut cumulative_tool_metrics = ToolMetrics::default();
126
127 let model_registry = LanguageModelRegistry::read_global(cx);
128 let model = find_model(&args.provider, &args.model, model_registry, cx).unwrap();
129 let model_provider_id = model.provider_id();
130 let model_provider = model_registry.provider(&model_provider_id).unwrap();
131
132 LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
133 registry.set_default_model(
134 Some(ConfiguredModel {
135 provider: model_provider.clone(),
136 model: model.clone(),
137 }),
138 cx,
139 );
140 });
141
142 let authenticate_task = model_provider.authenticate(cx);
143
144 cx.spawn(async move |cx| {
145 authenticate_task.await.unwrap();
146
147 let mut examples = Vec::new();
148
149 const COLORS: [&str; 12] = [
150 "\x1b[31m", // Red
151 "\x1b[32m", // Green
152 "\x1b[33m", // Yellow
153 "\x1b[34m", // Blue
154 "\x1b[35m", // Magenta
155 "\x1b[36m", // Cyan
156 "\x1b[91m", // Bright Red
157 "\x1b[92m", // Bright Green
158 "\x1b[93m", // Bright Yellow
159 "\x1b[94m", // Bright Blue
160 "\x1b[95m", // Bright Magenta
161 "\x1b[96m", // Bright Cyan
162 ];
163
164 let mut skipped = Vec::new();
165
166 for thread in all_threads {
167 let meta = thread.meta();
168 if !args.filter.is_empty() && !args.filter.iter().any(|sub| meta.name.contains(sub))
169 {
170 skipped.push(meta.name);
171 continue;
172 }
173
174 if let Some(language) = meta.language_server {
175 if !languages.contains(&language.file_extension) {
176 panic!(
177 "Eval for {:?} could not be run because no language server was found for extension {:?}",
178 meta.name,
179 language.file_extension
180 );
181 }
182 }
183
184 // TODO: This creates a worktree per repetition. Ideally these examples should
185 // either be run sequentially on the same worktree, or reuse worktrees when there
186 // are more examples to run than the concurrency limit.
187 for repetition_number in 0..args.repetitions {
188 let example_instance = ExampleInstance::new(
189 thread.clone(),
190 &repos_dir,
191 &run_dir,
192 &worktrees_dir,
193 repetition_number,
194 );
195
196 examples.push(example_instance);
197 }
198 }
199
200 if !skipped.is_empty() {
201 println!("Skipped threads: {}", skipped.join(", "));
202 }
203
204 if examples.is_empty() {
205 eprintln!("Filter matched no examples");
206 return cx.update(|cx| cx.quit());
207 }
208
209 let mut repo_urls = HashSet::default();
210 let mut clone_tasks = Vec::new();
211
212 let max_name_width = examples
213 .iter()
214 .map(|e| e.worktree_name().len())
215 .max()
216 .unwrap_or(0);
217
218 for (i, example_instance) in examples.iter_mut().enumerate() {
219 let color = COLORS[i % COLORS.len()].to_string();
220 example_instance.set_log_prefix_style(&color, max_name_width);
221
222 println!(
223 "{}Logging to: {}",
224 example_instance.log_prefix,
225 example_instance.run_directory.display()
226 );
227
228 let repo_url = example_instance.repo_url();
229 if repo_urls.insert(repo_url.clone()) {
230 let repo_path = example_instance.repo_path.clone();
231
232 if !repo_path.join(".git").is_dir() {
233 println!(
234 "{:<width$} < {}",
235 "↓ Cloning",
236 repo_url,
237 width = max_name_width
238 );
239
240 let git_task = cx.spawn(async move |_cx| {
241 std::fs::create_dir_all(&repo_path)?;
242 run_git(&repo_path, &["init"]).await?;
243 run_git(&repo_path, &["remote", "add", "origin", &repo_url]).await
244 });
245
246 clone_tasks.push(git_task);
247 } else {
248 println!(
249 "{:<width$} < {}",
250 "✔︎ Already cloned",
251 repo_url,
252 width = max_name_width
253 );
254
255 let actual_origin =
256 run_git(&repo_path, &["remote", "get-url", "origin"]).await?;
257 anyhow::ensure!(
258 actual_origin == repo_url,
259 "remote origin {actual_origin} does not match expected origin {repo_url}"
260 );
261 }
262 }
263 }
264
265 future::join_all(clone_tasks).await;
266
267 for example_instance in examples.iter_mut() {
268 example_instance.fetch().await?;
269 }
270
271 let examples = Rc::new(RefCell::new(VecDeque::from(examples)));
272 let results_by_example_name = Rc::new(RefCell::new(HashMap::default()));
273
274 future::join_all((0..args.concurrency).map(|_| {
275 let app_state = app_state.clone();
276 let model = model.clone();
277 let zed_commit_sha = zed_commit_sha.clone();
278 let zed_branch_name = zed_branch_name.clone();
279 let run_id = run_id.clone();
280 let examples = examples.clone();
281 let results = results_by_example_name.clone();
282 cx.spawn(async move |cx| {
283 loop {
284 let Some(mut example) = examples.borrow_mut().pop_front() else {
285 break;
286 };
287 let result = async {
288 example.setup().await?;
289 let run_output = cx
290 .update(|cx| example.run(model.clone(), app_state.clone(), cx))?
291 .await?;
292 let judge_output = judge_example(
293 example.clone(),
294 model.clone(),
295 &zed_commit_sha,
296 &zed_branch_name,
297 &run_id,
298 &run_output,
299 enable_telemetry,
300 cx,
301 )
302 .await;
303 anyhow::Ok((run_output, judge_output))
304 }
305 .await;
306 results
307 .borrow_mut()
308 .entry(example.name.clone())
309 .or_insert(Vec::new())
310 .push((example.clone(), result));
311 }
312 })
313 }))
314 .await;
315
316 print_report(
317 &mut results_by_example_name.borrow_mut(),
318 &mut cumulative_tool_metrics,
319 &run_dir,
320 )?;
321
322 app_state.client.telemetry().flush_events().await;
323
324 cx.update(|cx| cx.quit())
325 })
326 .detach_and_log_err(cx);
327 });
328}
329
/// Subset of `workspace::AppState` needed by `HeadlessAssistant`, with additional fields.
///
/// Built once by `init` and shared behind an `Arc`.
pub struct AgentAppState {
    /// Language registry; `init` points its language-server download directory at
    /// `paths::languages_dir()`.
    pub languages: Arc<LanguageRegistry>,
    /// Client created by `init`; also the source of the telemetry handle and the HTTP
    /// client used elsewhere in this file.
    pub client: Arc<Client>,
    /// User store entity created from `client`.
    pub user_store: Entity<UserStore>,
    /// Filesystem handle; `init` supplies a `RealFs`.
    pub fs: Arc<dyn fs::Fs>,
    /// Node runtime whose options are derived from the `node` project settings.
    pub node_runtime: NodeRuntime,

    // Additional fields not present in `workspace::AppState`.
    /// Prompt builder loaded by `init`, which also hands it to `agent::init`.
    pub prompt_builder: Arc<PromptBuilder>,
}
341
/// Initializes the global state the eval runner needs and returns the shared app state.
///
/// Installs the settings store with the default settings, configures the HTTP client
/// (honoring the proxy setting or, failing that, the proxy from the environment), creates
/// the client, filesystem, language registry, user store, and node runtime, initializes
/// the language, language-model, prompt, terminal, agent, and assistant-tool subsystems,
/// and finally applies `runner_settings.json` as the user settings.
///
/// The initializers below are order-dependent; keep the sequence intact when editing.
///
/// # Panics
///
/// Panics if the default settings or the bundled runner settings are rejected by the
/// settings store, or if the HTTP client cannot be constructed.
pub fn init(cx: &mut App) -> Arc<AgentAppState> {
    release_channel::init(SemanticVersion::default(), cx);
    gpui_tokio::init(cx);

    let mut settings_store = SettingsStore::new(cx);
    settings_store
        .set_default_settings(settings::default_settings().as_ref(), cx)
        .unwrap();
    cx.set_global(settings_store);
    client::init_settings(cx);

    // Set User-Agent so we can download language servers from GitHub
    let user_agent = format!(
        "Zed/{} ({}; {})",
        AppVersion::global(cx),
        std::env::consts::OS,
        std::env::consts::ARCH
    );
    // Prefer the proxy from settings; if it is unset or does not parse, fall back to the
    // proxy read from the environment.
    let proxy_str = ProxySettings::get_global(cx).proxy.to_owned();
    let proxy_url = proxy_str
        .as_ref()
        .and_then(|input| input.parse().ok())
        .or_else(read_proxy_from_env);
    let http = {
        // NOTE(review): the Tokio handle is entered while the client is built, presumably
        // because the reqwest client needs a runtime context at construction; confirm.
        let _guard = Tokio::handle(cx).enter();

        ReqwestClient::proxy_and_user_agent(proxy_url, &user_agent)
            .expect("could not start HTTP client")
    };
    cx.set_http_client(Arc::new(http));

    Project::init_settings(cx);

    let client = Client::production(cx);
    // Replace the HTTP client installed above with the one exposed by `client`.
    cx.set_http_client(client.http_client());

    let git_binary_path = None;
    let fs = Arc::new(RealFs::new(
        git_binary_path,
        cx.background_executor().clone(),
    ));

    let mut languages = LanguageRegistry::new(cx.background_executor().clone());
    languages.set_language_server_download_dir(paths::languages_dir().clone());
    let languages = Arc::new(languages);

    let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

    extension::init(cx);

    // Node binary options are sent over a watch channel from a `SettingsStore` observer,
    // so the node runtime receives options derived from the current `node` settings.
    let (tx, rx) = async_watch::channel(None);
    cx.observe_global::<SettingsStore>(move |cx| {
        let settings = &ProjectSettings::get_global(cx).node;
        let options = NodeBinaryOptions {
            allow_path_lookup: !settings.ignore_system_version,
            allow_binary_download: true,
            use_paths: settings.path.as_ref().map(|node_path| {
                let node_path = PathBuf::from(shellexpand::tilde(node_path).as_ref());
                let npm_path = settings
                    .npm_path
                    .as_ref()
                    .map(|path| PathBuf::from(shellexpand::tilde(&path).as_ref()));
                (
                    node_path.clone(),
                    // Without an explicit npm path, use `npm` next to the node binary, or
                    // a bare relative `npm` when the node path has no parent.
                    npm_path.unwrap_or_else(|| {
                        let base_path = PathBuf::new();
                        node_path.parent().unwrap_or(&base_path).join("npm")
                    }),
                )
            }),
        };
        tx.send(Some(options)).log_err();
    })
    .detach();
    let node_runtime = NodeRuntime::new(client.http_client(), None, rx);

    let extension_host_proxy = ExtensionHostProxy::global(cx);

    language::init(cx);
    debug_adapter_extension::init(extension_host_proxy.clone(), cx);
    language_extension::init(extension_host_proxy.clone(), languages.clone());
    language_model::init(client.clone(), cx);
    language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
    languages::init(languages.clone(), node_runtime.clone(), cx);
    prompt_store::init(cx);
    terminal_view::init(cx);
    let stdout_is_a_pty = false;
    let prompt_builder = PromptBuilder::load(fs.clone(), stdout_is_a_pty, cx);
    agent::init(
        fs.clone(),
        client.clone(),
        prompt_builder.clone(),
        languages.clone(),
        cx,
    );
    assistant_tools::init(client.http_client(), cx);

    // Apply the runner settings bundled with the crate as the user settings.
    SettingsStore::update_global(cx, |store, cx| {
        store.set_user_settings(include_str!("../runner_settings.json"), cx)
    })
    .unwrap();

    Arc::new(AgentAppState {
        languages,
        client,
        user_store,
        fs,
        node_runtime,
        prompt_builder,
    })
}
453
454pub fn find_model(
455 provider_id: &str,
456 model_id: &str,
457 model_registry: &LanguageModelRegistry,
458 cx: &App,
459) -> anyhow::Result<Arc<dyn LanguageModel>> {
460 let matching_models = model_registry
461 .available_models(cx)
462 .filter(|model| model.id().0 == model_id && model.provider_id().0 == provider_id)
463 .collect::<Vec<_>>();
464
465 match matching_models.as_slice() {
466 [model] => Ok(model.clone()),
467 [] => anyhow::bail!(
468 "No language model with ID {}/{} was available. Available models: {}",
469 provider_id,
470 model_id,
471 model_registry
472 .available_models(cx)
473 .map(|model| format!("{}/{}", model.provider_id().0, model.id().0))
474 .collect::<Vec<_>>()
475 .join(", ")
476 ),
477 _ => anyhow::bail!(
478 "Multiple language models with ID {} available - use `--provider` to choose one of: {:?}",
479 model_id,
480 matching_models
481 .iter()
482 .map(|model| model.provider_id().0)
483 .collect::<Vec<_>>()
484 ),
485 }
486}
487
488pub fn commit_sha_for_path(repo_path: &Path) -> String {
489 futures::executor::block_on(run_git(repo_path, &["rev-parse", "HEAD"])).unwrap()
490}
491
492pub fn git_branch_for_path(repo_path: &Path) -> String {
493 match std::env::var("GITHUB_REF_NAME") {
494 Ok(branch) => branch,
495 Err(_) => {
496 futures::executor::block_on(run_git(repo_path, &["rev-parse", "--abbrev-ref", "HEAD"]))
497 .unwrap_or_else(|_| "unknown".to_string())
498 }
499 }
500}
501
502async fn judge_example(
503 example: ExampleInstance,
504 model: Arc<dyn LanguageModel>,
505 zed_commit_sha: &str,
506 zed_branch_name: &str,
507 run_id: &str,
508 run_output: &RunOutput,
509 enable_telemetry: bool,
510 cx: &AsyncApp,
511) -> JudgeOutput {
512 let judge_output = example.judge(model.clone(), &run_output, cx).await;
513
514 if enable_telemetry {
515 telemetry::event!(
516 "Agent Example Evaluated",
517 zed_commit_sha = zed_commit_sha,
518 zed_branch_name = zed_branch_name,
519 run_id = run_id,
520 example_name = example.name.clone(),
521 example_repetition = example.repetition,
522 diff_evaluation = judge_output.diff.clone(),
523 thread_evaluation = judge_output.thread.clone(),
524 tool_metrics = run_output.tool_metrics,
525 response_count = run_output.response_count,
526 token_usage = run_output.token_usage,
527 model = model.telemetry_id(),
528 model_provider = model.provider_id().to_string(),
529 repository_url = example.repo_url(),
530 repository_revision = example.revision(),
531 diagnostic_summary_before = run_output.diagnostic_summary_before,
532 diagnostic_summary_after = run_output.diagnostic_summary_after,
533 diagnostics_before = run_output.diagnostics_before,
534 diagnostics_after = run_output.diagnostics_after,
535 );
536 }
537
538 judge_output
539}
540
/// Width, in characters, of the rules and centered titles printed by `print_h1` and
/// `print_h2`.
const HEADER_WIDTH: usize = 65;
542
543fn print_h1(header: &str) {
544 println!("\n\n{:=^HEADER_WIDTH$}", "");
545 println!("{:^HEADER_WIDTH$}", header);
546 println!("{:=^HEADER_WIDTH$}\n", "");
547}
548
549fn print_h2(header: &str) {
550 println!("\n{:-^HEADER_WIDTH$}", "");
551 println!("{:^HEADER_WIDTH$}", header);
552 println!("{:-^HEADER_WIDTH$}\n", "");
553}
554
555fn print_report(
556 results_by_example_name: &mut HashMap<
557 String,
558 Vec<(ExampleInstance, anyhow::Result<(RunOutput, JudgeOutput)>)>,
559 >,
560 cumulative_tool_metrics: &mut ToolMetrics,
561 run_dir: &Path,
562) -> anyhow::Result<()> {
563 print_h1("EVAL RESULTS");
564
565 let mut diff_scores = Vec::new();
566 let mut thread_scores = Vec::new();
567 let mut programmatic_scores = Vec::new();
568 let mut error_count = 0;
569
570 for (example_name, results) in results_by_example_name.iter_mut() {
571 print_h2(example_name);
572
573 results.sort_unstable_by_key(|(example, _)| example.repetition);
574 let mut example_cumulative_tool_metrics = ToolMetrics::default();
575
576 let mut table_rows = String::new();
577
578 for (example, result) in results.iter() {
579 match result {
580 Err(err) => {
581 display_error_row(&mut table_rows, example.repetition, err.to_string())?;
582 error_count += 1;
583 programmatic_scores.push(0.0);
584 diff_scores.push(0.0);
585 thread_scores.push(0.0);
586 }
587 Ok((run_output, judge_output)) => {
588 cumulative_tool_metrics.merge(&run_output.tool_metrics);
589 example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
590
591 if run_output.programmatic_assertions.total_count() > 0 {
592 for assertion in &run_output.programmatic_assertions.ran {
593 assertions::display_table_row(
594 &mut table_rows,
595 example.repetition,
596 assertion,
597 )?;
598 }
599
600 programmatic_scores
601 .push(run_output.programmatic_assertions.passed_percentage())
602 }
603
604 if !judge_output.diff.is_empty() {
605 diff_scores.push(judge_output.diff.passed_percentage());
606
607 for assertion in &judge_output.diff.ran {
608 assertions::display_table_row(
609 &mut table_rows,
610 example.repetition,
611 assertion,
612 )?;
613 }
614 }
615
616 if !judge_output.thread.is_empty() {
617 thread_scores.push(judge_output.thread.passed_percentage());
618
619 for assertion in &judge_output.thread.ran {
620 assertions::display_table_row(
621 &mut table_rows,
622 example.repetition,
623 assertion,
624 )?;
625 }
626 }
627 }
628 }
629 }
630
631 let mut all_asserts = Vec::new();
632
633 if !table_rows.is_empty() {
634 assertions::print_table_header();
635 print!("{}", table_rows);
636
637 assertions::print_table_divider();
638
639 for (example, result) in results.iter() {
640 if let Ok((run_output, judge_output)) = result {
641 let asserts = [
642 run_output.programmatic_assertions.clone(),
643 judge_output.diff.clone(),
644 judge_output.thread.clone(),
645 ];
646 all_asserts.extend_from_slice(&asserts);
647 assertions::print_table_round_summary(
648 &example.repetition.to_string(),
649 asserts.iter(),
650 )
651 } else if let Err(err) = result {
652 let assert = AssertionsReport::error(err.to_string());
653 all_asserts.push(assert.clone());
654 assertions::print_table_round_summary(
655 &example.repetition.to_string(),
656 [assert].iter(),
657 )
658 }
659 }
660
661 assertions::print_table_divider();
662
663 assertions::print_table_round_summary("avg", all_asserts.iter());
664
665 assertions::print_table_footer();
666 }
667
668 if !example_cumulative_tool_metrics.is_empty() {
669 println!("{}", &example_cumulative_tool_metrics);
670 }
671 }
672
673 if results_by_example_name.len() > 1 {
674 print_h1("AGGREGATE");
675
676 if error_count > 0 {
677 println!("\n{error_count} examples failed to run!");
678 }
679
680 let programmatic_score_count = programmatic_scores.len();
681 if programmatic_score_count > 0 {
682 let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
683 / (programmatic_score_count as f32))
684 .floor();
685 println!("Average programmatic score: {average_programmatic_score}%");
686 }
687
688 let diff_score_count = diff_scores.len();
689 if diff_score_count > 0 {
690 let average_diff_score =
691 (diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
692 println!("Average diff score: {average_diff_score}%");
693 }
694
695 let thread_score_count = thread_scores.len();
696
697 if thread_score_count > 0 {
698 let average_thread_score =
699 (thread_scores.into_iter().sum::<f32>() / (thread_score_count as f32)).floor();
700 println!("Average thread score: {average_thread_score}%");
701 }
702
703 println!("");
704
705 print_h2("CUMULATIVE TOOL METRICS");
706 println!("{}", cumulative_tool_metrics);
707 }
708
709 let explorer_output_path = run_dir.join("overview.html");
710 let mut json_paths: Vec<PathBuf> = results_by_example_name
711 .values()
712 .flat_map(|results| {
713 results.iter().map(|(example, _)| {
714 let absolute_path = run_dir.join(example.run_directory.join("last.messages.json"));
715 let cwd = std::env::current_dir().expect("Can't get current dir");
716 pathdiff::diff_paths(&absolute_path, cwd).unwrap_or_else(|| absolute_path.clone())
717 })
718 })
719 .collect::<Vec<_>>();
720 json_paths.sort();
721 if let Err(err) = explorer::generate_explorer_html(&json_paths, &explorer_output_path) {
722 eprintln!("Failed to generate explorer HTML: {}", err);
723 }
724
725 Ok(())
726}