1mod assertions;
2mod example;
3mod examples;
4mod explorer;
5mod ids;
6mod instance;
7mod tool_metrics;
8
9use assertions::{AssertionsReport, display_error_row};
10use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git};
11use language_extension::LspAccess;
12pub(crate) use tool_metrics::*;
13
14use ::fs::RealFs;
15use clap::Parser;
16use client::{Client, ProxySettings, UserStore};
17use collections::{HashMap, HashSet};
18use extension::ExtensionHostProxy;
19use futures::future;
20use gpui::http_client::read_proxy_from_env;
21use gpui::{App, AppContext, Application, AsyncApp, Entity, UpdateGlobal};
22use gpui_tokio::Tokio;
23use language::LanguageRegistry;
24use language_model::{ConfiguredModel, LanguageModel, LanguageModelRegistry, SelectedModel};
25use node_runtime::{NodeBinaryOptions, NodeRuntime};
26use project::project_settings::ProjectSettings;
27use prompt_store::PromptBuilder;
28use release_channel::{AppCommitSha, AppVersion};
29use reqwest_client::ReqwestClient;
30use settings::{Settings, SettingsStore};
31use std::cell::RefCell;
32use std::collections::VecDeque;
33use std::env;
34use std::path::{Path, PathBuf};
35use std::rc::Rc;
36use std::str::FromStr;
37use std::sync::{Arc, LazyLock};
38use util::ResultExt as _;
39
40static CARGO_MANIFEST_DIR: LazyLock<PathBuf> =
41 LazyLock::new(|| PathBuf::from(env!("CARGO_MANIFEST_DIR")));
42
43#[derive(Parser, Debug)]
44#[command(name = "eval", disable_version_flag = true)]
45struct Args {
46 /// Runs all examples and threads that contain these substrings. If unspecified, all examples and threads are run.
47 #[arg(value_name = "EXAMPLE_SUBSTRING")]
48 filter: Vec<String>,
49 /// provider/model to use for agent
50 #[arg(long, default_value = "anthropic/claude-3-7-sonnet-latest")]
51 model: String,
52 /// provider/model to use for judges
53 #[arg(long, default_value = "anthropic/claude-3-7-sonnet-latest")]
54 judge_model: String,
55 #[arg(long, value_delimiter = ',', default_value = "rs,ts,py")]
56 languages: Vec<String>,
57 /// How many times to run each example.
58 #[arg(long, default_value = "8")]
59 repetitions: usize,
60 /// Maximum number of examples to run concurrently.
61 #[arg(long, default_value = "4")]
62 concurrency: usize,
63 /// Output current environment variables as JSON to stdout
64 #[arg(long, hide = true)]
65 printenv: bool,
66}
67
68fn main() {
69 let args = Args::parse();
70
71 // This prevents errors showing up in the logs, because
72 // project::environment::load_shell_environment() calls
73 // std::env::current_exe().unwrap() --printenv
74 if args.printenv {
75 util::shell_env::print_env();
76 return;
77 }
78
79 dotenvy::from_filename(CARGO_MANIFEST_DIR.join(".env")).ok();
80
81 env_logger::init();
82
83 let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
84 let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
85 let session_id = uuid::Uuid::new_v4().to_string();
86 let run_timestamp = chrono::Local::now().format("%Y-%m-%d_%H-%M-%S");
87 let run_id = match env::var("GITHUB_RUN_ID") {
88 Ok(run_id) => format!("github/{}", run_id),
89 Err(_) => format!("local/{}", run_timestamp),
90 };
91
92 let root_dir = Path::new(std::env!("CARGO_MANIFEST_DIR"))
93 .parent()
94 .unwrap()
95 .parent()
96 .unwrap()
97 .canonicalize()
98 .unwrap();
99 let eval_crate_dir = root_dir.join("crates").join("eval");
100 let repos_dir = eval_crate_dir.join("repos");
101 let worktrees_dir = eval_crate_dir.join("worktrees");
102 let examples_dir = eval_crate_dir.join("src").join("examples");
103 let run_dir = eval_crate_dir
104 .join("runs")
105 .join(format!("{}", run_timestamp));
106 std::fs::create_dir_all(&run_dir).unwrap();
107 std::fs::create_dir_all(&repos_dir).unwrap();
108 std::fs::create_dir_all(&worktrees_dir).unwrap();
109 std::fs::create_dir_all(&examples_dir).unwrap();
110 std::fs::create_dir_all(&paths::config_dir()).unwrap();
111
112 let zed_commit_sha = commit_sha_for_path(&root_dir);
113 let zed_branch_name = git_branch_for_path(&root_dir);
114 let languages: HashSet<String> = args.languages.into_iter().collect();
115
116 let http_client = Arc::new(ReqwestClient::new());
117 let app = Application::headless().with_http_client(http_client);
118 let all_threads = examples::all(&examples_dir);
119
120 app.run(move |cx| {
121 let app_state = init(cx);
122
123 let telemetry = app_state.client.telemetry();
124 telemetry.start(system_id, installation_id, session_id, cx);
125
126 let enable_telemetry = env::var("ZED_EVAL_TELEMETRY").is_ok_and(|value| value == "1")
127 && telemetry.has_checksum_seed();
128 if enable_telemetry {
129 println!("Telemetry enabled");
130 telemetry::event!(
131 "Agent Eval Started",
132 zed_commit_sha = zed_commit_sha,
133 zed_branch_name = zed_branch_name,
134 run_id = run_id,
135 );
136 }
137
138 let mut cumulative_tool_metrics = ToolMetrics::default();
139
140 let tasks = LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
141 registry.providers().iter().map(|p| p.authenticate(cx)).collect::<Vec<_>>()
142 });
143
144 cx.spawn(async move |cx| {
145 future::join_all(tasks).await;
146 let judge_model = cx.update(|cx| {
147 let agent_model = load_model(&args.model, cx).unwrap();
148 let judge_model = load_model(&args.judge_model, cx).unwrap();
149 LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
150 registry.set_default_model(Some(agent_model.clone()), cx);
151 });
152 judge_model
153 })?;
154
155 let mut examples = Vec::new();
156
157 const COLORS: [&str; 12] = [
158 "\x1b[31m", // Red
159 "\x1b[32m", // Green
160 "\x1b[33m", // Yellow
161 "\x1b[34m", // Blue
162 "\x1b[35m", // Magenta
163 "\x1b[36m", // Cyan
164 "\x1b[91m", // Bright Red
165 "\x1b[92m", // Bright Green
166 "\x1b[93m", // Bright Yellow
167 "\x1b[94m", // Bright Blue
168 "\x1b[95m", // Bright Magenta
169 "\x1b[96m", // Bright Cyan
170 ];
171
172 let mut skipped = Vec::new();
173
174 for thread in all_threads {
175 let meta = thread.meta();
176 if !args.filter.is_empty() && !args.filter.iter().any(|sub| meta.name.contains(sub))
177 {
178 skipped.push(meta.name);
179 continue;
180 }
181
182 if let Some(language) = meta.language_server
183 && !languages.contains(&language.file_extension) {
184 panic!(
185 "Eval for {:?} could not be run because no language server was found for extension {:?}",
186 meta.name,
187 language.file_extension
188 );
189 }
190
191 // TODO: This creates a worktree per repetition. Ideally these examples should
192 // either be run sequentially on the same worktree, or reuse worktrees when there
193 // are more examples to run than the concurrency limit.
194 for repetition_number in 0..args.repetitions {
195 let example_instance = ExampleInstance::new(
196 thread.clone(),
197 &repos_dir,
198 &run_dir,
199 &worktrees_dir,
200 repetition_number,
201 );
202
203 examples.push(example_instance);
204 }
205 }
206
207 if !skipped.is_empty() {
208 println!("Skipped threads: {}", skipped.join(", "));
209 }
210
211 if examples.is_empty() {
212 eprintln!("Filter matched no examples");
213 return cx.update(|cx| cx.quit());
214 }
215
216 let mut repo_urls = HashSet::default();
217 let mut clone_tasks = Vec::new();
218
219 let max_name_width = examples
220 .iter()
221 .map(|e| e.worktree_name().len())
222 .max()
223 .unwrap_or(0);
224
225 for (i, example_instance) in examples.iter_mut().enumerate() {
226 let color = COLORS[i % COLORS.len()].to_string();
227 example_instance.set_log_prefix_style(&color, max_name_width);
228
229 println!(
230 "{}Logging to: {}",
231 example_instance.log_prefix,
232 example_instance.run_directory.display()
233 );
234
235 let repo_url = example_instance.repo_url();
236 if repo_urls.insert(repo_url.clone()) {
237 let repo_path = example_instance.repo_path.clone();
238
239 if !repo_path.join(".git").is_dir() {
240 println!(
241 "{:<width$} < {}",
242 "↓ Cloning",
243 repo_url,
244 width = max_name_width
245 );
246
247 let git_task = cx.spawn(async move |_cx| {
248 std::fs::create_dir_all(&repo_path)?;
249 run_git(&repo_path, &["init"]).await?;
250 run_git(&repo_path, &["remote", "add", "origin", &repo_url]).await
251 });
252
253 clone_tasks.push(git_task);
254 } else {
255 println!(
256 "{:<width$} < {}",
257 "✔︎ Already cloned",
258 repo_url,
259 width = max_name_width
260 );
261
262 let actual_origin =
263 run_git(&repo_path, &["remote", "get-url", "origin"]).await?;
264 anyhow::ensure!(
265 actual_origin == repo_url,
266 "remote origin {actual_origin} does not match expected origin {repo_url}"
267 );
268 }
269 }
270 }
271
272 future::join_all(clone_tasks).await;
273
274 for example_instance in examples.iter_mut() {
275 example_instance.fetch().await?;
276 }
277
278 let examples = Rc::new(RefCell::new(VecDeque::from(examples)));
279 let results_by_example_name = Rc::new(RefCell::new(HashMap::default()));
280
281 future::join_all((0..args.concurrency).map(|_| {
282 let app_state = app_state.clone();
283 let judge_model = judge_model.model.clone();
284 let zed_commit_sha = zed_commit_sha.clone();
285 let zed_branch_name = zed_branch_name.clone();
286 let run_id = run_id.clone();
287 let examples = examples.clone();
288 let results = results_by_example_name.clone();
289 cx.spawn(async move |cx| {
290 loop {
291 let Some(mut example) = examples.borrow_mut().pop_front() else {
292 break;
293 };
294 let result = async {
295 example.setup().await?;
296 let run_output = cx
297 .update(|cx| example.run(app_state.clone(), cx))?
298 .await?;
299 let judge_output = judge_example(
300 example.clone(),
301 judge_model.clone(),
302 &zed_commit_sha,
303 &zed_branch_name,
304 &run_id,
305 &run_output,
306 enable_telemetry,
307 cx,
308 )
309 .await;
310 anyhow::Ok((run_output, judge_output))
311 }
312 .await;
313 results
314 .borrow_mut()
315 .entry(example.name.clone())
316 .or_insert(Vec::new())
317 .push((example.clone(), result));
318 }
319 })
320 }))
321 .await;
322
323 print_report(
324 &mut results_by_example_name.borrow_mut(),
325 &mut cumulative_tool_metrics,
326 &run_dir,
327 )?;
328
329 app_state.client.telemetry().flush_events().await;
330
331 cx.update(|cx| cx.quit())
332 })
333 .detach_and_log_err(cx);
334 });
335}
336
337/// Subset of `workspace::AppState` needed by `HeadlessAssistant`, with additional fields.
338pub struct AgentAppState {
339 pub languages: Arc<LanguageRegistry>,
340 pub client: Arc<Client>,
341 pub user_store: Entity<UserStore>,
342 pub fs: Arc<dyn fs::Fs>,
343 pub node_runtime: NodeRuntime,
344
345 // Additional fields not present in `workspace::AppState`.
346 pub prompt_builder: Arc<PromptBuilder>,
347}
348
349pub fn init(cx: &mut App) -> Arc<AgentAppState> {
350 let app_commit_sha = option_env!("ZED_COMMIT_SHA").map(|s| AppCommitSha::new(s.to_owned()));
351
352 let app_version = AppVersion::load(
353 env!("ZED_PKG_VERSION"),
354 option_env!("ZED_BUILD_ID"),
355 app_commit_sha,
356 );
357
358 release_channel::init(app_version.clone(), cx);
359 gpui_tokio::init(cx);
360
361 let settings_store = SettingsStore::new(cx, &settings::default_settings());
362 cx.set_global(settings_store);
363
364 // Set User-Agent so we can download language servers from GitHub
365 let user_agent = format!(
366 "Zed Agent Eval/{} ({}; {})",
367 app_version,
368 std::env::consts::OS,
369 std::env::consts::ARCH
370 );
371 let proxy_str = ProxySettings::get_global(cx).proxy.to_owned();
372 let proxy_url = proxy_str
373 .as_ref()
374 .and_then(|input| input.parse().ok())
375 .or_else(read_proxy_from_env);
376 let http = {
377 let _guard = Tokio::handle(cx).enter();
378
379 ReqwestClient::proxy_and_user_agent(proxy_url, &user_agent)
380 .expect("could not start HTTP client")
381 };
382 cx.set_http_client(Arc::new(http));
383
384 let client = Client::production(cx);
385 cx.set_http_client(client.http_client());
386
387 let git_binary_path = None;
388 let fs = Arc::new(RealFs::new(
389 git_binary_path,
390 cx.background_executor().clone(),
391 ));
392
393 let mut languages = LanguageRegistry::new(cx.background_executor().clone());
394 languages.set_language_server_download_dir(paths::languages_dir().clone());
395 let languages = Arc::new(languages);
396
397 let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
398
399 extension::init(cx);
400
401 let (mut tx, rx) = watch::channel(None);
402 cx.observe_global::<SettingsStore>(move |cx| {
403 let settings = &ProjectSettings::get_global(cx).node;
404 let options = NodeBinaryOptions {
405 allow_path_lookup: !settings.ignore_system_version,
406 allow_binary_download: true,
407 use_paths: settings.path.as_ref().map(|node_path| {
408 let node_path = PathBuf::from(shellexpand::tilde(node_path).as_ref());
409 let npm_path = settings
410 .npm_path
411 .as_ref()
412 .map(|path| PathBuf::from(shellexpand::tilde(&path).as_ref()));
413 (
414 node_path.clone(),
415 npm_path.unwrap_or_else(|| {
416 let base_path = PathBuf::new();
417 node_path.parent().unwrap_or(&base_path).join("npm")
418 }),
419 )
420 }),
421 };
422 tx.send(Some(options)).log_err();
423 })
424 .detach();
425 let node_runtime = NodeRuntime::new(client.http_client(), None, rx);
426
427 let extension_host_proxy = ExtensionHostProxy::global(cx);
428 debug_adapter_extension::init(extension_host_proxy.clone(), cx);
429 language_extension::init(LspAccess::Noop, extension_host_proxy, languages.clone());
430 language_model::init(client.clone(), cx);
431 language_models::init(user_store.clone(), client.clone(), cx);
432 languages::init(languages.clone(), fs.clone(), node_runtime.clone(), cx);
433 prompt_store::init(cx);
434 terminal_view::init(cx);
435 let stdout_is_a_pty = false;
436 let prompt_builder = PromptBuilder::load(fs.clone(), stdout_is_a_pty, cx);
437 agent_ui::init(
438 fs.clone(),
439 client.clone(),
440 prompt_builder.clone(),
441 languages.clone(),
442 true,
443 cx,
444 );
445
446 SettingsStore::update_global(cx, |store, cx| {
447 store.set_user_settings(include_str!("../runner_settings.json"), cx)
448 })
449 .unwrap();
450
451 Arc::new(AgentAppState {
452 languages,
453 client,
454 user_store,
455 fs,
456 node_runtime,
457 prompt_builder,
458 })
459}
460
461pub fn find_model(
462 model_name: &str,
463 model_registry: &LanguageModelRegistry,
464 cx: &App,
465) -> anyhow::Result<Arc<dyn LanguageModel>> {
466 let selected = SelectedModel::from_str(model_name).map_err(|e| anyhow::anyhow!(e))?;
467 model_registry
468 .available_models(cx)
469 .find(|model| model.id() == selected.model && model.provider_id() == selected.provider)
470 .ok_or_else(|| {
471 anyhow::anyhow!(
472 "No language model with ID {}/{} was available. Available models: {}",
473 selected.provider.0,
474 selected.model.0,
475 model_registry
476 .available_models(cx)
477 .map(|model| format!("{}/{}", model.provider_id().0, model.id().0))
478 .collect::<Vec<_>>()
479 .join(", ")
480 )
481 })
482}
483
484pub fn load_model(model_name: &str, cx: &mut App) -> anyhow::Result<ConfiguredModel> {
485 let model = {
486 let model_registry = LanguageModelRegistry::read_global(cx);
487 find_model(model_name, model_registry, cx)?
488 };
489
490 let provider = {
491 let model_registry = LanguageModelRegistry::read_global(cx);
492 model_registry
493 .provider(&model.provider_id())
494 .ok_or_else(|| anyhow::anyhow!("Provider not found: {}", model.provider_id()))?
495 };
496
497 Ok(ConfiguredModel {
498 provider: provider.clone(),
499 model: model.clone(),
500 })
501}
502
503pub fn commit_sha_for_path(repo_path: &Path) -> String {
504 futures::executor::block_on(run_git(repo_path, &["rev-parse", "HEAD"])).unwrap()
505}
506
507pub fn git_branch_for_path(repo_path: &Path) -> String {
508 match std::env::var("GITHUB_REF_NAME") {
509 Ok(branch) => branch,
510 Err(_) => {
511 futures::executor::block_on(run_git(repo_path, &["rev-parse", "--abbrev-ref", "HEAD"]))
512 .unwrap_or_else(|_| "unknown".to_string())
513 }
514 }
515}
516
517async fn judge_example(
518 example: ExampleInstance,
519 model: Arc<dyn LanguageModel>,
520 zed_commit_sha: &str,
521 zed_branch_name: &str,
522 run_id: &str,
523 run_output: &RunOutput,
524 enable_telemetry: bool,
525 cx: &AsyncApp,
526) -> JudgeOutput {
527 let judge_output = example.judge(model.clone(), run_output, cx).await;
528
529 if enable_telemetry {
530 telemetry::event!(
531 "Agent Example Evaluated",
532 zed_commit_sha = zed_commit_sha,
533 zed_branch_name = zed_branch_name,
534 run_id = run_id,
535 example_name = example.name.clone(),
536 example_repetition = example.repetition,
537 diff_evaluation = judge_output.diff.clone(),
538 thread_evaluation = judge_output.thread,
539 tool_metrics = run_output.tool_metrics,
540 token_usage = run_output.token_usage,
541 model = model.telemetry_id(),
542 model_provider = model.provider_id().to_string(),
543 repository_url = example.repo_url(),
544 repository_revision = example.revision(),
545 diagnostic_summary_before = run_output.diagnostic_summary_before,
546 diagnostic_summary_after = run_output.diagnostic_summary_after,
547 diagnostics_before = run_output.diagnostics_before,
548 diagnostics_after = run_output.diagnostics_after,
549 );
550 }
551
552 judge_output
553}
554
/// Width, in characters, of the rule lines and centered titles printed by the
/// report headers.
const HEADER_WIDTH: usize = 65;

/// Prints a top-level header: two blank lines, then `header` centered between
/// two rules of `=`, then a blank line.
fn print_h1(header: &str) {
    let rule = "=".repeat(HEADER_WIDTH);
    println!("\n\n{}", rule);
    println!("{:^HEADER_WIDTH$}", header);
    println!("{}\n", rule);
}
562
563fn print_h2(header: &str) {
564 println!("\n{:-^HEADER_WIDTH$}", "");
565 println!("{:^HEADER_WIDTH$}", header);
566 println!("{:-^HEADER_WIDTH$}\n", "");
567}
568
569fn print_report(
570 results_by_example_name: &mut HashMap<
571 String,
572 Vec<(ExampleInstance, anyhow::Result<(RunOutput, JudgeOutput)>)>,
573 >,
574 cumulative_tool_metrics: &mut ToolMetrics,
575 run_dir: &Path,
576) -> anyhow::Result<()> {
577 print_h1("EVAL RESULTS");
578
579 let mut diff_scores = Vec::new();
580 let mut thread_scores = Vec::new();
581 let mut programmatic_scores = Vec::new();
582 let mut error_count = 0;
583
584 for (example_name, results) in results_by_example_name.iter_mut() {
585 print_h2(example_name);
586
587 results.sort_unstable_by_key(|(example, _)| example.repetition);
588 let mut example_cumulative_tool_metrics = ToolMetrics::default();
589
590 let mut table_rows = String::new();
591
592 for (example, result) in results.iter() {
593 match result {
594 Err(err) => {
595 display_error_row(&mut table_rows, example.repetition, err.to_string())?;
596 error_count += 1;
597 programmatic_scores.push(0.0);
598 diff_scores.push(0.0);
599 thread_scores.push(0.0);
600 }
601 Ok((run_output, judge_output)) => {
602 cumulative_tool_metrics.merge(&run_output.tool_metrics);
603 example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
604
605 if run_output.programmatic_assertions.total_count() > 0 {
606 for assertion in &run_output.programmatic_assertions.ran {
607 assertions::display_table_row(
608 &mut table_rows,
609 example.repetition,
610 assertion,
611 )?;
612 }
613
614 programmatic_scores
615 .push(run_output.programmatic_assertions.passed_percentage())
616 }
617
618 if !judge_output.diff.is_empty() {
619 diff_scores.push(judge_output.diff.passed_percentage());
620
621 for assertion in &judge_output.diff.ran {
622 assertions::display_table_row(
623 &mut table_rows,
624 example.repetition,
625 assertion,
626 )?;
627 }
628 }
629
630 if !judge_output.thread.is_empty() {
631 thread_scores.push(judge_output.thread.passed_percentage());
632
633 for assertion in &judge_output.thread.ran {
634 assertions::display_table_row(
635 &mut table_rows,
636 example.repetition,
637 assertion,
638 )?;
639 }
640 }
641 }
642 }
643 }
644
645 let mut all_asserts = Vec::new();
646
647 if !table_rows.is_empty() {
648 assertions::print_table_header();
649 print!("{}", table_rows);
650
651 assertions::print_table_divider();
652
653 for (example, result) in results.iter() {
654 if let Ok((run_output, judge_output)) = result {
655 let asserts = [
656 run_output.programmatic_assertions.clone(),
657 judge_output.diff.clone(),
658 judge_output.thread.clone(),
659 ];
660 all_asserts.extend_from_slice(&asserts);
661 assertions::print_table_round_summary(
662 &example.repetition.to_string(),
663 asserts.iter(),
664 )
665 } else if let Err(err) = result {
666 let assert = AssertionsReport::error(err.to_string());
667 all_asserts.push(assert.clone());
668 assertions::print_table_round_summary(
669 &example.repetition.to_string(),
670 [assert].iter(),
671 )
672 }
673 }
674
675 assertions::print_table_divider();
676
677 assertions::print_table_round_summary("avg", all_asserts.iter());
678
679 assertions::print_table_footer();
680 }
681
682 if !example_cumulative_tool_metrics.is_empty() {
683 println!("{}", &example_cumulative_tool_metrics);
684 }
685 }
686
687 if results_by_example_name.len() > 1 {
688 print_h1("AGGREGATE");
689
690 if error_count > 0 {
691 println!("\n{error_count} examples failed to run!");
692 }
693
694 let programmatic_score_count = programmatic_scores.len();
695 if programmatic_score_count > 0 {
696 let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
697 / (programmatic_score_count as f32))
698 .floor();
699 println!("Average programmatic score: {average_programmatic_score}%");
700 }
701
702 let diff_score_count = diff_scores.len();
703 if diff_score_count > 0 {
704 let average_diff_score =
705 (diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
706 println!("Average diff score: {average_diff_score}%");
707 }
708
709 let thread_score_count = thread_scores.len();
710
711 if thread_score_count > 0 {
712 let average_thread_score =
713 (thread_scores.into_iter().sum::<f32>() / (thread_score_count as f32)).floor();
714 println!("Average thread score: {average_thread_score}%");
715 }
716
717 println!();
718
719 print_h2("CUMULATIVE TOOL METRICS");
720 println!("{}", cumulative_tool_metrics);
721 }
722
723 let explorer_output_path = run_dir.join("overview.html");
724 let mut json_paths: Vec<PathBuf> = results_by_example_name
725 .values()
726 .flat_map(|results| {
727 results.iter().map(|(example, _)| {
728 let absolute_path = run_dir.join(example.run_directory.join("last.messages.json"));
729 let cwd = std::env::current_dir().expect("Can't get current dir");
730 pathdiff::diff_paths(&absolute_path, cwd).unwrap_or_else(|| absolute_path.clone())
731 })
732 })
733 .collect::<Vec<_>>();
734 json_paths.sort();
735 if let Err(err) = explorer::generate_explorer_html(&json_paths, &explorer_output_path) {
736 eprintln!("Failed to generate explorer HTML: {}", err);
737 }
738
739 Ok(())
740}