eval.rs

  1mod assertions;
  2mod example;
  3mod examples;
  4mod explorer;
  5mod ids;
  6mod instance;
  7mod tool_metrics;
  8
  9use assertions::{AssertionsReport, display_error_row};
 10use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git};
 11pub(crate) use tool_metrics::*;
 12
 13use ::fs::RealFs;
 14use clap::Parser;
 15use client::{Client, ProxySettings, UserStore};
 16use collections::{HashMap, HashSet};
 17use extension::ExtensionHostProxy;
 18use futures::future;
 19use gpui::http_client::read_proxy_from_env;
 20use gpui::{App, AppContext, Application, AsyncApp, Entity, SemanticVersion, UpdateGlobal};
 21use gpui_tokio::Tokio;
 22use language::LanguageRegistry;
 23use language_model::{ConfiguredModel, LanguageModel, LanguageModelRegistry, SelectedModel};
 24use node_runtime::{NodeBinaryOptions, NodeRuntime};
 25use project::Project;
 26use project::project_settings::ProjectSettings;
 27use prompt_store::PromptBuilder;
 28use release_channel::AppVersion;
 29use reqwest_client::ReqwestClient;
 30use settings::{Settings, SettingsStore};
 31use std::cell::RefCell;
 32use std::collections::VecDeque;
 33use std::env;
 34use std::path::{Path, PathBuf};
 35use std::rc::Rc;
 36use std::str::FromStr;
 37use std::sync::{Arc, LazyLock};
 38use util::ResultExt as _;
 39
 40static CARGO_MANIFEST_DIR: LazyLock<PathBuf> =
 41    LazyLock::new(|| PathBuf::from(env!("CARGO_MANIFEST_DIR")));
 42
 43#[derive(Parser, Debug)]
 44#[command(name = "eval", disable_version_flag = true)]
 45struct Args {
 46    /// Runs all examples and threads that contain these substrings. If unspecified, all examples and threads are run.
 47    #[arg(value_name = "EXAMPLE_SUBSTRING")]
 48    filter: Vec<String>,
 49    /// provider/model to use for agent
 50    #[arg(long, default_value = "anthropic/claude-3-7-sonnet-latest")]
 51    model: String,
 52    /// provider/model to use for judges
 53    #[arg(long, default_value = "anthropic/claude-3-7-sonnet-latest")]
 54    judge_model: String,
 55    #[arg(long, value_delimiter = ',', default_value = "rs,ts,py")]
 56    languages: Vec<String>,
 57    /// How many times to run each example.
 58    #[arg(long, default_value = "8")]
 59    repetitions: usize,
 60    /// Maximum number of examples to run concurrently.
 61    #[arg(long, default_value = "4")]
 62    concurrency: usize,
 63}
 64
 65fn main() {
 66    dotenv::from_filename(CARGO_MANIFEST_DIR.join(".env")).ok();
 67
 68    env_logger::init();
 69
 70    let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
 71    let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
 72    let session_id = uuid::Uuid::new_v4().to_string();
 73    let run_timestamp = chrono::Local::now().format("%Y-%m-%d_%H-%M-%S");
 74    let run_id = match env::var("GITHUB_RUN_ID") {
 75        Ok(run_id) => format!("github/{}", run_id),
 76        Err(_) => format!("local/{}", run_timestamp),
 77    };
 78
 79    let root_dir = Path::new(std::env!("CARGO_MANIFEST_DIR"))
 80        .parent()
 81        .unwrap()
 82        .parent()
 83        .unwrap()
 84        .canonicalize()
 85        .unwrap();
 86    let eval_crate_dir = root_dir.join("crates").join("eval");
 87    let repos_dir = eval_crate_dir.join("repos");
 88    let worktrees_dir = eval_crate_dir.join("worktrees");
 89    let examples_dir = eval_crate_dir.join("src").join("examples");
 90    let run_dir = eval_crate_dir
 91        .join("runs")
 92        .join(format!("{}", run_timestamp));
 93    std::fs::create_dir_all(&run_dir).unwrap();
 94    std::fs::create_dir_all(&repos_dir).unwrap();
 95    std::fs::create_dir_all(&worktrees_dir).unwrap();
 96    std::fs::create_dir_all(&examples_dir).unwrap();
 97    std::fs::create_dir_all(&paths::config_dir()).unwrap();
 98
 99    let zed_commit_sha = commit_sha_for_path(&root_dir);
100    let zed_branch_name = git_branch_for_path(&root_dir);
101    let args = Args::parse();
102    let languages: HashSet<String> = args.languages.into_iter().collect();
103
104    let http_client = Arc::new(ReqwestClient::new());
105    let app = Application::headless().with_http_client(http_client.clone());
106    let all_threads = examples::all(&examples_dir);
107
108    app.run(move |cx| {
109        let app_state = init(cx);
110
111        let telemetry = app_state.client.telemetry();
112        telemetry.start(system_id, installation_id, session_id, cx);
113
114        let enable_telemetry = env::var("ZED_EVAL_TELEMETRY").map_or(false, |value| value == "1")
115            && telemetry.has_checksum_seed();
116        if enable_telemetry {
117            println!("Telemetry enabled");
118            telemetry::event!(
119                "Agent Eval Started",
120                zed_commit_sha = zed_commit_sha,
121                zed_branch_name = zed_branch_name,
122                run_id = run_id,
123            );
124        }
125
126        let mut cumulative_tool_metrics = ToolMetrics::default();
127
128        let agent_model = load_model(&args.model, cx).unwrap();
129        let judge_model = load_model(&args.judge_model, cx).unwrap();
130
131        LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
132            registry.set_default_model(Some(agent_model.clone()), cx);
133        });
134
135        let auth1 = agent_model.provider.authenticate(cx);
136        let auth2 = judge_model.provider.authenticate(cx);
137
138        cx.spawn(async move |cx| {
139            auth1.await?;
140            auth2.await?;
141
142            let mut examples = Vec::new();
143
144            const COLORS: [&str; 12] = [
145                "\x1b[31m", // Red
146                "\x1b[32m", // Green
147                "\x1b[33m", // Yellow
148                "\x1b[34m", // Blue
149                "\x1b[35m", // Magenta
150                "\x1b[36m", // Cyan
151                "\x1b[91m", // Bright Red
152                "\x1b[92m", // Bright Green
153                "\x1b[93m", // Bright Yellow
154                "\x1b[94m", // Bright Blue
155                "\x1b[95m", // Bright Magenta
156                "\x1b[96m", // Bright Cyan
157            ];
158
159            let mut skipped = Vec::new();
160
161            for thread in all_threads {
162                let meta = thread.meta();
163                if !args.filter.is_empty() && !args.filter.iter().any(|sub| meta.name.contains(sub))
164                {
165                    skipped.push(meta.name);
166                    continue;
167                }
168
169                if let Some(language) = meta.language_server {
170                    if !languages.contains(&language.file_extension) {
171                        panic!(
172                            "Eval for {:?} could not be run because no language server was found for extension {:?}",
173                            meta.name,
174                            language.file_extension
175                        );
176                    }
177                }
178
179                // TODO: This creates a worktree per repetition. Ideally these examples should
180                // either be run sequentially on the same worktree, or reuse worktrees when there
181                // are more examples to run than the concurrency limit.
182                for repetition_number in 0..args.repetitions {
183                    let example_instance = ExampleInstance::new(
184                        thread.clone(),
185                        &repos_dir,
186                        &run_dir,
187                        &worktrees_dir,
188                        repetition_number,
189                    );
190
191                    examples.push(example_instance);
192                }
193            }
194
195            if !skipped.is_empty() {
196                println!("Skipped threads: {}", skipped.join(", "));
197            }
198
199            if examples.is_empty() {
200                eprintln!("Filter matched no examples");
201                return cx.update(|cx| cx.quit());
202            }
203
204            let mut repo_urls = HashSet::default();
205            let mut clone_tasks = Vec::new();
206
207            let max_name_width = examples
208                .iter()
209                .map(|e| e.worktree_name().len())
210                .max()
211                .unwrap_or(0);
212
213            for (i, example_instance) in examples.iter_mut().enumerate() {
214                let color = COLORS[i % COLORS.len()].to_string();
215                example_instance.set_log_prefix_style(&color, max_name_width);
216
217                println!(
218                    "{}Logging to: {}",
219                    example_instance.log_prefix,
220                    example_instance.run_directory.display()
221                );
222
223                let repo_url = example_instance.repo_url();
224                if repo_urls.insert(repo_url.clone()) {
225                    let repo_path = example_instance.repo_path.clone();
226
227                    if !repo_path.join(".git").is_dir() {
228                        println!(
229                            "{:<width$} < {}",
230                            "↓ Cloning",
231                            repo_url,
232                            width = max_name_width
233                        );
234
235                        let git_task = cx.spawn(async move |_cx| {
236                            std::fs::create_dir_all(&repo_path)?;
237                            run_git(&repo_path, &["init"]).await?;
238                            run_git(&repo_path, &["remote", "add", "origin", &repo_url]).await
239                        });
240
241                        clone_tasks.push(git_task);
242                    } else {
243                        println!(
244                            "{:<width$}  < {}",
245                            "✔︎ Already cloned",
246                            repo_url,
247                            width = max_name_width
248                        );
249
250                        let actual_origin =
251                            run_git(&repo_path, &["remote", "get-url", "origin"]).await?;
252                        anyhow::ensure!(
253                            actual_origin == repo_url,
254                            "remote origin {actual_origin} does not match expected origin {repo_url}"
255                        );
256                    }
257                }
258            }
259
260            future::join_all(clone_tasks).await;
261
262            for example_instance in examples.iter_mut() {
263                example_instance.fetch().await?;
264            }
265
266            let examples = Rc::new(RefCell::new(VecDeque::from(examples)));
267            let results_by_example_name = Rc::new(RefCell::new(HashMap::default()));
268
269            future::join_all((0..args.concurrency).map(|_| {
270                let app_state = app_state.clone();
271                let model = agent_model.model.clone();
272                let judge_model = judge_model.model.clone();
273                let zed_commit_sha = zed_commit_sha.clone();
274                let zed_branch_name = zed_branch_name.clone();
275                let run_id = run_id.clone();
276                let examples = examples.clone();
277                let results = results_by_example_name.clone();
278                cx.spawn(async move |cx| {
279                    loop {
280                        let Some(mut example) = examples.borrow_mut().pop_front() else {
281                            break;
282                        };
283                        let result = async {
284                            example.setup().await?;
285                            let run_output = cx
286                                .update(|cx| example.run(model.clone(), app_state.clone(), cx))?
287                                .await?;
288                            let judge_output = judge_example(
289                                example.clone(),
290                                judge_model.clone(),
291                                &zed_commit_sha,
292                                &zed_branch_name,
293                                &run_id,
294                                &run_output,
295                                enable_telemetry,
296                                cx,
297                            )
298                            .await;
299                            anyhow::Ok((run_output, judge_output))
300                        }
301                        .await;
302                        results
303                            .borrow_mut()
304                            .entry(example.name.clone())
305                            .or_insert(Vec::new())
306                            .push((example.clone(), result));
307                    }
308                })
309            }))
310            .await;
311
312            print_report(
313                &mut results_by_example_name.borrow_mut(),
314                &mut cumulative_tool_metrics,
315                &run_dir,
316            )?;
317
318            app_state.client.telemetry().flush_events().await;
319
320            cx.update(|cx| cx.quit())
321        })
322        .detach_and_log_err(cx);
323    });
324}
325
326/// Subset of `workspace::AppState` needed by `HeadlessAssistant`, with additional fields.
327pub struct AgentAppState {
328    pub languages: Arc<LanguageRegistry>,
329    pub client: Arc<Client>,
330    pub user_store: Entity<UserStore>,
331    pub fs: Arc<dyn fs::Fs>,
332    pub node_runtime: NodeRuntime,
333
334    // Additional fields not present in `workspace::AppState`.
335    pub prompt_builder: Arc<PromptBuilder>,
336}
337
338pub fn init(cx: &mut App) -> Arc<AgentAppState> {
339    release_channel::init(SemanticVersion::default(), cx);
340    gpui_tokio::init(cx);
341
342    let mut settings_store = SettingsStore::new(cx);
343    settings_store
344        .set_default_settings(settings::default_settings().as_ref(), cx)
345        .unwrap();
346    cx.set_global(settings_store);
347    client::init_settings(cx);
348
349    // Set User-Agent so we can download language servers from GitHub
350    let user_agent = format!(
351        "Zed/{} ({}; {})",
352        AppVersion::global(cx),
353        std::env::consts::OS,
354        std::env::consts::ARCH
355    );
356    let proxy_str = ProxySettings::get_global(cx).proxy.to_owned();
357    let proxy_url = proxy_str
358        .as_ref()
359        .and_then(|input| input.parse().ok())
360        .or_else(read_proxy_from_env);
361    let http = {
362        let _guard = Tokio::handle(cx).enter();
363
364        ReqwestClient::proxy_and_user_agent(proxy_url, &user_agent)
365            .expect("could not start HTTP client")
366    };
367    cx.set_http_client(Arc::new(http));
368
369    Project::init_settings(cx);
370
371    let client = Client::production(cx);
372    cx.set_http_client(client.http_client());
373
374    let git_binary_path = None;
375    let fs = Arc::new(RealFs::new(
376        git_binary_path,
377        cx.background_executor().clone(),
378    ));
379
380    let mut languages = LanguageRegistry::new(cx.background_executor().clone());
381    languages.set_language_server_download_dir(paths::languages_dir().clone());
382    let languages = Arc::new(languages);
383
384    let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
385
386    extension::init(cx);
387
388    let (tx, rx) = async_watch::channel(None);
389    cx.observe_global::<SettingsStore>(move |cx| {
390        let settings = &ProjectSettings::get_global(cx).node;
391        let options = NodeBinaryOptions {
392            allow_path_lookup: !settings.ignore_system_version,
393            allow_binary_download: true,
394            use_paths: settings.path.as_ref().map(|node_path| {
395                let node_path = PathBuf::from(shellexpand::tilde(node_path).as_ref());
396                let npm_path = settings
397                    .npm_path
398                    .as_ref()
399                    .map(|path| PathBuf::from(shellexpand::tilde(&path).as_ref()));
400                (
401                    node_path.clone(),
402                    npm_path.unwrap_or_else(|| {
403                        let base_path = PathBuf::new();
404                        node_path.parent().unwrap_or(&base_path).join("npm")
405                    }),
406                )
407            }),
408        };
409        tx.send(Some(options)).log_err();
410    })
411    .detach();
412    let node_runtime = NodeRuntime::new(client.http_client(), None, rx);
413
414    let extension_host_proxy = ExtensionHostProxy::global(cx);
415
416    language::init(cx);
417    debug_adapter_extension::init(extension_host_proxy.clone(), cx);
418    language_extension::init(extension_host_proxy.clone(), languages.clone());
419    language_model::init(client.clone(), cx);
420    language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
421    languages::init(languages.clone(), node_runtime.clone(), cx);
422    prompt_store::init(cx);
423    terminal_view::init(cx);
424    let stdout_is_a_pty = false;
425    let prompt_builder = PromptBuilder::load(fs.clone(), stdout_is_a_pty, cx);
426    agent::init(
427        fs.clone(),
428        client.clone(),
429        prompt_builder.clone(),
430        languages.clone(),
431        true,
432        cx,
433    );
434    assistant_tools::init(client.http_client(), cx);
435
436    SettingsStore::update_global(cx, |store, cx| {
437        store.set_user_settings(include_str!("../runner_settings.json"), cx)
438    })
439    .unwrap();
440
441    Arc::new(AgentAppState {
442        languages,
443        client,
444        user_store,
445        fs,
446        node_runtime,
447        prompt_builder,
448    })
449}
450
451pub fn find_model(
452    model_name: &str,
453    model_registry: &LanguageModelRegistry,
454    cx: &App,
455) -> anyhow::Result<Arc<dyn LanguageModel>> {
456    let selected = SelectedModel::from_str(model_name).map_err(|e| anyhow::anyhow!(e))?;
457    model_registry
458        .available_models(cx)
459        .find(|model| model.id() == selected.model && model.provider_id() == selected.provider)
460        .ok_or_else(|| {
461            anyhow::anyhow!(
462                "No language model with ID {}/{} was available. Available models: {}",
463                selected.model.0,
464                selected.provider.0,
465                model_registry
466                    .available_models(cx)
467                    .map(|model| format!("{}/{}", model.provider_id().0, model.id().0))
468                    .collect::<Vec<_>>()
469                    .join(", ")
470            )
471        })
472}
473
474pub fn load_model(model_name: &str, cx: &mut App) -> anyhow::Result<ConfiguredModel> {
475    let model = {
476        let model_registry = LanguageModelRegistry::read_global(cx);
477        find_model(model_name, model_registry, cx)?
478    };
479
480    let provider = {
481        let model_registry = LanguageModelRegistry::read_global(cx);
482        model_registry
483            .provider(&model.provider_id())
484            .ok_or_else(|| anyhow::anyhow!("Provider not found: {}", model.provider_id()))?
485    };
486
487    Ok(ConfiguredModel {
488        provider: provider.clone(),
489        model: model.clone(),
490    })
491}
492
493pub fn commit_sha_for_path(repo_path: &Path) -> String {
494    futures::executor::block_on(run_git(repo_path, &["rev-parse", "HEAD"])).unwrap()
495}
496
497pub fn git_branch_for_path(repo_path: &Path) -> String {
498    match std::env::var("GITHUB_REF_NAME") {
499        Ok(branch) => branch,
500        Err(_) => {
501            futures::executor::block_on(run_git(repo_path, &["rev-parse", "--abbrev-ref", "HEAD"]))
502                .unwrap_or_else(|_| "unknown".to_string())
503        }
504    }
505}
506
507async fn judge_example(
508    example: ExampleInstance,
509    model: Arc<dyn LanguageModel>,
510    zed_commit_sha: &str,
511    zed_branch_name: &str,
512    run_id: &str,
513    run_output: &RunOutput,
514    enable_telemetry: bool,
515    cx: &AsyncApp,
516) -> JudgeOutput {
517    let judge_output = example.judge(model.clone(), &run_output, cx).await;
518
519    if enable_telemetry {
520        telemetry::event!(
521            "Agent Example Evaluated",
522            zed_commit_sha = zed_commit_sha,
523            zed_branch_name = zed_branch_name,
524            run_id = run_id,
525            example_name = example.name.clone(),
526            example_repetition = example.repetition,
527            diff_evaluation = judge_output.diff.clone(),
528            thread_evaluation = judge_output.thread.clone(),
529            tool_metrics = run_output.tool_metrics,
530            response_count = run_output.response_count,
531            token_usage = run_output.token_usage,
532            model = model.telemetry_id(),
533            model_provider = model.provider_id().to_string(),
534            repository_url = example.repo_url(),
535            repository_revision = example.revision(),
536            diagnostic_summary_before = run_output.diagnostic_summary_before,
537            diagnostic_summary_after = run_output.diagnostic_summary_after,
538            diagnostics_before = run_output.diagnostics_before,
539            diagnostics_after = run_output.diagnostics_after,
540        );
541    }
542
543    judge_output
544}
545
/// Width, in characters, of the report headers.
const HEADER_WIDTH: usize = 65;

/// Prints `header` centered between two `=` rules, preceded by two blank lines and followed by
/// one.
fn print_h1(header: &str) {
    let rule = "=".repeat(HEADER_WIDTH);
    println!("\n\n{rule}");
    println!("{header:^HEADER_WIDTH$}");
    println!("{rule}\n");
}
553
554fn print_h2(header: &str) {
555    println!("\n{:-^HEADER_WIDTH$}", "");
556    println!("{:^HEADER_WIDTH$}", header);
557    println!("{:-^HEADER_WIDTH$}\n", "");
558}
559
560fn print_report(
561    results_by_example_name: &mut HashMap<
562        String,
563        Vec<(ExampleInstance, anyhow::Result<(RunOutput, JudgeOutput)>)>,
564    >,
565    cumulative_tool_metrics: &mut ToolMetrics,
566    run_dir: &Path,
567) -> anyhow::Result<()> {
568    print_h1("EVAL RESULTS");
569
570    let mut diff_scores = Vec::new();
571    let mut thread_scores = Vec::new();
572    let mut programmatic_scores = Vec::new();
573    let mut error_count = 0;
574
575    for (example_name, results) in results_by_example_name.iter_mut() {
576        print_h2(example_name);
577
578        results.sort_unstable_by_key(|(example, _)| example.repetition);
579        let mut example_cumulative_tool_metrics = ToolMetrics::default();
580
581        let mut table_rows = String::new();
582
583        for (example, result) in results.iter() {
584            match result {
585                Err(err) => {
586                    display_error_row(&mut table_rows, example.repetition, err.to_string())?;
587                    error_count += 1;
588                    programmatic_scores.push(0.0);
589                    diff_scores.push(0.0);
590                    thread_scores.push(0.0);
591                }
592                Ok((run_output, judge_output)) => {
593                    cumulative_tool_metrics.merge(&run_output.tool_metrics);
594                    example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
595
596                    if run_output.programmatic_assertions.total_count() > 0 {
597                        for assertion in &run_output.programmatic_assertions.ran {
598                            assertions::display_table_row(
599                                &mut table_rows,
600                                example.repetition,
601                                assertion,
602                            )?;
603                        }
604
605                        programmatic_scores
606                            .push(run_output.programmatic_assertions.passed_percentage())
607                    }
608
609                    if !judge_output.diff.is_empty() {
610                        diff_scores.push(judge_output.diff.passed_percentage());
611
612                        for assertion in &judge_output.diff.ran {
613                            assertions::display_table_row(
614                                &mut table_rows,
615                                example.repetition,
616                                assertion,
617                            )?;
618                        }
619                    }
620
621                    if !judge_output.thread.is_empty() {
622                        thread_scores.push(judge_output.thread.passed_percentage());
623
624                        for assertion in &judge_output.thread.ran {
625                            assertions::display_table_row(
626                                &mut table_rows,
627                                example.repetition,
628                                assertion,
629                            )?;
630                        }
631                    }
632                }
633            }
634        }
635
636        let mut all_asserts = Vec::new();
637
638        if !table_rows.is_empty() {
639            assertions::print_table_header();
640            print!("{}", table_rows);
641
642            assertions::print_table_divider();
643
644            for (example, result) in results.iter() {
645                if let Ok((run_output, judge_output)) = result {
646                    let asserts = [
647                        run_output.programmatic_assertions.clone(),
648                        judge_output.diff.clone(),
649                        judge_output.thread.clone(),
650                    ];
651                    all_asserts.extend_from_slice(&asserts);
652                    assertions::print_table_round_summary(
653                        &example.repetition.to_string(),
654                        asserts.iter(),
655                    )
656                } else if let Err(err) = result {
657                    let assert = AssertionsReport::error(err.to_string());
658                    all_asserts.push(assert.clone());
659                    assertions::print_table_round_summary(
660                        &example.repetition.to_string(),
661                        [assert].iter(),
662                    )
663                }
664            }
665
666            assertions::print_table_divider();
667
668            assertions::print_table_round_summary("avg", all_asserts.iter());
669
670            assertions::print_table_footer();
671        }
672
673        if !example_cumulative_tool_metrics.is_empty() {
674            println!("{}", &example_cumulative_tool_metrics);
675        }
676    }
677
678    if results_by_example_name.len() > 1 {
679        print_h1("AGGREGATE");
680
681        if error_count > 0 {
682            println!("\n{error_count} examples failed to run!");
683        }
684
685        let programmatic_score_count = programmatic_scores.len();
686        if programmatic_score_count > 0 {
687            let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
688                / (programmatic_score_count as f32))
689                .floor();
690            println!("Average programmatic score: {average_programmatic_score}%");
691        }
692
693        let diff_score_count = diff_scores.len();
694        if diff_score_count > 0 {
695            let average_diff_score =
696                (diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
697            println!("Average diff score: {average_diff_score}%");
698        }
699
700        let thread_score_count = thread_scores.len();
701
702        if thread_score_count > 0 {
703            let average_thread_score =
704                (thread_scores.into_iter().sum::<f32>() / (thread_score_count as f32)).floor();
705            println!("Average thread score: {average_thread_score}%");
706        }
707
708        println!("");
709
710        print_h2("CUMULATIVE TOOL METRICS");
711        println!("{}", cumulative_tool_metrics);
712    }
713
714    let explorer_output_path = run_dir.join("overview.html");
715    let mut json_paths: Vec<PathBuf> = results_by_example_name
716        .values()
717        .flat_map(|results| {
718            results.iter().map(|(example, _)| {
719                let absolute_path = run_dir.join(example.run_directory.join("last.messages.json"));
720                let cwd = std::env::current_dir().expect("Can't get current dir");
721                pathdiff::diff_paths(&absolute_path, cwd).unwrap_or_else(|| absolute_path.clone())
722            })
723        })
724        .collect::<Vec<_>>();
725    json_paths.sort();
726    if let Err(err) = explorer::generate_explorer_html(&json_paths, &explorer_output_path) {
727        eprintln!("Failed to generate explorer HTML: {}", err);
728    }
729
730    Ok(())
731}