eval.rs

  1mod assertions;
  2mod example;
  3mod examples;
  4mod explorer;
  5mod ids;
  6mod instance;
  7mod tool_metrics;
  8
  9use assertions::{AssertionsReport, display_error_row};
 10use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git};
 11use language_extension::LspAccess;
 12pub(crate) use tool_metrics::*;
 13
 14use ::fs::RealFs;
 15use clap::Parser;
 16use client::{Client, ProxySettings, UserStore};
 17use collections::{HashMap, HashSet};
 18use extension::ExtensionHostProxy;
 19use futures::future;
 20use gpui::http_client::read_proxy_from_env;
 21use gpui::{App, AppContext, Application, AsyncApp, Entity, UpdateGlobal};
 22use gpui_tokio::Tokio;
 23use language::LanguageRegistry;
 24use language_model::{ConfiguredModel, LanguageModel, LanguageModelRegistry, SelectedModel};
 25use node_runtime::{NodeBinaryOptions, NodeRuntime};
 26use project::Project;
 27use project::project_settings::ProjectSettings;
 28use prompt_store::PromptBuilder;
 29use release_channel::AppVersion;
 30use reqwest_client::ReqwestClient;
 31use settings::{Settings, SettingsStore};
 32use std::cell::RefCell;
 33use std::collections::VecDeque;
 34use std::env;
 35use std::path::{Path, PathBuf};
 36use std::rc::Rc;
 37use std::str::FromStr;
 38use std::sync::{Arc, LazyLock};
 39use util::ResultExt as _;
 40
/// Directory containing this crate's `Cargo.toml`, captured at compile time
/// from the `CARGO_MANIFEST_DIR` environment variable and built lazily on first use.
static CARGO_MANIFEST_DIR: LazyLock<PathBuf> =
    LazyLock::new(|| PathBuf::from(env!("CARGO_MANIFEST_DIR")));
 43
 44#[derive(Parser, Debug)]
 45#[command(name = "eval", disable_version_flag = true)]
 46struct Args {
 47    /// Runs all examples and threads that contain these substrings. If unspecified, all examples and threads are run.
 48    #[arg(value_name = "EXAMPLE_SUBSTRING")]
 49    filter: Vec<String>,
 50    /// provider/model to use for agent
 51    #[arg(long, default_value = "anthropic/claude-3-7-sonnet-latest")]
 52    model: String,
 53    /// provider/model to use for judges
 54    #[arg(long, default_value = "anthropic/claude-3-7-sonnet-latest")]
 55    judge_model: String,
 56    #[arg(long, value_delimiter = ',', default_value = "rs,ts,py")]
 57    languages: Vec<String>,
 58    /// How many times to run each example.
 59    #[arg(long, default_value = "8")]
 60    repetitions: usize,
 61    /// Maximum number of examples to run concurrently.
 62    #[arg(long, default_value = "4")]
 63    concurrency: usize,
 64}
 65
 66fn main() {
 67    dotenvy::from_filename(CARGO_MANIFEST_DIR.join(".env")).ok();
 68
 69    env_logger::init();
 70
 71    let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
 72    let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
 73    let session_id = uuid::Uuid::new_v4().to_string();
 74    let run_timestamp = chrono::Local::now().format("%Y-%m-%d_%H-%M-%S");
 75    let run_id = match env::var("GITHUB_RUN_ID") {
 76        Ok(run_id) => format!("github/{}", run_id),
 77        Err(_) => format!("local/{}", run_timestamp),
 78    };
 79
 80    let root_dir = Path::new(std::env!("CARGO_MANIFEST_DIR"))
 81        .parent()
 82        .unwrap()
 83        .parent()
 84        .unwrap()
 85        .canonicalize()
 86        .unwrap();
 87    let eval_crate_dir = root_dir.join("crates").join("eval");
 88    let repos_dir = eval_crate_dir.join("repos");
 89    let worktrees_dir = eval_crate_dir.join("worktrees");
 90    let examples_dir = eval_crate_dir.join("src").join("examples");
 91    let run_dir = eval_crate_dir
 92        .join("runs")
 93        .join(format!("{}", run_timestamp));
 94    std::fs::create_dir_all(&run_dir).unwrap();
 95    std::fs::create_dir_all(&repos_dir).unwrap();
 96    std::fs::create_dir_all(&worktrees_dir).unwrap();
 97    std::fs::create_dir_all(&examples_dir).unwrap();
 98    std::fs::create_dir_all(&paths::config_dir()).unwrap();
 99
100    let zed_commit_sha = commit_sha_for_path(&root_dir);
101    let zed_branch_name = git_branch_for_path(&root_dir);
102    let args = Args::parse();
103    let languages: HashSet<String> = args.languages.into_iter().collect();
104
105    let http_client = Arc::new(ReqwestClient::new());
106    let app = Application::headless().with_http_client(http_client.clone());
107    let all_threads = examples::all(&examples_dir);
108
109    app.run(move |cx| {
110        let app_state = init(cx);
111
112        let telemetry = app_state.client.telemetry();
113        telemetry.start(system_id, installation_id, session_id, cx);
114
115        let enable_telemetry = env::var("ZED_EVAL_TELEMETRY").map_or(false, |value| value == "1")
116            && telemetry.has_checksum_seed();
117        if enable_telemetry {
118            println!("Telemetry enabled");
119            telemetry::event!(
120                "Agent Eval Started",
121                zed_commit_sha = zed_commit_sha,
122                zed_branch_name = zed_branch_name,
123                run_id = run_id,
124            );
125        }
126
127        let mut cumulative_tool_metrics = ToolMetrics::default();
128
129        let agent_model = load_model(&args.model, cx).unwrap();
130        let judge_model = load_model(&args.judge_model, cx).unwrap();
131
132        LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
133            registry.set_default_model(Some(agent_model.clone()), cx);
134        });
135
136        let auth1 = agent_model.provider.authenticate(cx);
137        let auth2 = judge_model.provider.authenticate(cx);
138
139        cx.spawn(async move |cx| {
140            auth1.await?;
141            auth2.await?;
142
143            let mut examples = Vec::new();
144
145            const COLORS: [&str; 12] = [
146                "\x1b[31m", // Red
147                "\x1b[32m", // Green
148                "\x1b[33m", // Yellow
149                "\x1b[34m", // Blue
150                "\x1b[35m", // Magenta
151                "\x1b[36m", // Cyan
152                "\x1b[91m", // Bright Red
153                "\x1b[92m", // Bright Green
154                "\x1b[93m", // Bright Yellow
155                "\x1b[94m", // Bright Blue
156                "\x1b[95m", // Bright Magenta
157                "\x1b[96m", // Bright Cyan
158            ];
159
160            let mut skipped = Vec::new();
161
162            for thread in all_threads {
163                let meta = thread.meta();
164                if !args.filter.is_empty() && !args.filter.iter().any(|sub| meta.name.contains(sub))
165                {
166                    skipped.push(meta.name);
167                    continue;
168                }
169
170                if let Some(language) = meta.language_server {
171                    if !languages.contains(&language.file_extension) {
172                        panic!(
173                            "Eval for {:?} could not be run because no language server was found for extension {:?}",
174                            meta.name,
175                            language.file_extension
176                        );
177                    }
178                }
179
180                // TODO: This creates a worktree per repetition. Ideally these examples should
181                // either be run sequentially on the same worktree, or reuse worktrees when there
182                // are more examples to run than the concurrency limit.
183                for repetition_number in 0..args.repetitions {
184                    let example_instance = ExampleInstance::new(
185                        thread.clone(),
186                        &repos_dir,
187                        &run_dir,
188                        &worktrees_dir,
189                        repetition_number,
190                    );
191
192                    examples.push(example_instance);
193                }
194            }
195
196            if !skipped.is_empty() {
197                println!("Skipped threads: {}", skipped.join(", "));
198            }
199
200            if examples.is_empty() {
201                eprintln!("Filter matched no examples");
202                return cx.update(|cx| cx.quit());
203            }
204
205            let mut repo_urls = HashSet::default();
206            let mut clone_tasks = Vec::new();
207
208            let max_name_width = examples
209                .iter()
210                .map(|e| e.worktree_name().len())
211                .max()
212                .unwrap_or(0);
213
214            for (i, example_instance) in examples.iter_mut().enumerate() {
215                let color = COLORS[i % COLORS.len()].to_string();
216                example_instance.set_log_prefix_style(&color, max_name_width);
217
218                println!(
219                    "{}Logging to: {}",
220                    example_instance.log_prefix,
221                    example_instance.run_directory.display()
222                );
223
224                let repo_url = example_instance.repo_url();
225                if repo_urls.insert(repo_url.clone()) {
226                    let repo_path = example_instance.repo_path.clone();
227
228                    if !repo_path.join(".git").is_dir() {
229                        println!(
230                            "{:<width$} < {}",
231                            "↓ Cloning",
232                            repo_url,
233                            width = max_name_width
234                        );
235
236                        let git_task = cx.spawn(async move |_cx| {
237                            std::fs::create_dir_all(&repo_path)?;
238                            run_git(&repo_path, &["init"]).await?;
239                            run_git(&repo_path, &["remote", "add", "origin", &repo_url]).await
240                        });
241
242                        clone_tasks.push(git_task);
243                    } else {
244                        println!(
245                            "{:<width$}  < {}",
246                            "✔︎ Already cloned",
247                            repo_url,
248                            width = max_name_width
249                        );
250
251                        let actual_origin =
252                            run_git(&repo_path, &["remote", "get-url", "origin"]).await?;
253                        anyhow::ensure!(
254                            actual_origin == repo_url,
255                            "remote origin {actual_origin} does not match expected origin {repo_url}"
256                        );
257                    }
258                }
259            }
260
261            future::join_all(clone_tasks).await;
262
263            for example_instance in examples.iter_mut() {
264                example_instance.fetch().await?;
265            }
266
267            let examples = Rc::new(RefCell::new(VecDeque::from(examples)));
268            let results_by_example_name = Rc::new(RefCell::new(HashMap::default()));
269
270            future::join_all((0..args.concurrency).map(|_| {
271                let app_state = app_state.clone();
272                let model = agent_model.model.clone();
273                let judge_model = judge_model.model.clone();
274                let zed_commit_sha = zed_commit_sha.clone();
275                let zed_branch_name = zed_branch_name.clone();
276                let run_id = run_id.clone();
277                let examples = examples.clone();
278                let results = results_by_example_name.clone();
279                cx.spawn(async move |cx| {
280                    loop {
281                        let Some(mut example) = examples.borrow_mut().pop_front() else {
282                            break;
283                        };
284                        let result = async {
285                            example.setup().await?;
286                            let run_output = cx
287                                .update(|cx| example.run(model.clone(), app_state.clone(), cx))?
288                                .await?;
289                            let judge_output = judge_example(
290                                example.clone(),
291                                judge_model.clone(),
292                                &zed_commit_sha,
293                                &zed_branch_name,
294                                &run_id,
295                                &run_output,
296                                enable_telemetry,
297                                cx,
298                            )
299                            .await;
300                            anyhow::Ok((run_output, judge_output))
301                        }
302                        .await;
303                        results
304                            .borrow_mut()
305                            .entry(example.name.clone())
306                            .or_insert(Vec::new())
307                            .push((example.clone(), result));
308                    }
309                })
310            }))
311            .await;
312
313            print_report(
314                &mut results_by_example_name.borrow_mut(),
315                &mut cumulative_tool_metrics,
316                &run_dir,
317            )?;
318
319            app_state.client.telemetry().flush_events().await;
320
321            cx.update(|cx| cx.quit())
322        })
323        .detach_and_log_err(cx);
324    });
325}
326
/// Subset of `workspace::AppState` needed by `HeadlessAssistant`, with additional fields.
///
/// Constructed once by `init` and handed out behind an `Arc`.
pub struct AgentAppState {
    /// Language registry; `init` points its language-server download directory at
    /// `paths::languages_dir()`.
    pub languages: Arc<LanguageRegistry>,
    /// Client created with `Client::production`; `main` obtains the telemetry handle from it.
    pub client: Arc<Client>,
    /// User store entity built from `client`.
    pub user_store: Entity<UserStore>,
    /// Filesystem handle; `init` supplies a `RealFs`.
    pub fs: Arc<dyn fs::Fs>,
    /// Node runtime whose binary options follow the `node` section of the project settings.
    pub node_runtime: NodeRuntime,

    // Additional fields not present in `workspace::AppState`.
    /// Prompt builder obtained from `PromptBuilder::load` in `init`.
    pub prompt_builder: Arc<PromptBuilder>,
}
338
/// Initializes the global application state needed to run agent evals headlessly and
/// returns the handles the runner keeps using afterwards.
///
/// In order, this sets up the release channel and Tokio integration, installs the default
/// settings, configures the HTTP client (proxy and User-Agent), creates the client,
/// filesystem, language registry, user store and Node runtime, initializes the language,
/// extension, language-model, prompt, terminal, agent UI and assistant-tool subsystems,
/// and finally applies the bundled `runner_settings.json` as user settings.
///
/// # Panics
///
/// Panics if the default settings or the bundled runner settings cannot be applied, or if
/// the HTTP client cannot be started.
pub fn init(cx: &mut App) -> Arc<AgentAppState> {
    let app_version = AppVersion::load(env!("ZED_PKG_VERSION"));
    release_channel::init(app_version, cx);
    gpui_tokio::init(cx);

    // The settings store must be installed as a global before any `*::init_settings` call.
    let mut settings_store = SettingsStore::new(cx);
    settings_store
        .set_default_settings(settings::default_settings().as_ref(), cx)
        .unwrap();
    cx.set_global(settings_store);
    client::init_settings(cx);

    // Set User-Agent so we can download language servers from GitHub
    let user_agent = format!(
        "Zed Agent Eval/{} ({}; {})",
        app_version,
        std::env::consts::OS,
        std::env::consts::ARCH
    );
    // Prefer the proxy from settings (if it parses); otherwise fall back to the environment.
    let proxy_str = ProxySettings::get_global(cx).proxy.to_owned();
    let proxy_url = proxy_str
        .as_ref()
        .and_then(|input| input.parse().ok())
        .or_else(read_proxy_from_env);
    let http = {
        // The Tokio runtime context is entered only while the client is constructed
        // (presumably the constructor requires it; confirm against `ReqwestClient`).
        let _guard = Tokio::handle(cx).enter();

        ReqwestClient::proxy_and_user_agent(proxy_url, &user_agent)
            .expect("could not start HTTP client")
    };
    cx.set_http_client(Arc::new(http));

    Project::init_settings(cx);

    let client = Client::production(cx);
    // From here on the app-wide HTTP client is the one exposed by `client`.
    cx.set_http_client(client.http_client());

    let git_binary_path = None;
    let fs = Arc::new(RealFs::new(
        git_binary_path,
        cx.background_executor().clone(),
    ));

    let mut languages = LanguageRegistry::new(cx.background_executor().clone());
    languages.set_language_server_download_dir(paths::languages_dir().clone());
    let languages = Arc::new(languages);

    let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

    extension::init(cx);

    // Node binary options are pushed through a watch channel from the `SettingsStore`
    // observer below, so the runtime sees the current `node` project settings.
    let (mut tx, rx) = watch::channel(None);
    cx.observe_global::<SettingsStore>(move |cx| {
        let settings = &ProjectSettings::get_global(cx).node;
        let options = NodeBinaryOptions {
            allow_path_lookup: !settings.ignore_system_version,
            allow_binary_download: true,
            use_paths: settings.path.as_ref().map(|node_path| {
                let node_path = PathBuf::from(shellexpand::tilde(node_path).as_ref());
                let npm_path = settings
                    .npm_path
                    .as_ref()
                    .map(|path| PathBuf::from(shellexpand::tilde(&path).as_ref()));
                (
                    node_path.clone(),
                    // Without an explicit npm path, use `npm` in the node binary's directory.
                    npm_path.unwrap_or_else(|| {
                        let base_path = PathBuf::new();
                        node_path.parent().unwrap_or(&base_path).join("npm")
                    }),
                )
            }),
        };
        // A failed send is logged rather than propagated.
        tx.send(Some(options)).log_err();
    })
    .detach();
    let node_runtime = NodeRuntime::new(client.http_client(), None, rx);

    let extension_host_proxy = ExtensionHostProxy::global(cx);

    language::init(cx);
    debug_adapter_extension::init(extension_host_proxy.clone(), cx);
    language_extension::init(
        LspAccess::Noop,
        extension_host_proxy.clone(),
        languages.clone(),
    );
    language_model::init(client.clone(), cx);
    language_models::init(user_store.clone(), client.clone(), cx);
    languages::init(languages.clone(), node_runtime.clone(), cx);
    prompt_store::init(cx);
    terminal_view::init(cx);
    let stdout_is_a_pty = false;
    let prompt_builder = PromptBuilder::load(fs.clone(), stdout_is_a_pty, cx);
    agent_ui::init(
        fs.clone(),
        client.clone(),
        prompt_builder.clone(),
        languages.clone(),
        // NOTE(review): the meaning of this boolean is not visible in this file; confirm
        // against `agent_ui::init`.
        true,
        cx,
    );
    assistant_tools::init(client.http_client(), cx);

    // Apply the settings file bundled with the eval crate as the user settings.
    SettingsStore::update_global(cx, |store, cx| {
        store.set_user_settings(include_str!("../runner_settings.json"), cx)
    })
    .unwrap();

    Arc::new(AgentAppState {
        languages,
        client,
        user_store,
        fs,
        node_runtime,
        prompt_builder,
    })
}
456
457pub fn find_model(
458    model_name: &str,
459    model_registry: &LanguageModelRegistry,
460    cx: &App,
461) -> anyhow::Result<Arc<dyn LanguageModel>> {
462    let selected = SelectedModel::from_str(model_name).map_err(|e| anyhow::anyhow!(e))?;
463    model_registry
464        .available_models(cx)
465        .find(|model| model.id() == selected.model && model.provider_id() == selected.provider)
466        .ok_or_else(|| {
467            anyhow::anyhow!(
468                "No language model with ID {}/{} was available. Available models: {}",
469                selected.model.0,
470                selected.provider.0,
471                model_registry
472                    .available_models(cx)
473                    .map(|model| format!("{}/{}", model.provider_id().0, model.id().0))
474                    .collect::<Vec<_>>()
475                    .join(", ")
476            )
477        })
478}
479
480pub fn load_model(model_name: &str, cx: &mut App) -> anyhow::Result<ConfiguredModel> {
481    let model = {
482        let model_registry = LanguageModelRegistry::read_global(cx);
483        find_model(model_name, model_registry, cx)?
484    };
485
486    let provider = {
487        let model_registry = LanguageModelRegistry::read_global(cx);
488        model_registry
489            .provider(&model.provider_id())
490            .ok_or_else(|| anyhow::anyhow!("Provider not found: {}", model.provider_id()))?
491    };
492
493    Ok(ConfiguredModel {
494        provider: provider.clone(),
495        model: model.clone(),
496    })
497}
498
499pub fn commit_sha_for_path(repo_path: &Path) -> String {
500    futures::executor::block_on(run_git(repo_path, &["rev-parse", "HEAD"])).unwrap()
501}
502
503pub fn git_branch_for_path(repo_path: &Path) -> String {
504    match std::env::var("GITHUB_REF_NAME") {
505        Ok(branch) => branch,
506        Err(_) => {
507            futures::executor::block_on(run_git(repo_path, &["rev-parse", "--abbrev-ref", "HEAD"]))
508                .unwrap_or_else(|_| "unknown".to_string())
509        }
510    }
511}
512
513async fn judge_example(
514    example: ExampleInstance,
515    model: Arc<dyn LanguageModel>,
516    zed_commit_sha: &str,
517    zed_branch_name: &str,
518    run_id: &str,
519    run_output: &RunOutput,
520    enable_telemetry: bool,
521    cx: &AsyncApp,
522) -> JudgeOutput {
523    let judge_output = example.judge(model.clone(), run_output, cx).await;
524
525    if enable_telemetry {
526        telemetry::event!(
527            "Agent Example Evaluated",
528            zed_commit_sha = zed_commit_sha,
529            zed_branch_name = zed_branch_name,
530            run_id = run_id,
531            example_name = example.name.clone(),
532            example_repetition = example.repetition,
533            diff_evaluation = judge_output.diff.clone(),
534            thread_evaluation = judge_output.thread.clone(),
535            tool_metrics = run_output.tool_metrics,
536            response_count = run_output.response_count,
537            token_usage = run_output.token_usage,
538            model = model.telemetry_id(),
539            model_provider = model.provider_id().to_string(),
540            repository_url = example.repo_url(),
541            repository_revision = example.revision(),
542            diagnostic_summary_before = run_output.diagnostic_summary_before,
543            diagnostic_summary_after = run_output.diagnostic_summary_after,
544            diagnostics_before = run_output.diagnostics_before,
545            diagnostics_after = run_output.diagnostics_after,
546        );
547    }
548
549    judge_output
550}
551
/// Width, in characters, of the rules and centered titles printed by `print_h1` and `print_h2`.
const HEADER_WIDTH: usize = 65;
553
554fn print_h1(header: &str) {
555    println!("\n\n{:=^HEADER_WIDTH$}", "");
556    println!("{:^HEADER_WIDTH$}", header);
557    println!("{:=^HEADER_WIDTH$}\n", "");
558}
559
560fn print_h2(header: &str) {
561    println!("\n{:-^HEADER_WIDTH$}", "");
562    println!("{:^HEADER_WIDTH$}", header);
563    println!("{:-^HEADER_WIDTH$}\n", "");
564}
565
566fn print_report(
567    results_by_example_name: &mut HashMap<
568        String,
569        Vec<(ExampleInstance, anyhow::Result<(RunOutput, JudgeOutput)>)>,
570    >,
571    cumulative_tool_metrics: &mut ToolMetrics,
572    run_dir: &Path,
573) -> anyhow::Result<()> {
574    print_h1("EVAL RESULTS");
575
576    let mut diff_scores = Vec::new();
577    let mut thread_scores = Vec::new();
578    let mut programmatic_scores = Vec::new();
579    let mut error_count = 0;
580
581    for (example_name, results) in results_by_example_name.iter_mut() {
582        print_h2(example_name);
583
584        results.sort_unstable_by_key(|(example, _)| example.repetition);
585        let mut example_cumulative_tool_metrics = ToolMetrics::default();
586
587        let mut table_rows = String::new();
588
589        for (example, result) in results.iter() {
590            match result {
591                Err(err) => {
592                    display_error_row(&mut table_rows, example.repetition, err.to_string())?;
593                    error_count += 1;
594                    programmatic_scores.push(0.0);
595                    diff_scores.push(0.0);
596                    thread_scores.push(0.0);
597                }
598                Ok((run_output, judge_output)) => {
599                    cumulative_tool_metrics.merge(&run_output.tool_metrics);
600                    example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
601
602                    if run_output.programmatic_assertions.total_count() > 0 {
603                        for assertion in &run_output.programmatic_assertions.ran {
604                            assertions::display_table_row(
605                                &mut table_rows,
606                                example.repetition,
607                                assertion,
608                            )?;
609                        }
610
611                        programmatic_scores
612                            .push(run_output.programmatic_assertions.passed_percentage())
613                    }
614
615                    if !judge_output.diff.is_empty() {
616                        diff_scores.push(judge_output.diff.passed_percentage());
617
618                        for assertion in &judge_output.diff.ran {
619                            assertions::display_table_row(
620                                &mut table_rows,
621                                example.repetition,
622                                assertion,
623                            )?;
624                        }
625                    }
626
627                    if !judge_output.thread.is_empty() {
628                        thread_scores.push(judge_output.thread.passed_percentage());
629
630                        for assertion in &judge_output.thread.ran {
631                            assertions::display_table_row(
632                                &mut table_rows,
633                                example.repetition,
634                                assertion,
635                            )?;
636                        }
637                    }
638                }
639            }
640        }
641
642        let mut all_asserts = Vec::new();
643
644        if !table_rows.is_empty() {
645            assertions::print_table_header();
646            print!("{}", table_rows);
647
648            assertions::print_table_divider();
649
650            for (example, result) in results.iter() {
651                if let Ok((run_output, judge_output)) = result {
652                    let asserts = [
653                        run_output.programmatic_assertions.clone(),
654                        judge_output.diff.clone(),
655                        judge_output.thread.clone(),
656                    ];
657                    all_asserts.extend_from_slice(&asserts);
658                    assertions::print_table_round_summary(
659                        &example.repetition.to_string(),
660                        asserts.iter(),
661                    )
662                } else if let Err(err) = result {
663                    let assert = AssertionsReport::error(err.to_string());
664                    all_asserts.push(assert.clone());
665                    assertions::print_table_round_summary(
666                        &example.repetition.to_string(),
667                        [assert].iter(),
668                    )
669                }
670            }
671
672            assertions::print_table_divider();
673
674            assertions::print_table_round_summary("avg", all_asserts.iter());
675
676            assertions::print_table_footer();
677        }
678
679        if !example_cumulative_tool_metrics.is_empty() {
680            println!("{}", &example_cumulative_tool_metrics);
681        }
682    }
683
684    if results_by_example_name.len() > 1 {
685        print_h1("AGGREGATE");
686
687        if error_count > 0 {
688            println!("\n{error_count} examples failed to run!");
689        }
690
691        let programmatic_score_count = programmatic_scores.len();
692        if programmatic_score_count > 0 {
693            let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
694                / (programmatic_score_count as f32))
695                .floor();
696            println!("Average programmatic score: {average_programmatic_score}%");
697        }
698
699        let diff_score_count = diff_scores.len();
700        if diff_score_count > 0 {
701            let average_diff_score =
702                (diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
703            println!("Average diff score: {average_diff_score}%");
704        }
705
706        let thread_score_count = thread_scores.len();
707
708        if thread_score_count > 0 {
709            let average_thread_score =
710                (thread_scores.into_iter().sum::<f32>() / (thread_score_count as f32)).floor();
711            println!("Average thread score: {average_thread_score}%");
712        }
713
714        println!("");
715
716        print_h2("CUMULATIVE TOOL METRICS");
717        println!("{}", cumulative_tool_metrics);
718    }
719
720    let explorer_output_path = run_dir.join("overview.html");
721    let mut json_paths: Vec<PathBuf> = results_by_example_name
722        .values()
723        .flat_map(|results| {
724            results.iter().map(|(example, _)| {
725                let absolute_path = run_dir.join(example.run_directory.join("last.messages.json"));
726                let cwd = std::env::current_dir().expect("Can't get current dir");
727                pathdiff::diff_paths(&absolute_path, cwd).unwrap_or_else(|| absolute_path.clone())
728            })
729        })
730        .collect::<Vec<_>>();
731    json_paths.sort();
732    if let Err(err) = explorer::generate_explorer_html(&json_paths, &explorer_output_path) {
733        eprintln!("Failed to generate explorer HTML: {}", err);
734    }
735
736    Ok(())
737}