// eval.rs

  1mod assertions;
  2mod example;
  3mod examples;
  4mod explorer;
  5mod ids;
  6mod instance;
  7mod tool_metrics;
  8
  9use assertions::{AssertionsReport, display_error_row};
 10use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git};
 11use language_extension::LspAccess;
 12pub(crate) use tool_metrics::*;
 13
 14use ::fs::RealFs;
 15use clap::Parser;
 16use client::{Client, ProxySettings, UserStore};
 17use collections::{HashMap, HashSet};
 18use extension::ExtensionHostProxy;
 19use futures::future;
 20use gpui::http_client::read_proxy_from_env;
 21use gpui::{App, AppContext, Application, AsyncApp, Entity, UpdateGlobal};
 22use gpui_tokio::Tokio;
 23use language::LanguageRegistry;
 24use language_model::{ConfiguredModel, LanguageModel, LanguageModelRegistry, SelectedModel};
 25use node_runtime::{NodeBinaryOptions, NodeRuntime};
 26use project::project_settings::ProjectSettings;
 27use prompt_store::PromptBuilder;
 28use release_channel::{AppCommitSha, AppVersion};
 29use reqwest_client::ReqwestClient;
 30use settings::{Settings, SettingsStore};
 31use std::cell::RefCell;
 32use std::collections::VecDeque;
 33use std::env;
 34use std::path::{Path, PathBuf};
 35use std::rc::Rc;
 36use std::str::FromStr;
 37use std::sync::{Arc, LazyLock};
 38use util::ResultExt as _;
 39
/// Directory containing this crate's Cargo manifest.
///
/// The value comes from `env!("CARGO_MANIFEST_DIR")`, so it is fixed when the binary is
/// compiled; the `PathBuf` itself is built lazily on first access.
static CARGO_MANIFEST_DIR: LazyLock<PathBuf> =
    LazyLock::new(|| PathBuf::from(env!("CARGO_MANIFEST_DIR")));
 42
 43#[derive(Parser, Debug)]
 44#[command(name = "eval", disable_version_flag = true)]
 45struct Args {
 46    /// Runs all examples and threads that contain these substrings. If unspecified, all examples and threads are run.
 47    #[arg(value_name = "EXAMPLE_SUBSTRING")]
 48    filter: Vec<String>,
 49    /// provider/model to use for agent
 50    #[arg(long, default_value = "anthropic/claude-3-7-sonnet-latest")]
 51    model: String,
 52    /// provider/model to use for judges
 53    #[arg(long, default_value = "anthropic/claude-3-7-sonnet-latest")]
 54    judge_model: String,
 55    #[arg(long, value_delimiter = ',', default_value = "rs,ts,py")]
 56    languages: Vec<String>,
 57    /// How many times to run each example.
 58    #[arg(long, default_value = "8")]
 59    repetitions: usize,
 60    /// Maximum number of examples to run concurrently.
 61    #[arg(long, default_value = "4")]
 62    concurrency: usize,
 63    /// Output current environment variables as JSON to stdout
 64    #[arg(long, hide = true)]
 65    printenv: bool,
 66}
 67
 68fn main() {
 69    let args = Args::parse();
 70
 71    // This prevents errors showing up in the logs, because
 72    // project::environment::load_shell_environment() calls
 73    // std::env::current_exe().unwrap() --printenv
 74    if args.printenv {
 75        util::shell_env::print_env();
 76        return;
 77    }
 78
 79    dotenvy::from_filename(CARGO_MANIFEST_DIR.join(".env")).ok();
 80
 81    env_logger::init();
 82
 83    let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
 84    let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
 85    let session_id = uuid::Uuid::new_v4().to_string();
 86    let run_timestamp = chrono::Local::now().format("%Y-%m-%d_%H-%M-%S");
 87    let run_id = match env::var("GITHUB_RUN_ID") {
 88        Ok(run_id) => format!("github/{}", run_id),
 89        Err(_) => format!("local/{}", run_timestamp),
 90    };
 91
 92    let root_dir = Path::new(std::env!("CARGO_MANIFEST_DIR"))
 93        .parent()
 94        .unwrap()
 95        .parent()
 96        .unwrap()
 97        .canonicalize()
 98        .unwrap();
 99    let eval_crate_dir = root_dir.join("crates").join("eval");
100    let repos_dir = eval_crate_dir.join("repos");
101    let worktrees_dir = eval_crate_dir.join("worktrees");
102    let examples_dir = eval_crate_dir.join("src").join("examples");
103    let run_dir = eval_crate_dir
104        .join("runs")
105        .join(format!("{}", run_timestamp));
106    std::fs::create_dir_all(&run_dir).unwrap();
107    std::fs::create_dir_all(&repos_dir).unwrap();
108    std::fs::create_dir_all(&worktrees_dir).unwrap();
109    std::fs::create_dir_all(&examples_dir).unwrap();
110    std::fs::create_dir_all(&paths::config_dir()).unwrap();
111
112    let zed_commit_sha = commit_sha_for_path(&root_dir);
113    let zed_branch_name = git_branch_for_path(&root_dir);
114    let languages: HashSet<String> = args.languages.into_iter().collect();
115
116    let http_client = Arc::new(ReqwestClient::new());
117    let app = Application::headless().with_http_client(http_client);
118    let all_threads = examples::all(&examples_dir);
119
120    app.run(move |cx| {
121        let app_state = init(cx);
122
123        let telemetry = app_state.client.telemetry();
124        telemetry.start(system_id, installation_id, session_id, cx);
125
126        let enable_telemetry = env::var("ZED_EVAL_TELEMETRY").is_ok_and(|value| value == "1")
127            && telemetry.has_checksum_seed();
128        if enable_telemetry {
129            println!("Telemetry enabled");
130            telemetry::event!(
131                "Agent Eval Started",
132                zed_commit_sha = zed_commit_sha,
133                zed_branch_name = zed_branch_name,
134                run_id = run_id,
135            );
136        }
137
138        let mut cumulative_tool_metrics = ToolMetrics::default();
139
140        let tasks = LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
141            registry.providers().iter().map(|p| p.authenticate(cx)).collect::<Vec<_>>()
142        });
143
144        cx.spawn(async move |cx| {
145            future::join_all(tasks).await;
146            let judge_model = cx.update(|cx| {
147                let agent_model = load_model(&args.model, cx).unwrap();
148                let judge_model = load_model(&args.judge_model, cx).unwrap();
149                LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
150                    registry.set_default_model(Some(agent_model.clone()), cx);
151                });
152                judge_model
153            })?;
154
155            let mut examples = Vec::new();
156
157            const COLORS: [&str; 12] = [
158                "\x1b[31m", // Red
159                "\x1b[32m", // Green
160                "\x1b[33m", // Yellow
161                "\x1b[34m", // Blue
162                "\x1b[35m", // Magenta
163                "\x1b[36m", // Cyan
164                "\x1b[91m", // Bright Red
165                "\x1b[92m", // Bright Green
166                "\x1b[93m", // Bright Yellow
167                "\x1b[94m", // Bright Blue
168                "\x1b[95m", // Bright Magenta
169                "\x1b[96m", // Bright Cyan
170            ];
171
172            let mut skipped = Vec::new();
173
174            for thread in all_threads {
175                let meta = thread.meta();
176                if !args.filter.is_empty() && !args.filter.iter().any(|sub| meta.name.contains(sub))
177                {
178                    skipped.push(meta.name);
179                    continue;
180                }
181
182                if let Some(language) = meta.language_server
183                    && !languages.contains(&language.file_extension) {
184                        panic!(
185                            "Eval for {:?} could not be run because no language server was found for extension {:?}",
186                            meta.name,
187                            language.file_extension
188                        );
189                    }
190
191                // TODO: This creates a worktree per repetition. Ideally these examples should
192                // either be run sequentially on the same worktree, or reuse worktrees when there
193                // are more examples to run than the concurrency limit.
194                for repetition_number in 0..args.repetitions {
195                    let example_instance = ExampleInstance::new(
196                        thread.clone(),
197                        &repos_dir,
198                        &run_dir,
199                        &worktrees_dir,
200                        repetition_number,
201                    );
202
203                    examples.push(example_instance);
204                }
205            }
206
207            if !skipped.is_empty() {
208                println!("Skipped threads: {}", skipped.join(", "));
209            }
210
211            if examples.is_empty() {
212                eprintln!("Filter matched no examples");
213                return cx.update(|cx| cx.quit());
214            }
215
216            let mut repo_urls = HashSet::default();
217            let mut clone_tasks = Vec::new();
218
219            let max_name_width = examples
220                .iter()
221                .map(|e| e.worktree_name().len())
222                .max()
223                .unwrap_or(0);
224
225            for (i, example_instance) in examples.iter_mut().enumerate() {
226                let color = COLORS[i % COLORS.len()].to_string();
227                example_instance.set_log_prefix_style(&color, max_name_width);
228
229                println!(
230                    "{}Logging to: {}",
231                    example_instance.log_prefix,
232                    example_instance.run_directory.display()
233                );
234
235                let repo_url = example_instance.repo_url();
236                if repo_urls.insert(repo_url.clone()) {
237                    let repo_path = example_instance.repo_path.clone();
238
239                    if !repo_path.join(".git").is_dir() {
240                        println!(
241                            "{:<width$} < {}",
242                            "↓ Cloning",
243                            repo_url,
244                            width = max_name_width
245                        );
246
247                        let git_task = cx.spawn(async move |_cx| {
248                            std::fs::create_dir_all(&repo_path)?;
249                            run_git(&repo_path, &["init"]).await?;
250                            run_git(&repo_path, &["remote", "add", "origin", &repo_url]).await
251                        });
252
253                        clone_tasks.push(git_task);
254                    } else {
255                        println!(
256                            "{:<width$}  < {}",
257                            "✔︎ Already cloned",
258                            repo_url,
259                            width = max_name_width
260                        );
261
262                        let actual_origin =
263                            run_git(&repo_path, &["remote", "get-url", "origin"]).await?;
264                        anyhow::ensure!(
265                            actual_origin == repo_url,
266                            "remote origin {actual_origin} does not match expected origin {repo_url}"
267                        );
268                    }
269                }
270            }
271
272            future::join_all(clone_tasks).await;
273
274            for example_instance in examples.iter_mut() {
275                example_instance.fetch().await?;
276            }
277
278            let examples = Rc::new(RefCell::new(VecDeque::from(examples)));
279            let results_by_example_name = Rc::new(RefCell::new(HashMap::default()));
280
281            future::join_all((0..args.concurrency).map(|_| {
282                let app_state = app_state.clone();
283                let judge_model = judge_model.model.clone();
284                let zed_commit_sha = zed_commit_sha.clone();
285                let zed_branch_name = zed_branch_name.clone();
286                let run_id = run_id.clone();
287                let examples = examples.clone();
288                let results = results_by_example_name.clone();
289                cx.spawn(async move |cx| {
290                    loop {
291                        let Some(mut example) = examples.borrow_mut().pop_front() else {
292                            break;
293                        };
294                        let result = async {
295                            example.setup().await?;
296                            let run_output = cx
297                                .update(|cx| example.run(app_state.clone(), cx))?
298                                .await?;
299                            let judge_output = judge_example(
300                                example.clone(),
301                                judge_model.clone(),
302                                &zed_commit_sha,
303                                &zed_branch_name,
304                                &run_id,
305                                &run_output,
306                                enable_telemetry,
307                                cx,
308                            )
309                            .await;
310                            anyhow::Ok((run_output, judge_output))
311                        }
312                        .await;
313                        results
314                            .borrow_mut()
315                            .entry(example.name.clone())
316                            .or_insert(Vec::new())
317                            .push((example.clone(), result));
318                    }
319                })
320            }))
321            .await;
322
323            print_report(
324                &mut results_by_example_name.borrow_mut(),
325                &mut cumulative_tool_metrics,
326                &run_dir,
327            )?;
328
329            app_state.client.telemetry().flush_events().await;
330
331            cx.update(|cx| cx.quit())
332        })
333        .detach_and_log_err(cx);
334    });
335}
336
/// Subset of `workspace::AppState` needed by `HeadlessAssistant`, with additional fields.
///
/// A single instance is built by `init` and shared behind an `Arc`.
pub struct AgentAppState {
    /// Language registry; `init` points its language-server downloads at `paths::languages_dir()`.
    pub languages: Arc<LanguageRegistry>,
    /// Client created with `Client::production`; `main` uses it to obtain the telemetry handle.
    pub client: Arc<Client>,
    /// `UserStore` entity constructed from `client`.
    pub user_store: Entity<UserStore>,
    /// Filesystem implementation; `init` supplies a `RealFs`.
    pub fs: Arc<dyn fs::Fs>,
    /// Node runtime whose binary options are fed from the `node` project settings.
    pub node_runtime: NodeRuntime,

    // Additional fields not present in `workspace::AppState`.
    /// Prompt builder obtained from `PromptBuilder::load`.
    pub prompt_builder: Arc<PromptBuilder>,
}
348
/// Initializes the global application state needed to run evals and returns the shared
/// [`AgentAppState`].
///
/// Installs the settings store and HTTP client on `cx`, creates the client, filesystem,
/// language registry, user store and node runtime, initializes the extension, language,
/// language-model, prompt, terminal and agent subsystems, and finally applies the bundled
/// `runner_settings.json` as user settings.
///
/// The statements below are order-sensitive: later initializers read globals that earlier
/// ones install.
///
/// # Panics
///
/// Panics if the HTTP client cannot be constructed or if applying `runner_settings.json`
/// returns an error.
pub fn init(cx: &mut App) -> Arc<AgentAppState> {
    // `ZED_COMMIT_SHA` and `ZED_BUILD_ID` are optional at compile time; `ZED_PKG_VERSION`
    // is required (`env!` fails the build when it is missing).
    let app_commit_sha = option_env!("ZED_COMMIT_SHA").map(|s| AppCommitSha::new(s.to_owned()));

    let app_version = AppVersion::load(
        env!("ZED_PKG_VERSION"),
        option_env!("ZED_BUILD_ID"),
        app_commit_sha,
    );

    release_channel::init(app_version.clone(), cx);
    gpui_tokio::init(cx);

    let settings_store = SettingsStore::new(cx, &settings::default_settings());
    cx.set_global(settings_store);

    // Set User-Agent so we can download language servers from GitHub
    let user_agent = format!(
        "Zed Agent Eval/{} ({}; {})",
        app_version,
        std::env::consts::OS,
        std::env::consts::ARCH
    );
    // Prefer the proxy from settings; fall back to the environment when it is unset or
    // does not parse.
    let proxy_str = ProxySettings::get_global(cx).proxy.to_owned();
    let proxy_url = proxy_str
        .as_ref()
        .and_then(|input| input.parse().ok())
        .or_else(read_proxy_from_env);
    let http = {
        // The Tokio context is entered only for the duration of the client construction.
        // NOTE(review): presumably the reqwest-based client requires a runtime context
        // when it is built — confirm.
        let _guard = Tokio::handle(cx).enter();

        ReqwestClient::proxy_and_user_agent(proxy_url, &user_agent)
            .expect("could not start HTTP client")
    };
    cx.set_http_client(Arc::new(http));

    // `Client::production` is created after the HTTP client above is installed; the
    // app-wide HTTP client is then replaced with the one the client exposes.
    let client = Client::production(cx);
    cx.set_http_client(client.http_client());

    let git_binary_path = None;
    let fs = Arc::new(RealFs::new(
        git_binary_path,
        cx.background_executor().clone(),
    ));

    let mut languages = LanguageRegistry::new(cx.background_executor().clone());
    languages.set_language_server_download_dir(paths::languages_dir().clone());
    let languages = Arc::new(languages);

    let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

    extension::init(cx);

    // The node runtime receives its binary options through a watch channel. The observer
    // registered on the `SettingsStore` global recomputes and sends them.
    let (mut tx, rx) = watch::channel(None);
    cx.observe_global::<SettingsStore>(move |cx| {
        let settings = &ProjectSettings::get_global(cx).node;
        let options = NodeBinaryOptions {
            allow_path_lookup: !settings.ignore_system_version,
            allow_binary_download: true,
            // When a node path is configured, use it (with `~` expanded) together with
            // the configured npm path, or an `npm` sibling of the node binary otherwise.
            use_paths: settings.path.as_ref().map(|node_path| {
                let node_path = PathBuf::from(shellexpand::tilde(node_path).as_ref());
                let npm_path = settings
                    .npm_path
                    .as_ref()
                    .map(|path| PathBuf::from(shellexpand::tilde(&path).as_ref()));
                (
                    node_path.clone(),
                    npm_path.unwrap_or_else(|| {
                        let base_path = PathBuf::new();
                        node_path.parent().unwrap_or(&base_path).join("npm")
                    }),
                )
            }),
        };
        // A failed send is logged rather than propagated.
        tx.send(Some(options)).log_err();
    })
    .detach();
    let node_runtime = NodeRuntime::new(client.http_client(), None, rx);

    let extension_host_proxy = ExtensionHostProxy::global(cx);
    debug_adapter_extension::init(extension_host_proxy.clone(), cx);
    language_extension::init(LspAccess::Noop, extension_host_proxy, languages.clone());
    language_model::init(client.clone(), cx);
    language_models::init(user_store.clone(), client.clone(), cx);
    languages::init(languages.clone(), fs.clone(), node_runtime.clone(), cx);
    prompt_store::init(cx);
    terminal_view::init(cx);
    let stdout_is_a_pty = false;
    let prompt_builder = PromptBuilder::load(fs.clone(), stdout_is_a_pty, cx);
    // NOTE(review): the meaning of the bare `true` argument is not visible in this file —
    // check the `agent_ui::init` signature.
    agent_ui::init(
        fs.clone(),
        client.clone(),
        prompt_builder.clone(),
        languages.clone(),
        true,
        cx,
    );

    // `runner_settings.json` is embedded at compile time and applied as the user settings.
    SettingsStore::update_global(cx, |store, cx| {
        store.set_user_settings(include_str!("../runner_settings.json"), cx)
    })
    .unwrap();

    Arc::new(AgentAppState {
        languages,
        client,
        user_store,
        fs,
        node_runtime,
        prompt_builder,
    })
}
460
461pub fn find_model(
462    model_name: &str,
463    model_registry: &LanguageModelRegistry,
464    cx: &App,
465) -> anyhow::Result<Arc<dyn LanguageModel>> {
466    let selected = SelectedModel::from_str(model_name).map_err(|e| anyhow::anyhow!(e))?;
467    model_registry
468        .available_models(cx)
469        .find(|model| model.id() == selected.model && model.provider_id() == selected.provider)
470        .ok_or_else(|| {
471            anyhow::anyhow!(
472                "No language model with ID {}/{} was available. Available models: {}",
473                selected.provider.0,
474                selected.model.0,
475                model_registry
476                    .available_models(cx)
477                    .map(|model| format!("{}/{}", model.provider_id().0, model.id().0))
478                    .collect::<Vec<_>>()
479                    .join(", ")
480            )
481        })
482}
483
484pub fn load_model(model_name: &str, cx: &mut App) -> anyhow::Result<ConfiguredModel> {
485    let model = {
486        let model_registry = LanguageModelRegistry::read_global(cx);
487        find_model(model_name, model_registry, cx)?
488    };
489
490    let provider = {
491        let model_registry = LanguageModelRegistry::read_global(cx);
492        model_registry
493            .provider(&model.provider_id())
494            .ok_or_else(|| anyhow::anyhow!("Provider not found: {}", model.provider_id()))?
495    };
496
497    Ok(ConfiguredModel {
498        provider: provider.clone(),
499        model: model.clone(),
500    })
501}
502
503pub fn commit_sha_for_path(repo_path: &Path) -> String {
504    futures::executor::block_on(run_git(repo_path, &["rev-parse", "HEAD"])).unwrap()
505}
506
507pub fn git_branch_for_path(repo_path: &Path) -> String {
508    match std::env::var("GITHUB_REF_NAME") {
509        Ok(branch) => branch,
510        Err(_) => {
511            futures::executor::block_on(run_git(repo_path, &["rev-parse", "--abbrev-ref", "HEAD"]))
512                .unwrap_or_else(|_| "unknown".to_string())
513        }
514    }
515}
516
517async fn judge_example(
518    example: ExampleInstance,
519    model: Arc<dyn LanguageModel>,
520    zed_commit_sha: &str,
521    zed_branch_name: &str,
522    run_id: &str,
523    run_output: &RunOutput,
524    enable_telemetry: bool,
525    cx: &AsyncApp,
526) -> JudgeOutput {
527    let judge_output = example.judge(model.clone(), run_output, cx).await;
528
529    if enable_telemetry {
530        telemetry::event!(
531            "Agent Example Evaluated",
532            zed_commit_sha = zed_commit_sha,
533            zed_branch_name = zed_branch_name,
534            run_id = run_id,
535            example_name = example.name.clone(),
536            example_repetition = example.repetition,
537            diff_evaluation = judge_output.diff.clone(),
538            thread_evaluation = judge_output.thread,
539            tool_metrics = run_output.tool_metrics,
540            token_usage = run_output.token_usage,
541            model = model.telemetry_id(),
542            model_provider = model.provider_id().to_string(),
543            repository_url = example.repo_url(),
544            repository_revision = example.revision(),
545            diagnostic_summary_before = run_output.diagnostic_summary_before,
546            diagnostic_summary_after = run_output.diagnostic_summary_after,
547            diagnostics_before = run_output.diagnostics_before,
548            diagnostics_after = run_output.diagnostics_after,
549        );
550    }
551
552    judge_output
553}
554
/// Width, in characters, of the rules and centered titles printed by `print_h1` and `print_h2`.
const HEADER_WIDTH: usize = 65;
556
557fn print_h1(header: &str) {
558    println!("\n\n{:=^HEADER_WIDTH$}", "");
559    println!("{:^HEADER_WIDTH$}", header);
560    println!("{:=^HEADER_WIDTH$}\n", "");
561}
562
563fn print_h2(header: &str) {
564    println!("\n{:-^HEADER_WIDTH$}", "");
565    println!("{:^HEADER_WIDTH$}", header);
566    println!("{:-^HEADER_WIDTH$}\n", "");
567}
568
569fn print_report(
570    results_by_example_name: &mut HashMap<
571        String,
572        Vec<(ExampleInstance, anyhow::Result<(RunOutput, JudgeOutput)>)>,
573    >,
574    cumulative_tool_metrics: &mut ToolMetrics,
575    run_dir: &Path,
576) -> anyhow::Result<()> {
577    print_h1("EVAL RESULTS");
578
579    let mut diff_scores = Vec::new();
580    let mut thread_scores = Vec::new();
581    let mut programmatic_scores = Vec::new();
582    let mut error_count = 0;
583
584    for (example_name, results) in results_by_example_name.iter_mut() {
585        print_h2(example_name);
586
587        results.sort_unstable_by_key(|(example, _)| example.repetition);
588        let mut example_cumulative_tool_metrics = ToolMetrics::default();
589
590        let mut table_rows = String::new();
591
592        for (example, result) in results.iter() {
593            match result {
594                Err(err) => {
595                    display_error_row(&mut table_rows, example.repetition, err.to_string())?;
596                    error_count += 1;
597                    programmatic_scores.push(0.0);
598                    diff_scores.push(0.0);
599                    thread_scores.push(0.0);
600                }
601                Ok((run_output, judge_output)) => {
602                    cumulative_tool_metrics.merge(&run_output.tool_metrics);
603                    example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
604
605                    if run_output.programmatic_assertions.total_count() > 0 {
606                        for assertion in &run_output.programmatic_assertions.ran {
607                            assertions::display_table_row(
608                                &mut table_rows,
609                                example.repetition,
610                                assertion,
611                            )?;
612                        }
613
614                        programmatic_scores
615                            .push(run_output.programmatic_assertions.passed_percentage())
616                    }
617
618                    if !judge_output.diff.is_empty() {
619                        diff_scores.push(judge_output.diff.passed_percentage());
620
621                        for assertion in &judge_output.diff.ran {
622                            assertions::display_table_row(
623                                &mut table_rows,
624                                example.repetition,
625                                assertion,
626                            )?;
627                        }
628                    }
629
630                    if !judge_output.thread.is_empty() {
631                        thread_scores.push(judge_output.thread.passed_percentage());
632
633                        for assertion in &judge_output.thread.ran {
634                            assertions::display_table_row(
635                                &mut table_rows,
636                                example.repetition,
637                                assertion,
638                            )?;
639                        }
640                    }
641                }
642            }
643        }
644
645        let mut all_asserts = Vec::new();
646
647        if !table_rows.is_empty() {
648            assertions::print_table_header();
649            print!("{}", table_rows);
650
651            assertions::print_table_divider();
652
653            for (example, result) in results.iter() {
654                if let Ok((run_output, judge_output)) = result {
655                    let asserts = [
656                        run_output.programmatic_assertions.clone(),
657                        judge_output.diff.clone(),
658                        judge_output.thread.clone(),
659                    ];
660                    all_asserts.extend_from_slice(&asserts);
661                    assertions::print_table_round_summary(
662                        &example.repetition.to_string(),
663                        asserts.iter(),
664                    )
665                } else if let Err(err) = result {
666                    let assert = AssertionsReport::error(err.to_string());
667                    all_asserts.push(assert.clone());
668                    assertions::print_table_round_summary(
669                        &example.repetition.to_string(),
670                        [assert].iter(),
671                    )
672                }
673            }
674
675            assertions::print_table_divider();
676
677            assertions::print_table_round_summary("avg", all_asserts.iter());
678
679            assertions::print_table_footer();
680        }
681
682        if !example_cumulative_tool_metrics.is_empty() {
683            println!("{}", &example_cumulative_tool_metrics);
684        }
685    }
686
687    if results_by_example_name.len() > 1 {
688        print_h1("AGGREGATE");
689
690        if error_count > 0 {
691            println!("\n{error_count} examples failed to run!");
692        }
693
694        let programmatic_score_count = programmatic_scores.len();
695        if programmatic_score_count > 0 {
696            let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
697                / (programmatic_score_count as f32))
698                .floor();
699            println!("Average programmatic score: {average_programmatic_score}%");
700        }
701
702        let diff_score_count = diff_scores.len();
703        if diff_score_count > 0 {
704            let average_diff_score =
705                (diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
706            println!("Average diff score: {average_diff_score}%");
707        }
708
709        let thread_score_count = thread_scores.len();
710
711        if thread_score_count > 0 {
712            let average_thread_score =
713                (thread_scores.into_iter().sum::<f32>() / (thread_score_count as f32)).floor();
714            println!("Average thread score: {average_thread_score}%");
715        }
716
717        println!();
718
719        print_h2("CUMULATIVE TOOL METRICS");
720        println!("{}", cumulative_tool_metrics);
721    }
722
723    let explorer_output_path = run_dir.join("overview.html");
724    let mut json_paths: Vec<PathBuf> = results_by_example_name
725        .values()
726        .flat_map(|results| {
727            results.iter().map(|(example, _)| {
728                let absolute_path = run_dir.join(example.run_directory.join("last.messages.json"));
729                let cwd = std::env::current_dir().expect("Can't get current dir");
730                pathdiff::diff_paths(&absolute_path, cwd).unwrap_or_else(|| absolute_path.clone())
731            })
732        })
733        .collect::<Vec<_>>();
734    json_paths.sort();
735    if let Err(err) = explorer::generate_explorer_html(&json_paths, &explorer_output_path) {
736        eprintln!("Failed to generate explorer HTML: {}", err);
737    }
738
739    Ok(())
740}