eval.rs

  1mod assertions;
  2mod example;
  3mod examples;
  4mod explorer;
  5mod ids;
  6mod instance;
  7mod tool_metrics;
  8
  9use assertions::{AssertionsReport, display_error_row};
 10use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git};
 11use language_extension::LspAccess;
 12pub(crate) use tool_metrics::*;
 13
 14use ::fs::RealFs;
 15use clap::Parser;
 16use client::{Client, ProxySettings, UserStore};
 17use collections::{HashMap, HashSet};
 18use extension::ExtensionHostProxy;
 19use futures::future;
 20use gpui::http_client::read_proxy_from_env;
 21use gpui::{App, AppContext, Application, AsyncApp, Entity, UpdateGlobal};
 22use gpui_tokio::Tokio;
 23use language::LanguageRegistry;
 24use language_model::{ConfiguredModel, LanguageModel, LanguageModelRegistry, SelectedModel};
 25use node_runtime::{NodeBinaryOptions, NodeRuntime};
 26use project::Project;
 27use project::project_settings::ProjectSettings;
 28use prompt_store::PromptBuilder;
 29use release_channel::AppVersion;
 30use reqwest_client::ReqwestClient;
 31use settings::{Settings, SettingsStore};
 32use std::cell::RefCell;
 33use std::collections::VecDeque;
 34use std::env;
 35use std::path::{Path, PathBuf};
 36use std::rc::Rc;
 37use std::str::FromStr;
 38use std::sync::{Arc, LazyLock};
 39use util::ResultExt as _;
 40
/// Directory containing this crate's `Cargo.toml`, taken from the compile-time
/// `CARGO_MANIFEST_DIR` environment variable and converted to a `PathBuf` on first access.
static CARGO_MANIFEST_DIR: LazyLock<PathBuf> =
    LazyLock::new(|| PathBuf::from(env!("CARGO_MANIFEST_DIR")));
 43
 44#[derive(Parser, Debug)]
 45#[command(name = "eval", disable_version_flag = true)]
 46struct Args {
 47    /// Runs all examples and threads that contain these substrings. If unspecified, all examples and threads are run.
 48    #[arg(value_name = "EXAMPLE_SUBSTRING")]
 49    filter: Vec<String>,
 50    /// provider/model to use for agent
 51    #[arg(long, default_value = "anthropic/claude-3-7-sonnet-latest")]
 52    model: String,
 53    /// provider/model to use for judges
 54    #[arg(long, default_value = "anthropic/claude-3-7-sonnet-latest")]
 55    judge_model: String,
 56    #[arg(long, value_delimiter = ',', default_value = "rs,ts,py")]
 57    languages: Vec<String>,
 58    /// How many times to run each example.
 59    #[arg(long, default_value = "8")]
 60    repetitions: usize,
 61    /// Maximum number of examples to run concurrently.
 62    #[arg(long, default_value = "4")]
 63    concurrency: usize,
 64    /// Output current environment variables as JSON to stdout
 65    #[arg(long, hide = true)]
 66    printenv: bool,
 67}
 68
 69fn main() {
 70    let args = Args::parse();
 71
 72    // This prevents errors showing up in the logs, because
 73    // project::environment::load_shell_environment() calls
 74    // std::env::current_exe().unwrap() --printenv
 75    if args.printenv {
 76        util::shell_env::print_env();
 77        return;
 78    }
 79
 80    dotenvy::from_filename(CARGO_MANIFEST_DIR.join(".env")).ok();
 81
 82    env_logger::init();
 83
 84    let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
 85    let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
 86    let session_id = uuid::Uuid::new_v4().to_string();
 87    let run_timestamp = chrono::Local::now().format("%Y-%m-%d_%H-%M-%S");
 88    let run_id = match env::var("GITHUB_RUN_ID") {
 89        Ok(run_id) => format!("github/{}", run_id),
 90        Err(_) => format!("local/{}", run_timestamp),
 91    };
 92
 93    let root_dir = Path::new(std::env!("CARGO_MANIFEST_DIR"))
 94        .parent()
 95        .unwrap()
 96        .parent()
 97        .unwrap()
 98        .canonicalize()
 99        .unwrap();
100    let eval_crate_dir = root_dir.join("crates").join("eval");
101    let repos_dir = eval_crate_dir.join("repos");
102    let worktrees_dir = eval_crate_dir.join("worktrees");
103    let examples_dir = eval_crate_dir.join("src").join("examples");
104    let run_dir = eval_crate_dir
105        .join("runs")
106        .join(format!("{}", run_timestamp));
107    std::fs::create_dir_all(&run_dir).unwrap();
108    std::fs::create_dir_all(&repos_dir).unwrap();
109    std::fs::create_dir_all(&worktrees_dir).unwrap();
110    std::fs::create_dir_all(&examples_dir).unwrap();
111    std::fs::create_dir_all(&paths::config_dir()).unwrap();
112
113    let zed_commit_sha = commit_sha_for_path(&root_dir);
114    let zed_branch_name = git_branch_for_path(&root_dir);
115    let languages: HashSet<String> = args.languages.into_iter().collect();
116
117    let http_client = Arc::new(ReqwestClient::new());
118    let app = Application::headless().with_http_client(http_client);
119    let all_threads = examples::all(&examples_dir);
120
121    app.run(move |cx| {
122        let app_state = init(cx);
123
124        let telemetry = app_state.client.telemetry();
125        telemetry.start(system_id, installation_id, session_id, cx);
126
127        let enable_telemetry = env::var("ZED_EVAL_TELEMETRY").is_ok_and(|value| value == "1")
128            && telemetry.has_checksum_seed();
129        if enable_telemetry {
130            println!("Telemetry enabled");
131            telemetry::event!(
132                "Agent Eval Started",
133                zed_commit_sha = zed_commit_sha,
134                zed_branch_name = zed_branch_name,
135                run_id = run_id,
136            );
137        }
138
139        let mut cumulative_tool_metrics = ToolMetrics::default();
140
141        let tasks = LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
142            registry.providers().iter().map(|p| p.authenticate(cx)).collect::<Vec<_>>()
143        });
144
145        cx.spawn(async move |cx| {
146            future::join_all(tasks).await;
147            let judge_model = cx.update(|cx| {
148                let agent_model = load_model(&args.model, cx).unwrap();
149                let judge_model = load_model(&args.judge_model, cx).unwrap();
150                LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
151                    registry.set_default_model(Some(agent_model.clone()), cx);
152                });
153                judge_model
154            })?;
155
156            let mut examples = Vec::new();
157
158            const COLORS: [&str; 12] = [
159                "\x1b[31m", // Red
160                "\x1b[32m", // Green
161                "\x1b[33m", // Yellow
162                "\x1b[34m", // Blue
163                "\x1b[35m", // Magenta
164                "\x1b[36m", // Cyan
165                "\x1b[91m", // Bright Red
166                "\x1b[92m", // Bright Green
167                "\x1b[93m", // Bright Yellow
168                "\x1b[94m", // Bright Blue
169                "\x1b[95m", // Bright Magenta
170                "\x1b[96m", // Bright Cyan
171            ];
172
173            let mut skipped = Vec::new();
174
175            for thread in all_threads {
176                let meta = thread.meta();
177                if !args.filter.is_empty() && !args.filter.iter().any(|sub| meta.name.contains(sub))
178                {
179                    skipped.push(meta.name);
180                    continue;
181                }
182
183                if let Some(language) = meta.language_server
184                    && !languages.contains(&language.file_extension) {
185                        panic!(
186                            "Eval for {:?} could not be run because no language server was found for extension {:?}",
187                            meta.name,
188                            language.file_extension
189                        );
190                    }
191
192                // TODO: This creates a worktree per repetition. Ideally these examples should
193                // either be run sequentially on the same worktree, or reuse worktrees when there
194                // are more examples to run than the concurrency limit.
195                for repetition_number in 0..args.repetitions {
196                    let example_instance = ExampleInstance::new(
197                        thread.clone(),
198                        &repos_dir,
199                        &run_dir,
200                        &worktrees_dir,
201                        repetition_number,
202                    );
203
204                    examples.push(example_instance);
205                }
206            }
207
208            if !skipped.is_empty() {
209                println!("Skipped threads: {}", skipped.join(", "));
210            }
211
212            if examples.is_empty() {
213                eprintln!("Filter matched no examples");
214                return cx.update(|cx| cx.quit());
215            }
216
217            let mut repo_urls = HashSet::default();
218            let mut clone_tasks = Vec::new();
219
220            let max_name_width = examples
221                .iter()
222                .map(|e| e.worktree_name().len())
223                .max()
224                .unwrap_or(0);
225
226            for (i, example_instance) in examples.iter_mut().enumerate() {
227                let color = COLORS[i % COLORS.len()].to_string();
228                example_instance.set_log_prefix_style(&color, max_name_width);
229
230                println!(
231                    "{}Logging to: {}",
232                    example_instance.log_prefix,
233                    example_instance.run_directory.display()
234                );
235
236                let repo_url = example_instance.repo_url();
237                if repo_urls.insert(repo_url.clone()) {
238                    let repo_path = example_instance.repo_path.clone();
239
240                    if !repo_path.join(".git").is_dir() {
241                        println!(
242                            "{:<width$} < {}",
243                            "↓ Cloning",
244                            repo_url,
245                            width = max_name_width
246                        );
247
248                        let git_task = cx.spawn(async move |_cx| {
249                            std::fs::create_dir_all(&repo_path)?;
250                            run_git(&repo_path, &["init"]).await?;
251                            run_git(&repo_path, &["remote", "add", "origin", &repo_url]).await
252                        });
253
254                        clone_tasks.push(git_task);
255                    } else {
256                        println!(
257                            "{:<width$}  < {}",
258                            "✔︎ Already cloned",
259                            repo_url,
260                            width = max_name_width
261                        );
262
263                        let actual_origin =
264                            run_git(&repo_path, &["remote", "get-url", "origin"]).await?;
265                        anyhow::ensure!(
266                            actual_origin == repo_url,
267                            "remote origin {actual_origin} does not match expected origin {repo_url}"
268                        );
269                    }
270                }
271            }
272
273            future::join_all(clone_tasks).await;
274
275            for example_instance in examples.iter_mut() {
276                example_instance.fetch().await?;
277            }
278
279            let examples = Rc::new(RefCell::new(VecDeque::from(examples)));
280            let results_by_example_name = Rc::new(RefCell::new(HashMap::default()));
281
282            future::join_all((0..args.concurrency).map(|_| {
283                let app_state = app_state.clone();
284                let judge_model = judge_model.model.clone();
285                let zed_commit_sha = zed_commit_sha.clone();
286                let zed_branch_name = zed_branch_name.clone();
287                let run_id = run_id.clone();
288                let examples = examples.clone();
289                let results = results_by_example_name.clone();
290                cx.spawn(async move |cx| {
291                    loop {
292                        let Some(mut example) = examples.borrow_mut().pop_front() else {
293                            break;
294                        };
295                        let result = async {
296                            example.setup().await?;
297                            let run_output = cx
298                                .update(|cx| example.run(app_state.clone(), cx))?
299                                .await?;
300                            let judge_output = judge_example(
301                                example.clone(),
302                                judge_model.clone(),
303                                &zed_commit_sha,
304                                &zed_branch_name,
305                                &run_id,
306                                &run_output,
307                                enable_telemetry,
308                                cx,
309                            )
310                            .await;
311                            anyhow::Ok((run_output, judge_output))
312                        }
313                        .await;
314                        results
315                            .borrow_mut()
316                            .entry(example.name.clone())
317                            .or_insert(Vec::new())
318                            .push((example.clone(), result));
319                    }
320                })
321            }))
322            .await;
323
324            print_report(
325                &mut results_by_example_name.borrow_mut(),
326                &mut cumulative_tool_metrics,
327                &run_dir,
328            )?;
329
330            app_state.client.telemetry().flush_events().await;
331
332            cx.update(|cx| cx.quit())
333        })
334        .detach_and_log_err(cx);
335    });
336}
337
/// Subset of `workspace::AppState` needed by `HeadlessAssistant`, with additional fields.
///
/// Constructed by `init`, which returns it behind an `Arc`.
pub struct AgentAppState {
    /// Language registry; `init` points its language-server download directory at
    /// `paths::languages_dir()`.
    pub languages: Arc<LanguageRegistry>,
    /// Client created by `init` via `Client::production`; `main` obtains the telemetry handle
    /// from it.
    pub client: Arc<Client>,
    /// User store entity that `init` builds from `client`.
    pub user_store: Entity<UserStore>,
    /// Filesystem handle; `init` supplies a `RealFs`.
    pub fs: Arc<dyn fs::Fs>,
    /// Node runtime whose binary options `init` derives from the `node` project settings.
    pub node_runtime: NodeRuntime,

    // Additional fields not present in `workspace::AppState`.
    /// Prompt builder that `init` loads with `PromptBuilder::load`.
    pub prompt_builder: Arc<PromptBuilder>,
}
349
/// Sets up the application-wide state needed for a headless agent run and returns the handles
/// collected in an [`AgentAppState`].
///
/// The steps run in a fixed order: release channel and Tokio integration, the settings store,
/// an HTTP client with an eval-specific User-Agent and optional proxy, the production
/// `Client`, the filesystem, the language registry, the user store, extensions, the Node
/// runtime, and then the language, model, prompt, terminal and agent subsystems. Finally the
/// settings embedded from `../runner_settings.json` are installed as user settings.
///
/// # Panics
///
/// Panics if the HTTP client cannot be created or if installing the embedded runner settings
/// returns an error.
pub fn init(cx: &mut App) -> Arc<AgentAppState> {
    let app_version = AppVersion::load(env!("ZED_PKG_VERSION"));
    release_channel::init(app_version, cx);
    gpui_tokio::init(cx);

    let settings_store = SettingsStore::new(cx, &settings::default_settings());
    cx.set_global(settings_store);
    client::init_settings(cx);

    // Set User-Agent so we can download language servers from GitHub
    let user_agent = format!(
        "Zed Agent Eval/{} ({}; {})",
        app_version,
        std::env::consts::OS,
        std::env::consts::ARCH
    );
    // A proxy from the settings wins when it parses; otherwise fall back to the environment.
    let proxy_str = ProxySettings::get_global(cx).proxy.to_owned();
    let proxy_url = proxy_str
        .as_ref()
        .and_then(|input| input.parse().ok())
        .or_else(read_proxy_from_env);
    let http = {
        // NOTE(review): the Tokio handle is entered only while the client is constructed,
        // presumably because the reqwest client needs a runtime context — confirm.
        let _guard = Tokio::handle(cx).enter();

        ReqwestClient::proxy_and_user_agent(proxy_url, &user_agent)
            .expect("could not start HTTP client")
    };
    cx.set_http_client(Arc::new(http));

    Project::init_settings(cx);

    let client = Client::production(cx);
    // From here on the app-wide HTTP client is the one exposed by `client`.
    cx.set_http_client(client.http_client());

    let git_binary_path = None;
    let fs = Arc::new(RealFs::new(
        git_binary_path,
        cx.background_executor().clone(),
    ));

    let mut languages = LanguageRegistry::new(cx.background_executor().clone());
    languages.set_language_server_download_dir(paths::languages_dir().clone());
    let languages = Arc::new(languages);

    let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

    extension::init(cx);

    // Node binary options are recomputed and sent on every change of the global settings
    // store; the receiving end `rx` is handed to `NodeRuntime::new` below.
    let (mut tx, rx) = watch::channel(None);
    cx.observe_global::<SettingsStore>(move |cx| {
        let settings = &ProjectSettings::get_global(cx).node;
        let options = NodeBinaryOptions {
            allow_path_lookup: !settings.ignore_system_version,
            allow_binary_download: true,
            use_paths: settings.path.as_ref().map(|node_path| {
                let node_path = PathBuf::from(shellexpand::tilde(node_path).as_ref());
                let npm_path = settings
                    .npm_path
                    .as_ref()
                    .map(|path| PathBuf::from(shellexpand::tilde(&path).as_ref()));
                (
                    node_path.clone(),
                    // Without an explicit npm path, use `npm` in the node binary's directory.
                    npm_path.unwrap_or_else(|| {
                        let base_path = PathBuf::new();
                        node_path.parent().unwrap_or(&base_path).join("npm")
                    }),
                )
            }),
        };
        tx.send(Some(options)).log_err();
    })
    .detach();
    let node_runtime = NodeRuntime::new(client.http_client(), None, rx);

    let extension_host_proxy = ExtensionHostProxy::global(cx);

    language::init(cx);
    debug_adapter_extension::init(extension_host_proxy.clone(), cx);
    language_extension::init(LspAccess::Noop, extension_host_proxy, languages.clone());
    language_model::init(client.clone(), cx);
    language_models::init(user_store.clone(), client.clone(), cx);
    languages::init(languages.clone(), fs.clone(), node_runtime.clone(), cx);
    prompt_store::init(cx);
    terminal_view::init(cx);
    let stdout_is_a_pty = false;
    let prompt_builder = PromptBuilder::load(fs.clone(), stdout_is_a_pty, cx);
    agent_ui::init(
        fs.clone(),
        client.clone(),
        prompt_builder.clone(),
        languages.clone(),
        // NOTE(review): the meaning of this flag is defined by `agent_ui::init` — confirm.
        true,
        cx,
    );

    // Install the settings bundled with the runner as the user settings.
    SettingsStore::update_global(cx, |store, cx| {
        store.set_user_settings(include_str!("../runner_settings.json"), cx)
    })
    .unwrap();

    Arc::new(AgentAppState {
        languages,
        client,
        user_store,
        fs,
        node_runtime,
        prompt_builder,
    })
}
459
460pub fn find_model(
461    model_name: &str,
462    model_registry: &LanguageModelRegistry,
463    cx: &App,
464) -> anyhow::Result<Arc<dyn LanguageModel>> {
465    let selected = SelectedModel::from_str(model_name).map_err(|e| anyhow::anyhow!(e))?;
466    model_registry
467        .available_models(cx)
468        .find(|model| model.id() == selected.model && model.provider_id() == selected.provider)
469        .ok_or_else(|| {
470            anyhow::anyhow!(
471                "No language model with ID {}/{} was available. Available models: {}",
472                selected.model.0,
473                selected.provider.0,
474                model_registry
475                    .available_models(cx)
476                    .map(|model| format!("{}/{}", model.provider_id().0, model.id().0))
477                    .collect::<Vec<_>>()
478                    .join(", ")
479            )
480        })
481}
482
483pub fn load_model(model_name: &str, cx: &mut App) -> anyhow::Result<ConfiguredModel> {
484    let model = {
485        let model_registry = LanguageModelRegistry::read_global(cx);
486        find_model(model_name, model_registry, cx)?
487    };
488
489    let provider = {
490        let model_registry = LanguageModelRegistry::read_global(cx);
491        model_registry
492            .provider(&model.provider_id())
493            .ok_or_else(|| anyhow::anyhow!("Provider not found: {}", model.provider_id()))?
494    };
495
496    Ok(ConfiguredModel {
497        provider: provider.clone(),
498        model: model.clone(),
499    })
500}
501
502pub fn commit_sha_for_path(repo_path: &Path) -> String {
503    futures::executor::block_on(run_git(repo_path, &["rev-parse", "HEAD"])).unwrap()
504}
505
506pub fn git_branch_for_path(repo_path: &Path) -> String {
507    match std::env::var("GITHUB_REF_NAME") {
508        Ok(branch) => branch,
509        Err(_) => {
510            futures::executor::block_on(run_git(repo_path, &["rev-parse", "--abbrev-ref", "HEAD"]))
511                .unwrap_or_else(|_| "unknown".to_string())
512        }
513    }
514}
515
516async fn judge_example(
517    example: ExampleInstance,
518    model: Arc<dyn LanguageModel>,
519    zed_commit_sha: &str,
520    zed_branch_name: &str,
521    run_id: &str,
522    run_output: &RunOutput,
523    enable_telemetry: bool,
524    cx: &AsyncApp,
525) -> JudgeOutput {
526    let judge_output = example.judge(model.clone(), run_output, cx).await;
527
528    if enable_telemetry {
529        telemetry::event!(
530            "Agent Example Evaluated",
531            zed_commit_sha = zed_commit_sha,
532            zed_branch_name = zed_branch_name,
533            run_id = run_id,
534            example_name = example.name.clone(),
535            example_repetition = example.repetition,
536            diff_evaluation = judge_output.diff.clone(),
537            thread_evaluation = judge_output.thread,
538            tool_metrics = run_output.tool_metrics,
539            token_usage = run_output.token_usage,
540            model = model.telemetry_id(),
541            model_provider = model.provider_id().to_string(),
542            repository_url = example.repo_url(),
543            repository_revision = example.revision(),
544            diagnostic_summary_before = run_output.diagnostic_summary_before,
545            diagnostic_summary_after = run_output.diagnostic_summary_after,
546            diagnostics_before = run_output.diagnostics_before,
547            diagnostics_after = run_output.diagnostics_after,
548        );
549    }
550
551    judge_output
552}
553
/// Width, in characters, of the rules and centered titles printed by `print_h1` and `print_h2`.
const HEADER_WIDTH: usize = 65;
555
556fn print_h1(header: &str) {
557    println!("\n\n{:=^HEADER_WIDTH$}", "");
558    println!("{:^HEADER_WIDTH$}", header);
559    println!("{:=^HEADER_WIDTH$}\n", "");
560}
561
562fn print_h2(header: &str) {
563    println!("\n{:-^HEADER_WIDTH$}", "");
564    println!("{:^HEADER_WIDTH$}", header);
565    println!("{:-^HEADER_WIDTH$}\n", "");
566}
567
568fn print_report(
569    results_by_example_name: &mut HashMap<
570        String,
571        Vec<(ExampleInstance, anyhow::Result<(RunOutput, JudgeOutput)>)>,
572    >,
573    cumulative_tool_metrics: &mut ToolMetrics,
574    run_dir: &Path,
575) -> anyhow::Result<()> {
576    print_h1("EVAL RESULTS");
577
578    let mut diff_scores = Vec::new();
579    let mut thread_scores = Vec::new();
580    let mut programmatic_scores = Vec::new();
581    let mut error_count = 0;
582
583    for (example_name, results) in results_by_example_name.iter_mut() {
584        print_h2(example_name);
585
586        results.sort_unstable_by_key(|(example, _)| example.repetition);
587        let mut example_cumulative_tool_metrics = ToolMetrics::default();
588
589        let mut table_rows = String::new();
590
591        for (example, result) in results.iter() {
592            match result {
593                Err(err) => {
594                    display_error_row(&mut table_rows, example.repetition, err.to_string())?;
595                    error_count += 1;
596                    programmatic_scores.push(0.0);
597                    diff_scores.push(0.0);
598                    thread_scores.push(0.0);
599                }
600                Ok((run_output, judge_output)) => {
601                    cumulative_tool_metrics.merge(&run_output.tool_metrics);
602                    example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
603
604                    if run_output.programmatic_assertions.total_count() > 0 {
605                        for assertion in &run_output.programmatic_assertions.ran {
606                            assertions::display_table_row(
607                                &mut table_rows,
608                                example.repetition,
609                                assertion,
610                            )?;
611                        }
612
613                        programmatic_scores
614                            .push(run_output.programmatic_assertions.passed_percentage())
615                    }
616
617                    if !judge_output.diff.is_empty() {
618                        diff_scores.push(judge_output.diff.passed_percentage());
619
620                        for assertion in &judge_output.diff.ran {
621                            assertions::display_table_row(
622                                &mut table_rows,
623                                example.repetition,
624                                assertion,
625                            )?;
626                        }
627                    }
628
629                    if !judge_output.thread.is_empty() {
630                        thread_scores.push(judge_output.thread.passed_percentage());
631
632                        for assertion in &judge_output.thread.ran {
633                            assertions::display_table_row(
634                                &mut table_rows,
635                                example.repetition,
636                                assertion,
637                            )?;
638                        }
639                    }
640                }
641            }
642        }
643
644        let mut all_asserts = Vec::new();
645
646        if !table_rows.is_empty() {
647            assertions::print_table_header();
648            print!("{}", table_rows);
649
650            assertions::print_table_divider();
651
652            for (example, result) in results.iter() {
653                if let Ok((run_output, judge_output)) = result {
654                    let asserts = [
655                        run_output.programmatic_assertions.clone(),
656                        judge_output.diff.clone(),
657                        judge_output.thread.clone(),
658                    ];
659                    all_asserts.extend_from_slice(&asserts);
660                    assertions::print_table_round_summary(
661                        &example.repetition.to_string(),
662                        asserts.iter(),
663                    )
664                } else if let Err(err) = result {
665                    let assert = AssertionsReport::error(err.to_string());
666                    all_asserts.push(assert.clone());
667                    assertions::print_table_round_summary(
668                        &example.repetition.to_string(),
669                        [assert].iter(),
670                    )
671                }
672            }
673
674            assertions::print_table_divider();
675
676            assertions::print_table_round_summary("avg", all_asserts.iter());
677
678            assertions::print_table_footer();
679        }
680
681        if !example_cumulative_tool_metrics.is_empty() {
682            println!("{}", &example_cumulative_tool_metrics);
683        }
684    }
685
686    if results_by_example_name.len() > 1 {
687        print_h1("AGGREGATE");
688
689        if error_count > 0 {
690            println!("\n{error_count} examples failed to run!");
691        }
692
693        let programmatic_score_count = programmatic_scores.len();
694        if programmatic_score_count > 0 {
695            let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
696                / (programmatic_score_count as f32))
697                .floor();
698            println!("Average programmatic score: {average_programmatic_score}%");
699        }
700
701        let diff_score_count = diff_scores.len();
702        if diff_score_count > 0 {
703            let average_diff_score =
704                (diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
705            println!("Average diff score: {average_diff_score}%");
706        }
707
708        let thread_score_count = thread_scores.len();
709
710        if thread_score_count > 0 {
711            let average_thread_score =
712                (thread_scores.into_iter().sum::<f32>() / (thread_score_count as f32)).floor();
713            println!("Average thread score: {average_thread_score}%");
714        }
715
716        println!();
717
718        print_h2("CUMULATIVE TOOL METRICS");
719        println!("{}", cumulative_tool_metrics);
720    }
721
722    let explorer_output_path = run_dir.join("overview.html");
723    let mut json_paths: Vec<PathBuf> = results_by_example_name
724        .values()
725        .flat_map(|results| {
726            results.iter().map(|(example, _)| {
727                let absolute_path = run_dir.join(example.run_directory.join("last.messages.json"));
728                let cwd = std::env::current_dir().expect("Can't get current dir");
729                pathdiff::diff_paths(&absolute_path, cwd).unwrap_or_else(|| absolute_path.clone())
730            })
731        })
732        .collect::<Vec<_>>();
733    json_paths.sort();
734    if let Err(err) = explorer::generate_explorer_html(&json_paths, &explorer_output_path) {
735        eprintln!("Failed to generate explorer HTML: {}", err);
736    }
737
738    Ok(())
739}