eval.rs

  1mod assertions;
  2mod example;
  3mod examples;
  4mod explorer;
  5mod ids;
  6mod instance;
  7mod tool_metrics;
  8
  9use assertions::display_error_row;
 10use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git};
 11pub(crate) use tool_metrics::*;
 12
 13use ::fs::RealFs;
 14use anyhow::anyhow;
 15use clap::Parser;
 16use client::{Client, ProxySettings, UserStore};
 17use collections::{HashMap, HashSet};
 18use extension::ExtensionHostProxy;
 19use futures::future;
 20use gpui::http_client::read_proxy_from_env;
 21use gpui::{App, AppContext, Application, AsyncApp, Entity, SemanticVersion, UpdateGlobal};
 22use gpui_tokio::Tokio;
 23use language::LanguageRegistry;
 24use language_model::{ConfiguredModel, LanguageModel, LanguageModelRegistry};
 25use node_runtime::{NodeBinaryOptions, NodeRuntime};
 26use project::Project;
 27use project::project_settings::ProjectSettings;
 28use prompt_store::PromptBuilder;
 29use release_channel::AppVersion;
 30use reqwest_client::ReqwestClient;
 31use settings::{Settings, SettingsStore};
 32use std::cell::RefCell;
 33use std::collections::VecDeque;
 34use std::env;
 35use std::path::{Path, PathBuf};
 36use std::rc::Rc;
 37use std::sync::{Arc, LazyLock};
 38use util::ResultExt as _;
 39
/// Directory containing this crate's `Cargo.toml`, taken from the
/// `CARGO_MANIFEST_DIR` variable that is read at compile time via `env!`.
/// The `PathBuf` is built lazily on first access.
static CARGO_MANIFEST_DIR: LazyLock<PathBuf> =
    LazyLock::new(|| PathBuf::from(env!("CARGO_MANIFEST_DIR")));
 42
 43#[derive(Parser, Debug)]
 44#[command(name = "eval", disable_version_flag = true)]
 45struct Args {
 46    /// Runs all examples and threads that contain these substrings. If unspecified, all examples and threads are run.
 47    #[arg(value_name = "EXAMPLE_SUBSTRING")]
 48    filter: Vec<String>,
 49    /// Model to use (default: "claude-3-7-sonnet-latest")
 50    #[arg(long, default_value = "claude-3-7-sonnet-latest")]
 51    model: String,
 52    #[arg(long, value_delimiter = ',', default_value = "rs,ts")]
 53    languages: Vec<String>,
 54    /// How many times to run each example.
 55    #[arg(long, default_value = "8")]
 56    repetitions: usize,
 57    /// Maximum number of examples to run concurrently.
 58    #[arg(long, default_value = "4")]
 59    concurrency: usize,
 60}
 61
 62fn main() {
 63    dotenv::from_filename(CARGO_MANIFEST_DIR.join(".env")).ok();
 64
 65    env_logger::init();
 66
 67    let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
 68    let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
 69    let session_id = uuid::Uuid::new_v4().to_string();
 70    let run_timestamp = chrono::Local::now().format("%Y-%m-%d_%H-%M-%S");
 71    let run_id = match env::var("GITHUB_RUN_ID") {
 72        Ok(run_id) => format!("github/{}", run_id),
 73        Err(_) => format!("local/{}", run_timestamp),
 74    };
 75
 76    let root_dir = Path::new(std::env!("CARGO_MANIFEST_DIR"))
 77        .parent()
 78        .unwrap()
 79        .parent()
 80        .unwrap()
 81        .canonicalize()
 82        .unwrap();
 83    let eval_crate_dir = root_dir.join("crates").join("eval");
 84    let repos_dir = eval_crate_dir.join("repos");
 85    let worktrees_dir = eval_crate_dir.join("worktrees");
 86    let examples_dir = eval_crate_dir.join("src").join("examples");
 87    let run_dir = eval_crate_dir
 88        .join("runs")
 89        .join(format!("{}", run_timestamp));
 90    std::fs::create_dir_all(&run_dir).unwrap();
 91    std::fs::create_dir_all(&repos_dir).unwrap();
 92    std::fs::create_dir_all(&worktrees_dir).unwrap();
 93    std::fs::create_dir_all(&examples_dir).unwrap();
 94    std::fs::create_dir_all(&paths::config_dir()).unwrap();
 95
 96    let zed_commit_sha = commit_sha_for_path(&root_dir);
 97    let zed_branch_name = git_branch_for_path(&root_dir);
 98    let args = Args::parse();
 99    let languages: HashSet<String> = args.languages.into_iter().collect();
100
101    let http_client = Arc::new(ReqwestClient::new());
102    let app = Application::headless().with_http_client(http_client.clone());
103    let all_threads = examples::all(&examples_dir);
104
105    app.run(move |cx| {
106        let app_state = init(cx);
107
108        let telemetry = app_state.client.telemetry();
109        telemetry.start(system_id, installation_id, session_id, cx);
110
111        let enable_telemetry = env::var("ZED_EVAL_TELEMETRY").map_or(false, |value| value == "1")
112            && telemetry.has_checksum_seed();
113        if enable_telemetry {
114            println!("Telemetry enabled");
115            telemetry::event!(
116                "Agent Eval Started",
117                zed_commit_sha = zed_commit_sha,
118                zed_branch_name = zed_branch_name,
119                run_id = run_id,
120            );
121        }
122
123        let mut cumulative_tool_metrics = ToolMetrics::default();
124
125        let model_registry = LanguageModelRegistry::read_global(cx);
126        let model = find_model("claude-3-7-sonnet-latest", model_registry, cx).unwrap();
127        let model_provider_id = model.provider_id();
128        let model_provider = model_registry.provider(&model_provider_id).unwrap();
129
130        LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
131            registry.set_default_model(
132                Some(ConfiguredModel {
133                    provider: model_provider.clone(),
134                    model: model.clone(),
135                }),
136                cx,
137            );
138        });
139
140        let authenticate_task = model_provider.authenticate(cx);
141
142        cx.spawn(async move |cx| {
143            authenticate_task.await.unwrap();
144
145            let mut examples = Vec::new();
146
147            const COLORS: [&str; 12] = [
148                "\x1b[31m", // Red
149                "\x1b[32m", // Green
150                "\x1b[33m", // Yellow
151                "\x1b[34m", // Blue
152                "\x1b[35m", // Magenta
153                "\x1b[36m", // Cyan
154                "\x1b[91m", // Bright Red
155                "\x1b[92m", // Bright Green
156                "\x1b[93m", // Bright Yellow
157                "\x1b[94m", // Bright Blue
158                "\x1b[95m", // Bright Magenta
159                "\x1b[96m", // Bright Cyan
160            ];
161
162            let mut skipped = Vec::new();
163
164            for thread in all_threads {
165                let meta = thread.meta();
166                if !args.filter.is_empty() && !args.filter.iter().any(|sub| meta.name.contains(sub))
167                {
168                    skipped.push(meta.name);
169                    continue;
170                }
171
172                if meta.language_server.map_or(false, |language| {
173                    !languages.contains(&language.file_extension)
174                }) {
175                    skipped.push(meta.name);
176                    continue;
177                }
178
179                // TODO: This creates a worktree per repetition. Ideally these examples should
180                // either be run sequentially on the same worktree, or reuse worktrees when there
181                // are more examples to run than the concurrency limit.
182                for repetition_number in 0..args.repetitions {
183                    let example_instance = ExampleInstance::new(
184                        thread.clone(),
185                        &repos_dir,
186                        &run_dir,
187                        &worktrees_dir,
188                        repetition_number,
189                    );
190
191                    examples.push(example_instance);
192                }
193            }
194
195            if !skipped.is_empty() {
196                println!("Skipped threads: {}", skipped.join(", "));
197            }
198
199            if examples.is_empty() {
200                eprintln!("Filter matched no examples");
201                return cx.update(|cx| cx.quit());
202            }
203
204            let mut repo_urls = HashSet::default();
205            let mut clone_tasks = Vec::new();
206
207            let max_name_width = examples
208                .iter()
209                .map(|e| e.worktree_name().len())
210                .max()
211                .unwrap_or(0);
212
213            for (i, example_instance) in examples.iter_mut().enumerate() {
214                let color = COLORS[i % COLORS.len()].to_string();
215                example_instance.set_log_prefix_style(&color, max_name_width);
216
217                println!(
218                    "{}Logging to: {}",
219                    example_instance.log_prefix,
220                    example_instance.run_directory.display()
221                );
222
223                let repo_url = example_instance.repo_url();
224                if repo_urls.insert(repo_url.clone()) {
225                    let repo_path = example_instance.repo_path.clone();
226
227                    if !repo_path.join(".git").is_dir() {
228                        println!(
229                            "{:<width$} < {}",
230                            "↓ Cloning",
231                            repo_url,
232                            width = max_name_width
233                        );
234
235                        let git_task = cx.spawn(async move |_cx| {
236                            std::fs::create_dir_all(&repo_path)?;
237                            run_git(&repo_path, &["init"]).await?;
238                            run_git(&repo_path, &["remote", "add", "origin", &repo_url]).await
239                        });
240
241                        clone_tasks.push(git_task);
242                    } else {
243                        println!(
244                            "{:<width$}  < {}",
245                            "✔︎ Already cloned",
246                            repo_url,
247                            width = max_name_width
248                        );
249
250                        let actual_origin =
251                            run_git(&repo_path, &["remote", "get-url", "origin"]).await?;
252                        if actual_origin != repo_url {
253                            return Err(anyhow!(
254                                "remote origin {} does not match expected origin {}",
255                                actual_origin,
256                                repo_url,
257                            ));
258                        }
259                    }
260                }
261            }
262
263            future::join_all(clone_tasks).await;
264
265            for example_instance in examples.iter_mut() {
266                example_instance.fetch().await?;
267            }
268
269            let examples = Rc::new(RefCell::new(VecDeque::from(examples)));
270            let results_by_example_name = Rc::new(RefCell::new(HashMap::default()));
271
272            future::join_all((0..args.concurrency).map(|_| {
273                let app_state = app_state.clone();
274                let model = model.clone();
275                let zed_commit_sha = zed_commit_sha.clone();
276                let zed_branch_name = zed_branch_name.clone();
277                let run_id = run_id.clone();
278                let examples = examples.clone();
279                let results = results_by_example_name.clone();
280                cx.spawn(async move |cx| {
281                    loop {
282                        let Some(mut example) = examples.borrow_mut().pop_front() else {
283                            break;
284                        };
285                        let result = async {
286                            example.setup().await?;
287                            let run_output = cx
288                                .update(|cx| example.run(model.clone(), app_state.clone(), cx))?
289                                .await?;
290                            let judge_output = judge_example(
291                                example.clone(),
292                                model.clone(),
293                                &zed_commit_sha,
294                                &zed_branch_name,
295                                &run_id,
296                                &run_output,
297                                enable_telemetry,
298                                cx,
299                            )
300                            .await;
301                            anyhow::Ok((run_output, judge_output))
302                        }
303                        .await;
304                        results
305                            .borrow_mut()
306                            .entry(example.name.clone())
307                            .or_insert(Vec::new())
308                            .push((example.clone(), result));
309                    }
310                })
311            }))
312            .await;
313
314            print_report(
315                &mut results_by_example_name.borrow_mut(),
316                &mut cumulative_tool_metrics,
317                &run_dir,
318            )?;
319
320            app_state.client.telemetry().flush_events().await;
321
322            cx.update(|cx| cx.quit())
323        })
324        .detach_and_log_err(cx);
325    });
326}
327
/// Subset of `workspace::AppState` needed by `HeadlessAssistant`, with additional fields.
///
/// An instance is built by [`init`] and handed out behind an `Arc`.
pub struct AgentAppState {
    /// Language registry; `init` sets its language server download directory to
    /// `paths::languages_dir()`.
    pub languages: Arc<LanguageRegistry>,
    /// Client created by `init` via `Client::production`; its telemetry handle is used by `main`.
    pub client: Arc<Client>,
    /// User store entity created by `init` from `client`.
    pub user_store: Entity<UserStore>,
    /// File system handle; `init` stores a `RealFs` here.
    pub fs: Arc<dyn fs::Fs>,
    /// Node runtime created by `init`; it receives `NodeBinaryOptions` updates whenever the
    /// global `SettingsStore` changes.
    pub node_runtime: NodeRuntime,

    // Additional fields not present in `workspace::AppState`.
    /// Prompt builder loaded by `init` via `PromptBuilder::load`.
    pub prompt_builder: Arc<PromptBuilder>,
}
339
340pub fn init(cx: &mut App) -> Arc<AgentAppState> {
341    release_channel::init(SemanticVersion::default(), cx);
342    gpui_tokio::init(cx);
343
344    let mut settings_store = SettingsStore::new(cx);
345    settings_store
346        .set_default_settings(settings::default_settings().as_ref(), cx)
347        .unwrap();
348    cx.set_global(settings_store);
349    client::init_settings(cx);
350
351    // Set User-Agent so we can download language servers from GitHub
352    let user_agent = format!(
353        "Zed/{} ({}; {})",
354        AppVersion::global(cx),
355        std::env::consts::OS,
356        std::env::consts::ARCH
357    );
358    let proxy_str = ProxySettings::get_global(cx).proxy.to_owned();
359    let proxy_url = proxy_str
360        .as_ref()
361        .and_then(|input| input.parse().ok())
362        .or_else(read_proxy_from_env);
363    let http = {
364        let _guard = Tokio::handle(cx).enter();
365
366        ReqwestClient::proxy_and_user_agent(proxy_url, &user_agent)
367            .expect("could not start HTTP client")
368    };
369    cx.set_http_client(Arc::new(http));
370
371    Project::init_settings(cx);
372
373    let client = Client::production(cx);
374    cx.set_http_client(client.http_client());
375
376    let git_binary_path = None;
377    let fs = Arc::new(RealFs::new(
378        git_binary_path,
379        cx.background_executor().clone(),
380    ));
381
382    let mut languages = LanguageRegistry::new(cx.background_executor().clone());
383    languages.set_language_server_download_dir(paths::languages_dir().clone());
384    let languages = Arc::new(languages);
385
386    let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
387
388    extension::init(cx);
389
390    let (tx, rx) = async_watch::channel(None);
391    cx.observe_global::<SettingsStore>(move |cx| {
392        let settings = &ProjectSettings::get_global(cx).node;
393        let options = NodeBinaryOptions {
394            allow_path_lookup: !settings.ignore_system_version.unwrap_or_default(),
395            allow_binary_download: true,
396            use_paths: settings.path.as_ref().map(|node_path| {
397                let node_path = PathBuf::from(shellexpand::tilde(node_path).as_ref());
398                let npm_path = settings
399                    .npm_path
400                    .as_ref()
401                    .map(|path| PathBuf::from(shellexpand::tilde(&path).as_ref()));
402                (
403                    node_path.clone(),
404                    npm_path.unwrap_or_else(|| {
405                        let base_path = PathBuf::new();
406                        node_path.parent().unwrap_or(&base_path).join("npm")
407                    }),
408                )
409            }),
410        };
411        tx.send(Some(options)).log_err();
412    })
413    .detach();
414    let node_runtime = NodeRuntime::new(client.http_client(), rx);
415
416    let extension_host_proxy = ExtensionHostProxy::global(cx);
417
418    language::init(cx);
419    language_extension::init(extension_host_proxy.clone(), languages.clone());
420    language_model::init(client.clone(), cx);
421    language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
422    languages::init(languages.clone(), node_runtime.clone(), cx);
423    context_server::init(cx);
424    prompt_store::init(cx);
425    let stdout_is_a_pty = false;
426    let prompt_builder = PromptBuilder::load(fs.clone(), stdout_is_a_pty, cx);
427    agent::init(
428        fs.clone(),
429        client.clone(),
430        prompt_builder.clone(),
431        languages.clone(),
432        cx,
433    );
434    assistant_tools::init(client.http_client(), cx);
435
436    SettingsStore::update_global(cx, |store, cx| {
437        store.set_user_settings(include_str!("../runner_settings.json"), cx)
438    })
439    .unwrap();
440
441    Arc::new(AgentAppState {
442        languages,
443        client,
444        user_store,
445        fs,
446        node_runtime,
447        prompt_builder,
448    })
449}
450
451pub fn find_model(
452    model_name: &str,
453    model_registry: &LanguageModelRegistry,
454    cx: &App,
455) -> anyhow::Result<Arc<dyn LanguageModel>> {
456    let model = model_registry
457        .available_models(cx)
458        .find(|model| model.id().0 == model_name);
459
460    let Some(model) = model else {
461        return Err(anyhow!(
462            "No language model named {} was available. Available models: {}",
463            model_name,
464            model_registry
465                .available_models(cx)
466                .map(|model| model.id().0.clone())
467                .collect::<Vec<_>>()
468                .join(", ")
469        ));
470    };
471
472    Ok(model)
473}
474
475pub fn commit_sha_for_path(repo_path: &Path) -> String {
476    futures::executor::block_on(run_git(repo_path, &["rev-parse", "HEAD"])).unwrap()
477}
478
479pub fn git_branch_for_path(repo_path: &Path) -> String {
480    match std::env::var("GITHUB_REF_NAME") {
481        Ok(branch) => branch,
482        Err(_) => {
483            futures::executor::block_on(run_git(repo_path, &["rev-parse", "--abbrev-ref", "HEAD"]))
484                .unwrap_or_else(|_| "unknown".to_string())
485        }
486    }
487}
488
489async fn judge_example(
490    example: ExampleInstance,
491    model: Arc<dyn LanguageModel>,
492    zed_commit_sha: &str,
493    zed_branch_name: &str,
494    run_id: &str,
495    run_output: &RunOutput,
496    enable_telemetry: bool,
497    cx: &AsyncApp,
498) -> JudgeOutput {
499    let judge_output = example.judge(model.clone(), &run_output, cx).await;
500
501    if enable_telemetry {
502        telemetry::event!(
503            "Agent Example Evaluated",
504            zed_commit_sha = zed_commit_sha,
505            zed_branch_name = zed_branch_name,
506            run_id = run_id,
507            example_name = example.name.clone(),
508            example_repetition = example.repetition,
509            diff_evaluation = judge_output.diff.clone(),
510            thread_evaluation = judge_output.thread.clone(),
511            tool_metrics = run_output.tool_metrics,
512            response_count = run_output.response_count,
513            token_usage = run_output.token_usage,
514            model = model.telemetry_id(),
515            model_provider = model.provider_id().to_string(),
516            repository_url = example.repo_url(),
517            repository_revision = example.revision(),
518            diagnostic_summary_before = run_output.diagnostic_summary_before,
519            diagnostic_summary_after = run_output.diagnostic_summary_after,
520            diagnostics_before = run_output.diagnostics_before,
521            diagnostics_after = run_output.diagnostics_after,
522        );
523    }
524
525    judge_output
526}
527
/// Width, in characters, of the rules and centered titles printed by `print_h1` and `print_h2`.
const HEADER_WIDTH: usize = 65;
529
530fn print_h1(header: &str) {
531    println!("\n\n{:=^HEADER_WIDTH$}", "");
532    println!("{:^HEADER_WIDTH$}", header);
533    println!("{:=^HEADER_WIDTH$}\n", "");
534}
535
536fn print_h2(header: &str) {
537    println!("\n{:-^HEADER_WIDTH$}", "");
538    println!("{:^HEADER_WIDTH$}", header);
539    println!("{:-^HEADER_WIDTH$}\n", "");
540}
541
542fn print_report(
543    results_by_example_name: &mut HashMap<
544        String,
545        Vec<(ExampleInstance, anyhow::Result<(RunOutput, JudgeOutput)>)>,
546    >,
547    cumulative_tool_metrics: &mut ToolMetrics,
548    run_dir: &Path,
549) -> anyhow::Result<()> {
550    print_h1("EVAL RESULTS");
551
552    let mut diff_scores = Vec::new();
553    let mut thread_scores = Vec::new();
554    let mut programmatic_scores = Vec::new();
555    let mut error_count = 0;
556
557    for (example_name, results) in results_by_example_name.iter_mut() {
558        print_h2(example_name);
559
560        results.sort_unstable_by_key(|(example, _)| example.repetition);
561        let mut example_cumulative_tool_metrics = ToolMetrics::default();
562
563        let mut table_rows = String::new();
564
565        for (example, result) in results.iter() {
566            match result {
567                Err(err) => {
568                    display_error_row(&mut table_rows, example.repetition, err.to_string())?;
569                    error_count += 1;
570                }
571                Ok((run_output, judge_output)) => {
572                    cumulative_tool_metrics.merge(&run_output.tool_metrics);
573                    example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
574
575                    if !run_output.programmatic_assertions.total_count() > 0 {
576                        for assertion in &run_output.programmatic_assertions.ran {
577                            assertions::display_table_row(
578                                &mut table_rows,
579                                example.repetition,
580                                assertion,
581                            )?;
582                        }
583
584                        programmatic_scores
585                            .push(run_output.programmatic_assertions.passed_percentage())
586                    }
587
588                    if !judge_output.diff.is_empty() {
589                        diff_scores.push(judge_output.diff.passed_percentage());
590
591                        for assertion in &judge_output.diff.ran {
592                            assertions::display_table_row(
593                                &mut table_rows,
594                                example.repetition,
595                                assertion,
596                            )?;
597                        }
598                    }
599
600                    if !judge_output.thread.is_empty() {
601                        thread_scores.push(judge_output.thread.passed_percentage());
602
603                        for assertion in &judge_output.thread.ran {
604                            assertions::display_table_row(
605                                &mut table_rows,
606                                example.repetition,
607                                assertion,
608                            )?;
609                        }
610                    }
611                }
612            }
613        }
614
615        if !table_rows.is_empty() {
616            assertions::print_table_header();
617            print!("{}", table_rows);
618
619            assertions::print_table_divider();
620
621            for (example, result) in results.iter() {
622                if let Ok((run_output, judge_output)) = result {
623                    assertions::print_table_round_summary(
624                        &example.repetition.to_string(),
625                        [
626                            &run_output.programmatic_assertions,
627                            &judge_output.diff,
628                            &judge_output.thread,
629                        ]
630                        .into_iter(),
631                    )
632                }
633            }
634
635            assertions::print_table_divider();
636
637            assertions::print_table_round_summary(
638                "avg",
639                results.iter().flat_map(|(_, result)| {
640                    result.iter().flat_map(|(run_output, judge_output)| {
641                        [
642                            &run_output.programmatic_assertions,
643                            &judge_output.diff,
644                            &judge_output.thread,
645                        ]
646                        .into_iter()
647                    })
648                }),
649            );
650
651            assertions::print_table_footer();
652        }
653
654        if !example_cumulative_tool_metrics.is_empty() {
655            println!("{}", &example_cumulative_tool_metrics);
656        }
657    }
658
659    if results_by_example_name.len() > 1 {
660        print_h1("AGGREGATE");
661
662        if error_count > 0 {
663            println!("\n{error_count} examples failed to run!");
664        }
665
666        let programmatic_score_count = programmatic_scores.len();
667        if programmatic_score_count > 0 {
668            let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
669                / (programmatic_score_count as f32))
670                .floor();
671            println!("Average programmatic score: {average_programmatic_score}%");
672        }
673
674        let diff_score_count = diff_scores.len();
675        if diff_score_count > 0 {
676            let average_diff_score =
677                (diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
678            println!("Average diff score: {average_diff_score}%");
679        }
680
681        let thread_score_count = thread_scores.len();
682
683        if thread_score_count > 0 {
684            let average_thread_score =
685                (thread_scores.into_iter().sum::<f32>() / (thread_score_count as f32)).floor();
686            println!("Average thread score: {average_thread_score}%");
687        }
688
689        println!("");
690
691        print_h2("CUMULATIVE TOOL METRICS");
692        println!("{}", cumulative_tool_metrics);
693    }
694
695    let explorer_output_path = run_dir.join("overview.html");
696    let mut json_paths: Vec<PathBuf> = results_by_example_name
697        .values()
698        .flat_map(|results| {
699            results.iter().map(|(example, _)| {
700                let absolute_path = example.run_directory.join("last.messages.json");
701                pathdiff::diff_paths(&absolute_path, run_dir)
702                    .unwrap_or_else(|| absolute_path.clone())
703            })
704        })
705        .collect::<Vec<_>>();
706    json_paths.sort();
707    if let Err(err) = explorer::generate_explorer_html(&json_paths, &explorer_output_path) {
708        eprintln!("Failed to generate explorer HTML: {}", err);
709    }
710
711    Ok(())
712}