eval: Remove deprecated eval crate and workflow (#52733)

Ben Brandt created

This is replaced by eval_cli

Release Notes:

- N/A

Change summary

.github/CODEOWNERS.hold                                       |    1 
.github/workflows/run_agent_evals.yml                         |   71 
.zed/settings.json                                            |    2 
Cargo.lock                                                    |   56 
Cargo.toml                                                    |    1 
crates/agent/Cargo.toml                                       |    1 
crates/agent/src/thread.rs                                    |    8 
crates/eval/.gitignore                                        |    3 
crates/eval/Cargo.toml                                        |   70 
crates/eval/LICENSE-GPL                                       |    1 
crates/eval/README.md                                         |   27 
crates/eval/build.rs                                          |   14 
crates/eval/docs/explorer.md                                  |   27 
crates/eval/runner_settings.json                              |    7 
crates/eval/src/assertions.rs                                 |  170 
crates/eval/src/eval.rs                                       |  742 --
crates/eval/src/example.rs                                    |  561 -
crates/eval/src/examples/add_arg_to_trait_method.rs           |  115 
crates/eval/src/examples/code_block_citations.rs              |  218 
crates/eval/src/examples/comment_translation.rs               |   60 
crates/eval/src/examples/file_change_notification.rs          |   74 
crates/eval/src/examples/file_search.rs                       |   55 
crates/eval/src/examples/find_and_replace_diff_card.toml      |   43 
crates/eval/src/examples/grep_params_escapement.rs            |   59 
crates/eval/src/examples/hallucinated_tool_calls.toml         |   13 
crates/eval/src/examples/mod.rs                               |  173 
crates/eval/src/examples/no_tools_enabled.toml                |   19 
crates/eval/src/examples/overwrite_file.rs                    |   51 
crates/eval/src/examples/planets.rs                           |   75 
crates/eval/src/examples/threads/overwrite-file.json          |   27 
crates/eval/src/examples/tree_sitter_drop_emscripten_dep.toml |   53 
crates/eval/src/explorer.html                                 |  949 ---
crates/eval/src/explorer.rs                                   |  182 
crates/eval/src/ids.rs                                        |   29 
crates/eval/src/instance.rs                                   | 1446 -----
crates/eval/src/judge_diff_prompt.hbs                         |   25 
crates/eval/src/judge_thread_prompt.hbs                       |   21 
crates/eval/src/tool_metrics.rs                               |  106 
tooling/xtask/src/tasks/workflows.rs                          |    1 
tooling/xtask/src/tasks/workflows/run_agent_evals.rs          |   47 
typos.toml                                                    |    2 
41 files changed, 1 insertion(+), 5,604 deletions(-)

Detailed changes

.github/CODEOWNERS.hold 🔗

@@ -48,7 +48,6 @@
 /crates/edit_prediction_context/ @zed-industries/ai-team
 /crates/edit_prediction_types/ @zed-industries/ai-team
 /crates/edit_prediction_ui/ @zed-industries/ai-team
-/crates/eval/ @zed-industries/ai-team
 /crates/eval_utils/ @zed-industries/ai-team
 /crates/google_ai/ @zed-industries/ai-team
 /crates/language_model/ @zed-industries/ai-team

.github/workflows/run_agent_evals.yml 🔗

@@ -1,71 +0,0 @@
-# Generated from xtask::workflows::run_agent_evals
-# Rebuild with `cargo xtask workflows`.
-name: run_agent_evals
-env:
-  CARGO_TERM_COLOR: always
-  CARGO_INCREMENTAL: '0'
-  RUST_BACKTRACE: '1'
-  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
-  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-  GOOGLE_AI_API_KEY: ${{ secrets.GOOGLE_AI_API_KEY }}
-  GOOGLE_CLOUD_PROJECT: ${{ secrets.GOOGLE_CLOUD_PROJECT }}
-  ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
-  ZED_EVAL_TELEMETRY: '1'
-  MODEL_NAME: ${{ inputs.model_name }}
-on:
-  workflow_dispatch:
-    inputs:
-      model_name:
-        description: model_name
-        required: true
-        type: string
-jobs:
-  agent_evals:
-    runs-on: namespace-profile-16x32-ubuntu-2204
-    steps:
-    - name: steps::checkout_repo
-      uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd
-      with:
-        clean: false
-    - name: steps::cache_rust_dependencies_namespace
-      uses: namespacelabs/nscloud-cache-action@a90bb5d4b27522ce881c6e98eebd7d7e6d1653f9
-      with:
-        cache: rust
-        path: ~/.rustup
-    - name: steps::setup_linux
-      run: ./script/linux
-    - name: steps::download_wasi_sdk
-      run: ./script/download-wasi-sdk
-    - name: steps::setup_cargo_config
-      run: |
-        mkdir -p ./../.cargo
-        cp ./.cargo/ci-config.toml ./../.cargo/config.toml
-    - name: steps::setup_sccache
-      run: ./script/setup-sccache
-      env:
-        R2_ACCOUNT_ID: ${{ secrets.R2_ACCOUNT_ID }}
-        R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
-        R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
-        SCCACHE_BUCKET: sccache-zed
-    - name: cargo build --package=eval
-      run: cargo build --package=eval
-    - name: run_agent_evals::agent_evals::run_eval
-      run: cargo run --package=eval -- --repetitions=8 --concurrency=1 --model "${MODEL_NAME}"
-      env:
-        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
-        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        GOOGLE_AI_API_KEY: ${{ secrets.GOOGLE_AI_API_KEY }}
-        GOOGLE_CLOUD_PROJECT: ${{ secrets.GOOGLE_CLOUD_PROJECT }}
-    - name: steps::show_sccache_stats
-      run: sccache --show-stats || true
-    - name: steps::cleanup_cargo_config
-      if: always()
-      run: |
-        rm -rf ./../.cargo
-    timeout-minutes: 600
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
-  cancel-in-progress: true
-defaults:
-  run:
-    shell: bash -euxo pipefail {0}

.zed/settings.json 🔗

@@ -59,8 +59,6 @@
   "file_scan_exclusions": [
     "crates/agent/src/edit_agent/evals/fixtures",
     "crates/agent/src/tools/evals/fixtures",
-    "crates/eval/worktrees/",
-    "crates/eval/repos/",
     "**/.git",
     "**/.svn",
     "**/.hg",

Cargo.lock 🔗

@@ -5825,62 +5825,6 @@ dependencies = [
  "num-traits",
 ]
 
-[[package]]
-name = "eval"
-version = "0.1.0"
-dependencies = [
- "acp_thread",
- "agent",
- "agent-client-protocol",
- "agent_settings",
- "agent_ui",
- "anyhow",
- "async-trait",
- "buffer_diff",
- "chrono",
- "clap",
- "client",
- "collections",
- "debug_adapter_extension",
- "dirs 4.0.0",
- "dotenvy",
- "env_logger 0.11.8",
- "extension",
- "fs",
- "futures 0.3.31",
- "gpui",
- "gpui_platform",
- "gpui_tokio",
- "handlebars 4.5.0",
- "language",
- "language_extension",
- "language_model",
- "language_models",
- "languages",
- "markdown",
- "node_runtime",
- "pathdiff",
- "paths",
- "pretty_assertions",
- "project",
- "prompt_store",
- "rand 0.9.2",
- "regex",
- "release_channel",
- "reqwest_client",
- "serde",
- "serde_json",
- "settings",
- "shellexpand 2.1.2",
- "telemetry",
- "terminal_view",
- "toml 0.8.23",
- "unindent",
- "util",
- "uuid",
- "watch",
-]
-
 [[package]]
 name = "eval_cli"
 version = "0.1.0"

Cargo.toml 🔗

@@ -65,7 +65,6 @@ members = [
     "crates/editor",
     "crates/encoding_selector",
     "crates/etw_tracing",
-    "crates/eval",
     "crates/eval_cli",
     "crates/eval_utils",
     "crates/explorer_command_injector",

crates/agent/Cargo.toml 🔗

@@ -10,7 +10,6 @@ path = "src/agent.rs"
 
 [features]
 test-support = ["db/test-support"]
-eval = []
 unit-eval = []
 e2e = []
 

crates/agent/src/thread.rs 🔗

@@ -1804,14 +1804,6 @@ impl Thread {
         cx.notify();
     }
 
-    #[cfg(feature = "eval")]
-    pub fn proceed(
-        &mut self,
-        cx: &mut Context<Self>,
-    ) -> Result<mpsc::UnboundedReceiver<Result<ThreadEvent>>> {
-        self.run_turn(cx)
-    }
-
     fn run_turn(
         &mut self,
         cx: &mut Context<Self>,

crates/eval/Cargo.toml 🔗

@@ -1,70 +0,0 @@
-[package]
-name = "eval"
-version = "0.1.0"
-publish.workspace = true
-edition.workspace = true
-license = "GPL-3.0-or-later"
-default-run = "eval"
-
-[lints]
-workspace = true
-
-[[bin]]
-name = "eval"
-path = "src/eval.rs"
-
-[[bin]]
-name = "explorer"
-path = "src/explorer.rs"
-
-[dependencies]
-acp_thread.workspace = true
-agent = { workspace = true, features = ["eval"] }
-agent-client-protocol.workspace = true
-agent_settings.workspace = true
-agent_ui.workspace = true
-anyhow.workspace = true
-async-trait.workspace = true
-buffer_diff.workspace = true
-chrono.workspace = true
-clap.workspace = true
-client.workspace = true
-collections.workspace = true
-debug_adapter_extension.workspace = true
-dirs.workspace = true
-dotenvy.workspace = true
-env_logger.workspace = true
-extension.workspace = true
-fs.workspace = true
-futures.workspace = true
-gpui.workspace = true
-gpui_platform.workspace = true
-gpui_tokio.workspace = true
-handlebars.workspace = true
-language.workspace = true
-language_extension.workspace = true
-language_model.workspace = true
-language_models.workspace = true
-languages = { workspace = true, features = ["load-grammars"] }
-markdown.workspace = true
-node_runtime.workspace = true
-pathdiff.workspace = true
-paths.workspace = true
-pretty_assertions.workspace = true
-project.workspace = true
-prompt_store.workspace = true
-regex.workspace = true
-rand.workspace = true
-release_channel.workspace = true
-reqwest_client.workspace = true
-serde.workspace = true
-serde_json.workspace = true
-settings.workspace = true
-shellexpand.workspace = true
-telemetry.workspace = true
-terminal_view.workspace = true
-toml.workspace = true
-unindent.workspace = true
-util.workspace = true
-uuid.workspace = true
-watch.workspace = true

crates/eval/README.md 🔗

@@ -1,27 +0,0 @@
-# Eval
-
-This eval assumes the working directory is the root of the repository. Run it with:
-
-```sh
-cargo run -p eval
-```
-
-The eval will optionally read a `.env` file in `crates/eval` if you need it to set environment variables, such as API keys.
-
-## Explorer Tool
-
-The explorer tool generates a self-contained HTML view from one or more thread
-JSON file. It provides a visual interface to explore the agent thread, including
-tool calls and results. See [./docs/explorer.md](./docs/explorer.md) for more details.
-
-### Usage
-
-```sh
-cargo run -p eval --bin explorer -- --input <path-to-json-files> --output <output-html-path>
-```
-
-Example:
-
-```sh
-cargo run -p eval --bin explorer -- --input ./runs/2025-04-23_15-53-30/fastmcp_bugifx/*/last.messages.json --output /tmp/explorer.html
-```

crates/eval/build.rs 🔗

@@ -1,14 +0,0 @@
-fn main() {
-    let cargo_toml =
-        std::fs::read_to_string("../zed/Cargo.toml").expect("Failed to read crates/zed/Cargo.toml");
-    let version = cargo_toml
-        .lines()
-        .find(|line| line.starts_with("version = "))
-        .expect("Version not found in crates/zed/Cargo.toml")
-        .split('=')
-        .nth(1)
-        .expect("Invalid version format")
-        .trim()
-        .trim_matches('"');
-    println!("cargo:rustc-env=ZED_PKG_VERSION={}", version);
-}

crates/eval/docs/explorer.md 🔗

@@ -1,27 +0,0 @@
-# Explorer
-
-Threads Explorer is a single self-contained HTML file that gives an overview of
-evaluation runs, while allowing for some interactivity.
-
-When you open a file, it gives you a _thread overview_, which looks like this:
-
-| Turn | Text                                 | Tool                                         | Result                                        |
-| ---- | ------------------------------------ | -------------------------------------------- | --------------------------------------------- |
-| 1    | [User]:                              |                                              |                                               |
-|      | Fix the bug: kwargs not passed...    |                                              |                                               |
-| 2    | I'll help you fix that bug.          | **list_directory**(path="fastmcp")           | `fastmcp/src [...]`                           |
-|      |                                      |                                              |                                               |
-| 3    | Let's examine the code.              | **read_file**(path="fastmcp/main.py", [...]) | `def run_application(app, \*\*kwargs): [...]` |
-| 4    | I found the issue.                   | **edit_file**(path="fastmcp/core.py", [...]) | `Made edit to fastmcp/core.py`                |
-| 5    | Let's check if there are any errors. | **diagnostics**()                            | `No errors found`                             |
-
-### Implementation details
-
-`src/explorer.html` contains the template. You can open this template in a
-browser as is, and it will show some dummy values. But the main use is to set
-the `threadsData` variable with real data, which then will be used instead of
-the dummy values.
-
-`src/explorer.rs` takes one or more JSON files as generated by `cargo run -p
-eval`, and outputs an HTML file for rendering these threads. Refer dummy data
-in `explorer.html` for a sample format.

crates/eval/src/assertions.rs 🔗

@@ -1,170 +0,0 @@
-use serde::{Deserialize, Serialize};
-use std::fmt::Write;
-use std::fmt::{self};
-
-#[derive(Default, Debug, Serialize, Deserialize, Clone)]
-pub struct AssertionsReport {
-    pub ran: Vec<RanAssertion>,
-    pub max: Option<usize>,
-}
-
-#[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct RanAssertion {
-    pub id: String,
-    pub result: Result<RanAssertionResult, String>,
-}
-
-#[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct RanAssertionResult {
-    pub analysis: Option<String>,
-    pub passed: bool,
-}
-
-impl AssertionsReport {
-    pub fn new(max: Option<usize>) -> Self {
-        AssertionsReport {
-            ran: Vec::new(),
-            max,
-        }
-    }
-
-    pub fn error(msg: String) -> Self {
-        let assert = RanAssertion {
-            id: "no-unhandled-errors".into(),
-            result: Err(msg),
-        };
-        AssertionsReport {
-            ran: vec![assert],
-            max: Some(1),
-        }
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.ran.is_empty()
-    }
-
-    pub fn total_count(&self) -> usize {
-        self.run_count().max(self.max.unwrap_or(0))
-    }
-
-    pub fn run_count(&self) -> usize {
-        self.ran.len()
-    }
-
-    pub fn passed_count(&self) -> usize {
-        self.ran
-            .iter()
-            .filter(|a| a.result.as_ref().is_ok_and(|result| result.passed))
-            .count()
-    }
-
-    pub fn passed_percentage(&self) -> f32 {
-        if self.total_count() == 0 {
-            0.0
-        } else {
-            (self.passed_count() as f32 / self.total_count() as f32) * 100.0
-        }
-    }
-}
-
-const ROUND_WIDTH: usize = "Round".len();
-const ASSERTIONS_WIDTH: usize = 42;
-const RESULTS_WIDTH: usize = 8;
-
-pub fn print_table_header() {
-    println!(
-        "┌─{}─┬─{}─┬─{}─┐",
-        "─".repeat(ROUND_WIDTH),
-        "─".repeat(ASSERTIONS_WIDTH),
-        "─".repeat(RESULTS_WIDTH)
-    );
-
-    println!(
-        "│ {:^ROUND_WIDTH$} │ {:^ASSERTIONS_WIDTH$} │ {:^RESULTS_WIDTH$} │",
-        "Round", "Assertion", "Result"
-    );
-
-    println!(
-        "├─{}─┼─{}─┼─{}─┤",
-        "─".repeat(ROUND_WIDTH),
-        "─".repeat(ASSERTIONS_WIDTH),
-        "─".repeat(RESULTS_WIDTH)
-    )
-}
-
-pub fn display_error_row(f: &mut String, round: usize, error: String) -> fmt::Result {
-    let last_two_columns = ASSERTIONS_WIDTH + RESULTS_WIDTH;
-    writeln!(
-        f,
-        "│ {:^ROUND_WIDTH$} │ {:<last_two_columns$} |",
-        round,
-        truncate(&error, last_two_columns)
-    )
-}
-
-pub fn display_table_row(f: &mut String, round: usize, assertion: &RanAssertion) -> fmt::Result {
-    let result = match &assertion.result {
-        Ok(result) if result.passed => "\x1b[32m✔︎ Passed\x1b[0m",
-        Ok(_) => "\x1b[31m✗ Failed\x1b[0m",
-        Err(_) => "\x1b[31m💥 Judge Error\x1b[0m",
-    };
-
-    writeln!(
-        f,
-        "│ {:^ROUND_WIDTH$} │ {:<ASSERTIONS_WIDTH$} │ {:>RESULTS_WIDTH$} │",
-        round,
-        truncate(&assertion.id, ASSERTIONS_WIDTH),
-        result
-    )
-}
-
-pub fn print_table_round_summary<'a>(
-    round: &str,
-    reports: impl Iterator<Item = &'a AssertionsReport>,
-) {
-    let mut passed = 0;
-    let mut total = 0;
-    for report in reports {
-        passed += report.passed_count();
-        total += report.total_count();
-    }
-
-    println!(
-        "│ {:^ROUND_WIDTH$} │ {:<ASSERTIONS_WIDTH$} │ {:>RESULTS_WIDTH$} │",
-        round,
-        "total",
-        format!("{}%", (passed as f32 / total as f32 * 100.0).floor())
-    )
-}
-
-pub fn print_table_footer() {
-    println!(
-        "└─{}─┴─{}─┴─{}─┘",
-        "─".repeat(ROUND_WIDTH),
-        "─".repeat(ASSERTIONS_WIDTH),
-        "─".repeat(RESULTS_WIDTH)
-    )
-}
-
-pub fn print_table_divider() {
-    println!(
-        "├─{}─┼─{}─┼─{}─┤",
-        "─".repeat(ROUND_WIDTH),
-        "─".repeat(ASSERTIONS_WIDTH),
-        "─".repeat(RESULTS_WIDTH)
-    )
-}
-
-fn truncate(assertion: &str, max_width: usize) -> String {
-    let is_verbose = std::env::var("VERBOSE").is_ok_and(|v| !v.is_empty());
-
-    if assertion.len() <= max_width || is_verbose {
-        assertion.to_string()
-    } else {
-        let mut end_ix = max_width - 1;
-        while !assertion.is_char_boundary(end_ix) {
-            end_ix -= 1;
-        }
-        format!("{}…", &assertion[..end_ix])
-    }
-}

crates/eval/src/eval.rs 🔗

@@ -1,742 +0,0 @@
-mod assertions;
-mod example;
-mod examples;
-mod explorer;
-mod ids;
-mod instance;
-mod tool_metrics;
-
-use assertions::{AssertionsReport, display_error_row};
-use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git};
-use language_extension::LspAccess;
-pub(crate) use tool_metrics::*;
-
-use ::fs::RealFs;
-use clap::Parser;
-use client::{Client, ProxySettings, UserStore};
-use collections::{HashMap, HashSet};
-use extension::ExtensionHostProxy;
-use futures::future;
-use gpui::http_client::read_proxy_from_env;
-use gpui::{App, AppContext, AsyncApp, Entity, UpdateGlobal};
-use gpui_tokio::Tokio;
-use language::LanguageRegistry;
-use language_model::{ConfiguredModel, LanguageModel, LanguageModelRegistry, SelectedModel};
-use node_runtime::{NodeBinaryOptions, NodeRuntime};
-use project::project_settings::ProjectSettings;
-use prompt_store::PromptBuilder;
-use release_channel::{AppCommitSha, AppVersion};
-use reqwest_client::ReqwestClient;
-use settings::{Settings, SettingsStore};
-use std::cell::RefCell;
-use std::collections::VecDeque;
-use std::env;
-use std::path::{Path, PathBuf};
-use std::rc::Rc;
-use std::str::FromStr;
-use std::sync::{Arc, LazyLock};
-use util::ResultExt as _;
-
-static CARGO_MANIFEST_DIR: LazyLock<PathBuf> =
-    LazyLock::new(|| PathBuf::from(env!("CARGO_MANIFEST_DIR")));
-
-#[derive(Parser, Debug)]
-#[command(name = "eval", disable_version_flag = true)]
-struct Args {
-    /// Runs all examples and threads that contain these substrings. If unspecified, all examples and threads are run.
-    #[arg(value_name = "EXAMPLE_SUBSTRING")]
-    filter: Vec<String>,
-    /// provider/model to use for agent
-    #[arg(long, default_value = "anthropic/claude-3-7-sonnet-latest")]
-    model: String,
-    /// provider/model to use for judges
-    #[arg(long, default_value = "anthropic/claude-3-7-sonnet-latest")]
-    judge_model: String,
-    #[arg(long, value_delimiter = ',', default_value = "rs,ts,py")]
-    languages: Vec<String>,
-    /// How many times to run each example.
-    #[arg(long, default_value = "8")]
-    repetitions: usize,
-    /// Maximum number of examples to run concurrently.
-    #[arg(long, default_value = "4")]
-    concurrency: usize,
-    /// Output current environment variables as JSON to stdout
-    #[arg(long, hide = true)]
-    printenv: bool,
-}
-
-fn main() {
-    let args = Args::parse();
-
-    // This prevents errors showing up in the logs, because
-    // project::environment::load_shell_environment() calls
-    // std::env::current_exe().unwrap() --printenv
-    if args.printenv {
-        util::shell_env::print_env();
-        return;
-    }
-
-    dotenvy::from_filename(CARGO_MANIFEST_DIR.join(".env")).ok();
-
-    env_logger::init();
-
-    let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
-    let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
-    let session_id = uuid::Uuid::new_v4().to_string();
-    let run_timestamp = chrono::Local::now().format("%Y-%m-%d_%H-%M-%S");
-    let run_id = match env::var("GITHUB_RUN_ID") {
-        Ok(run_id) => format!("github/{}", run_id),
-        Err(_) => format!("local/{}", run_timestamp),
-    };
-
-    let root_dir = Path::new(std::env!("CARGO_MANIFEST_DIR"))
-        .parent()
-        .unwrap()
-        .parent()
-        .unwrap()
-        .canonicalize()
-        .unwrap();
-    let eval_crate_dir = root_dir.join("crates").join("eval");
-    let repos_dir = eval_crate_dir.join("repos");
-    let worktrees_dir = eval_crate_dir.join("worktrees");
-    let examples_dir = eval_crate_dir.join("src").join("examples");
-    let run_dir = eval_crate_dir
-        .join("runs")
-        .join(format!("{}", run_timestamp));
-    std::fs::create_dir_all(&run_dir).unwrap();
-    std::fs::create_dir_all(&repos_dir).unwrap();
-    std::fs::create_dir_all(&worktrees_dir).unwrap();
-    std::fs::create_dir_all(&examples_dir).unwrap();
-    std::fs::create_dir_all(&paths::config_dir()).unwrap();
-
-    let zed_commit_sha = commit_sha_for_path(&root_dir);
-    let zed_branch_name = git_branch_for_path(&root_dir);
-    let languages: HashSet<String> = args.languages.into_iter().collect();
-
-    let http_client = Arc::new(ReqwestClient::new());
-    let app = gpui_platform::headless().with_http_client(http_client);
-    let all_threads = examples::all(&examples_dir);
-
-    app.run(move |cx| {
-        let app_state = init(cx);
-
-        let telemetry = app_state.client.telemetry();
-        telemetry.start(system_id, installation_id, session_id, cx);
-
-        let enable_telemetry = env::var("ZED_EVAL_TELEMETRY").is_ok_and(|value| value == "1")
-            && telemetry.has_checksum_seed();
-        if enable_telemetry {
-            println!("Telemetry enabled");
-            telemetry::event!(
-                "Agent Eval Started",
-                zed_commit_sha = zed_commit_sha,
-                zed_branch_name = zed_branch_name,
-                run_id = run_id,
-            );
-        }
-
-        let mut cumulative_tool_metrics = ToolMetrics::default();
-
-        let tasks = LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
-            registry.providers().iter().map(|p| p.authenticate(cx)).collect::<Vec<_>>()
-        });
-
-        cx.spawn(async move |cx| {
-            future::join_all(tasks).await;
-            let judge_model = cx.update(|cx| {
-                let agent_model = load_model(&args.model, cx).unwrap();
-                let judge_model = load_model(&args.judge_model, cx).unwrap();
-                LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
-                    registry.set_default_model(Some(agent_model.clone()), cx);
-                });
-                judge_model
-            });
-
-            let mut examples = Vec::new();
-
-            const COLORS: [&str; 12] = [
-                "\x1b[31m", // Red
-                "\x1b[32m", // Green
-                "\x1b[33m", // Yellow
-                "\x1b[34m", // Blue
-                "\x1b[35m", // Magenta
-                "\x1b[36m", // Cyan
-                "\x1b[91m", // Bright Red
-                "\x1b[92m", // Bright Green
-                "\x1b[93m", // Bright Yellow
-                "\x1b[94m", // Bright Blue
-                "\x1b[95m", // Bright Magenta
-                "\x1b[96m", // Bright Cyan
-            ];
-
-            let mut skipped = Vec::new();
-
-            for thread in all_threads {
-                let meta = thread.meta();
-                if !args.filter.is_empty() && !args.filter.iter().any(|sub| meta.name.contains(sub))
-                {
-                    skipped.push(meta.name);
-                    continue;
-                }
-
-                if let Some(language) = meta.language_server
-                    && !languages.contains(&language.file_extension) {
-                        panic!(
-                            "Eval for {:?} could not be run because no language server was found for extension {:?}",
-                            meta.name,
-                            language.file_extension
-                        );
-                    }
-
-                // TODO: This creates a worktree per repetition. Ideally these examples should
-                // either be run sequentially on the same worktree, or reuse worktrees when there
-                // are more examples to run than the concurrency limit.
-                for repetition_number in 0..args.repetitions {
-                    let example_instance = ExampleInstance::new(
-                        thread.clone(),
-                        &repos_dir,
-                        &run_dir,
-                        &worktrees_dir,
-                        repetition_number,
-                    );
-
-                    examples.push(example_instance);
-                }
-            }
-
-            if !skipped.is_empty() {
-                println!("Skipped threads: {}", skipped.join(", "));
-            }
-
-            if examples.is_empty() {
-                eprintln!("Filter matched no examples");
-                cx.update(|cx| cx.quit());
-                return anyhow::Ok(());
-            }
-
-            let mut repo_urls = HashSet::default();
-            let mut clone_tasks = Vec::new();
-
-            let max_name_width = examples
-                .iter()
-                .map(|e| e.worktree_name().len())
-                .max()
-                .unwrap_or(0);
-
-            for (i, example_instance) in examples.iter_mut().enumerate() {
-                let color = COLORS[i % COLORS.len()].to_string();
-                example_instance.set_log_prefix_style(&color, max_name_width);
-
-                println!(
-                    "{}Logging to: {}",
-                    example_instance.log_prefix,
-                    example_instance.run_directory.display()
-                );
-
-                let repo_url = example_instance.repo_url();
-                if repo_urls.insert(repo_url.clone()) {
-                    let repo_path = example_instance.repo_path.clone();
-
-                    if !repo_path.join(".git").is_dir() {
-                        println!(
-                            "{:<width$} < {}",
-                            "↓ Cloning",
-                            repo_url,
-                            width = max_name_width
-                        );
-
-                        let git_task = cx.spawn(async move |_cx| {
-                            std::fs::create_dir_all(&repo_path)?;
-                            run_git(&repo_path, &["init"]).await?;
-                            run_git(&repo_path, &["remote", "add", "origin", &repo_url]).await
-                        });
-
-                        clone_tasks.push(git_task);
-                    } else {
-                        println!(
-                            "{:<width$}  < {}",
-                            "✔︎ Already cloned",
-                            repo_url,
-                            width = max_name_width
-                        );
-
-                        let actual_origin =
-                            run_git(&repo_path, &["remote", "get-url", "origin"]).await?;
-                        anyhow::ensure!(
-                            actual_origin == repo_url,
-                            "remote origin {actual_origin} does not match expected origin {repo_url}"
-                        );
-                    }
-                }
-            }
-
-            future::join_all(clone_tasks).await;
-
-            for example_instance in examples.iter_mut() {
-                example_instance.fetch().await?;
-            }
-
-            let examples = Rc::new(RefCell::new(VecDeque::from(examples)));
-            let results_by_example_name = Rc::new(RefCell::new(HashMap::default()));
-
-            future::join_all((0..args.concurrency).map(|_| {
-                let app_state = app_state.clone();
-                let judge_model = judge_model.model.clone();
-                let zed_commit_sha = zed_commit_sha.clone();
-                let zed_branch_name = zed_branch_name.clone();
-                let run_id = run_id.clone();
-                let examples = examples.clone();
-                let results = results_by_example_name.clone();
-                cx.spawn(async move |cx| {
-                    loop {
-                        let Some(mut example) = examples.borrow_mut().pop_front() else {
-                            break;
-                        };
-                        let result = async {
-                            example.setup().await?;
-                            let run_output = cx
-                                .update(|cx| example.run(app_state.clone(), cx))
-                                .await?;
-                            let judge_output = judge_example(
-                                example.clone(),
-                                judge_model.clone(),
-                                &zed_commit_sha,
-                                &zed_branch_name,
-                                &run_id,
-                                &run_output,
-                                enable_telemetry,
-                                cx,
-                            )
-                            .await;
-                            anyhow::Ok((run_output, judge_output))
-                        }
-                        .await;
-                        results
-                            .borrow_mut()
-                            .entry(example.name.clone())
-                            .or_insert(Vec::new())
-                            .push((example.clone(), result));
-                    }
-                })
-            }))
-            .await;
-
-            print_report(
-                &mut results_by_example_name.borrow_mut(),
-                &mut cumulative_tool_metrics,
-                &run_dir,
-            )?;
-
-            app_state.client.telemetry().flush_events().await;
-
-            cx.update(|cx| cx.quit());
-            anyhow::Ok(())
-        })
-        .detach_and_log_err(cx);
-    });
-}
-
-/// Subset of `workspace::AppState` needed by `HeadlessAssistant`, with additional fields.
-pub struct AgentAppState {
-    pub languages: Arc<LanguageRegistry>,
-    pub client: Arc<Client>,
-    pub user_store: Entity<UserStore>,
-    pub fs: Arc<dyn fs::Fs>,
-    pub node_runtime: NodeRuntime,
-
-    // Additional fields not present in `workspace::AppState`.
-    pub prompt_builder: Arc<PromptBuilder>,
-}
-
-/// Performs the global setup for the eval runner and returns the shared application state.
-///
-/// In order: initializes the release channel and Tokio integration, installs a
-/// `SettingsStore` built from the default settings, builds an HTTP client (using the proxy
-/// from settings, falling back to `read_proxy_from_env`), creates the client, filesystem,
-/// language registry and user store, sets up a Node runtime whose options are re-sent
-/// whenever the global `SettingsStore` changes, runs the subsystem `init` functions, and
-/// finally applies the bundled `runner_settings.json` as user settings.
-///
-/// # Panics
-///
-/// Panics if the HTTP client cannot be started, or if applying `runner_settings.json`
-/// returns an error.
-pub fn init(cx: &mut App) -> Arc<AgentAppState> {
-    let app_commit_sha = option_env!("ZED_COMMIT_SHA").map(|s| AppCommitSha::new(s.to_owned()));
-
-    let app_version = AppVersion::load(
-        env!("ZED_PKG_VERSION"),
-        option_env!("ZED_BUILD_ID"),
-        app_commit_sha,
-    );
-
-    release_channel::init(app_version.clone(), cx);
-    gpui_tokio::init(cx);
-
-    let settings_store = SettingsStore::new(cx, &settings::default_settings());
-    cx.set_global(settings_store);
-
-    // Set User-Agent so we can download language servers from GitHub
-    let user_agent = format!(
-        "Zed Agent Eval/{} ({}; {})",
-        app_version,
-        std::env::consts::OS,
-        std::env::consts::ARCH
-    );
-    // A proxy string from settings that fails to parse is treated like no setting at all,
-    // so the environment fallback is consulted in that case too.
-    let proxy_str = ProxySettings::get_global(cx).proxy.to_owned();
-    let proxy_url = proxy_str
-        .as_ref()
-        .and_then(|input| input.parse().ok())
-        .or_else(read_proxy_from_env);
-    let http = {
-        // The Tokio handle is entered only for the duration of the client construction.
-        let _guard = Tokio::handle(cx).enter();
-
-        ReqwestClient::proxy_and_user_agent(proxy_url, &user_agent)
-            .expect("could not start HTTP client")
-    };
-    cx.set_http_client(Arc::new(http));
-
-    // The global HTTP client is then replaced by the one exposed by the production client.
-    let client = Client::production(cx);
-    cx.set_http_client(client.http_client());
-
-    let git_binary_path = None;
-    let fs = Arc::new(RealFs::new(
-        git_binary_path,
-        cx.background_executor().clone(),
-    ));
-
-    let mut languages = LanguageRegistry::new(cx.background_executor().clone());
-    languages.set_language_server_download_dir(paths::languages_dir().clone());
-    let languages = Arc::new(languages);
-
-    let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
-
-    extension::init(cx);
-
-    // Node binary options are published over a watch channel each time settings change.
-    let (mut tx, rx) = watch::channel(None);
-    cx.observe_global::<SettingsStore>(move |cx| {
-        let settings = &ProjectSettings::get_global(cx).node;
-        let options = NodeBinaryOptions {
-            allow_path_lookup: !settings.ignore_system_version,
-            allow_binary_download: true,
-            use_paths: settings.path.as_ref().map(|node_path| {
-                let node_path = PathBuf::from(shellexpand::tilde(node_path).as_ref());
-                let npm_path = settings
-                    .npm_path
-                    .as_ref()
-                    .map(|path| PathBuf::from(shellexpand::tilde(&path).as_ref()));
-                (
-                    node_path.clone(),
-                    // Without an explicit npm path, use `npm` next to the node binary.
-                    npm_path.unwrap_or_else(|| {
-                        let base_path = PathBuf::new();
-                        node_path.parent().unwrap_or(&base_path).join("npm")
-                    }),
-                )
-            }),
-        };
-        tx.send(Some(options)).log_err();
-    })
-    .detach();
-    let node_runtime = NodeRuntime::new(client.http_client(), None, rx);
-
-    let extension_host_proxy = ExtensionHostProxy::global(cx);
-    debug_adapter_extension::init(extension_host_proxy.clone(), cx);
-    language_extension::init(LspAccess::Noop, extension_host_proxy, languages.clone());
-    language_model::init(user_store.clone(), client.clone(), cx);
-    language_models::init(user_store.clone(), client.clone(), cx);
-    languages::init(languages.clone(), fs.clone(), node_runtime.clone(), cx);
-    prompt_store::init(cx);
-    terminal_view::init(cx);
-    let stdout_is_a_pty = false;
-    let prompt_builder = PromptBuilder::load(fs.clone(), stdout_is_a_pty, cx);
-    agent_ui::init(
-        fs.clone(),
-        client.clone(),
-        prompt_builder.clone(),
-        languages.clone(),
-        true,
-        cx,
-    );
-
-    // The runner's own settings file is compiled into the binary and applied last.
-    SettingsStore::update_global(cx, |store, cx| {
-        store.set_user_settings(include_str!("../runner_settings.json"), cx)
-    })
-    .unwrap();
-
-    Arc::new(AgentAppState {
-        languages,
-        client,
-        user_store,
-        fs,
-        node_runtime,
-        prompt_builder,
-    })
-}
-
-/// Looks up the model named by `model_name` among the registry's available models.
-///
-/// `model_name` is parsed with `SelectedModel::from_str`; a model matches when both its id
-/// and its provider id equal the parsed selection.
-///
-/// # Errors
-///
-/// Returns an error if `model_name` cannot be parsed, or if no available model matches (the
-/// error message then lists every available `provider/model` pair).
-pub fn find_model(
-    model_name: &str,
-    model_registry: &LanguageModelRegistry,
-    cx: &App,
-) -> anyhow::Result<Arc<dyn LanguageModel>> {
-    let selected = match SelectedModel::from_str(model_name) {
-        Ok(selected) => selected,
-        Err(e) => return Err(anyhow::anyhow!(e)),
-    };
-
-    let found = model_registry.available_models(cx).find(|candidate| {
-        candidate.id() == selected.model && candidate.provider_id() == selected.provider
-    });
-    if let Some(model) = found {
-        return Ok(model);
-    }
-
-    // Nothing matched: list what is available so the caller can correct the name.
-    let available = model_registry
-        .available_models(cx)
-        .map(|candidate| format!("{}/{}", candidate.provider_id().0, candidate.id().0))
-        .collect::<Vec<_>>()
-        .join(", ");
-    Err(anyhow::anyhow!(
-        "No language model with ID {}/{} was available. Available models: {}",
-        selected.provider.0,
-        selected.model.0,
-        available
-    ))
-}
-
-/// Resolves `model_name` to a `ConfiguredModel` (the model together with its provider).
-///
-/// The global `LanguageModelRegistry` is read once and used both to find the model and to
-/// look up the provider the model reports via `provider_id()`.
-///
-/// # Errors
-///
-/// Returns an error if `find_model` fails, or if the registry has no provider for the
-/// model's provider id.
-pub fn load_model(model_name: &str, cx: &mut App) -> anyhow::Result<ConfiguredModel> {
-    let model_registry = LanguageModelRegistry::read_global(cx);
-    let model = find_model(model_name, model_registry, cx)?;
-    let provider = model_registry
-        .provider(&model.provider_id())
-        .ok_or_else(|| anyhow::anyhow!("Provider not found: {}", model.provider_id()))?;
-
-    // Both values are already owned here, so they are moved rather than cloned.
-    Ok(ConfiguredModel { provider, model })
-}
-
-/// Returns the output of `git rev-parse HEAD` run for `repo_path`, blocking until it finishes.
-///
-/// # Panics
-///
-/// Panics if the git invocation returns an error.
-pub fn commit_sha_for_path(repo_path: &Path) -> String {
-    let head = run_git(repo_path, &["rev-parse", "HEAD"]);
-    futures::executor::block_on(head).unwrap()
-}
-
-/// Returns the branch name to report for `repo_path`.
-///
-/// The `GITHUB_REF_NAME` environment variable wins when it is set. Otherwise the result of
-/// `git rev-parse --abbrev-ref HEAD` is used, and `"unknown"` if that command fails.
-pub fn git_branch_for_path(repo_path: &Path) -> String {
-    std::env::var("GITHUB_REF_NAME").unwrap_or_else(|_| {
-        let current_branch = run_git(repo_path, &["rev-parse", "--abbrev-ref", "HEAD"]);
-        futures::executor::block_on(current_branch).unwrap_or_else(|_| "unknown".to_string())
-    })
-}
-
-/// Judges a finished example run and optionally reports the outcome as a telemetry event.
-///
-/// Calls `example.judge` with the judge `model` and the `run_output`. When
-/// `enable_telemetry` is true, an "Agent Example Evaluated" event is emitted carrying the
-/// commit SHA, branch, run id, example name and repetition, both judge evaluations, tool
-/// metrics, token usage, model identity, repository URL/revision, and the diagnostics
-/// captured before and after the run. The judge output is returned in either case.
-async fn judge_example(
-    example: ExampleInstance,
-    model: Arc<dyn LanguageModel>,
-    zed_commit_sha: &str,
-    zed_branch_name: &str,
-    run_id: &str,
-    run_output: &RunOutput,
-    enable_telemetry: bool,
-    cx: &AsyncApp,
-) -> JudgeOutput {
-    let judge_output = example.judge(model.clone(), run_output, cx).await;
-
-    if enable_telemetry {
-        telemetry::event!(
-            "Agent Example Evaluated",
-            zed_commit_sha = zed_commit_sha,
-            zed_branch_name = zed_branch_name,
-            run_id = run_id,
-            example_name = example.name.clone(),
-            example_repetition = example.repetition,
-            diff_evaluation = judge_output.diff.clone(),
-            thread_evaluation = judge_output.thread,
-            tool_metrics = run_output.tool_metrics,
-            token_usage = run_output.token_usage,
-            model = model.telemetry_id(),
-            model_provider = model.provider_id().to_string(),
-            repository_url = example.repo_url(),
-            repository_revision = example.revision(),
-            diagnostic_summary_before = run_output.diagnostic_summary_before,
-            diagnostic_summary_after = run_output.diagnostic_summary_after,
-            diagnostics_before = run_output.diagnostics_before,
-            diagnostics_after = run_output.diagnostics_after,
-        );
-    }
-
-    judge_output
-}
-
-/// Width, in characters, of the header rules and of the field headers are centered in.
-const HEADER_WIDTH: usize = 65;
-
-/// Prints `header` centered between two rules of `=` characters, preceded by two blank
-/// lines and followed by one.
-fn print_h1(header: &str) {
-    let rule = "=".repeat(HEADER_WIDTH);
-    println!("\n\n{rule}");
-    println!("{header:^HEADER_WIDTH$}");
-    println!("{rule}\n");
-}
-
-/// Prints `header` centered between two rules of `-` characters, preceded and followed by
-/// one blank line.
-fn print_h2(header: &str) {
-    let rule = "-".repeat(HEADER_WIDTH);
-    println!("\n{rule}");
-    println!("{header:^HEADER_WIDTH$}");
-    println!("{rule}\n");
-}
-
-/// Prints the result tables for every example and, when more than one example ran, an
-/// aggregate summary; then tries to generate the `overview.html` explorer page in `run_dir`.
-///
-/// Each example's results are sorted in place by repetition. Tool metrics of successful
-/// runs are merged into `cumulative_tool_metrics`. A run that failed counts as a score of
-/// `0.0` in all three score lists.
-///
-/// # Errors
-///
-/// Propagates errors from writing table rows. A failure to generate the explorer HTML is
-/// only reported on stderr.
-///
-/// # Panics
-///
-/// Panics if the current working directory cannot be determined while the JSON path list
-/// is being built.
-fn print_report(
-    results_by_example_name: &mut HashMap<
-        String,
-        Vec<(ExampleInstance, anyhow::Result<(RunOutput, JudgeOutput)>)>,
-    >,
-    cumulative_tool_metrics: &mut ToolMetrics,
-    run_dir: &Path,
-) -> anyhow::Result<()> {
-    print_h1("EVAL RESULTS");
-
-    // Scores collected across all examples, used only for the aggregate section.
-    let mut diff_scores = Vec::new();
-    let mut thread_scores = Vec::new();
-    let mut programmatic_scores = Vec::new();
-    let mut error_count = 0;
-
-    for (example_name, results) in results_by_example_name.iter_mut() {
-        print_h2(example_name);
-
-        results.sort_unstable_by_key(|(example, _)| example.repetition);
-        let mut example_cumulative_tool_metrics = ToolMetrics::default();
-
-        // Rows are buffered so the table is printed only if there is something to show.
-        let mut table_rows = String::new();
-
-        for (example, result) in results.iter() {
-            match result {
-                Err(err) => {
-                    display_error_row(&mut table_rows, example.repetition, err.to_string())?;
-                    error_count += 1;
-                    programmatic_scores.push(0.0);
-                    diff_scores.push(0.0);
-                    thread_scores.push(0.0);
-                }
-                Ok((run_output, judge_output)) => {
-                    cumulative_tool_metrics.merge(&run_output.tool_metrics);
-                    example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
-
-                    if run_output.programmatic_assertions.total_count() > 0 {
-                        for assertion in &run_output.programmatic_assertions.ran {
-                            assertions::display_table_row(
-                                &mut table_rows,
-                                example.repetition,
-                                assertion,
-                            )?;
-                        }
-
-                        programmatic_scores
-                            .push(run_output.programmatic_assertions.passed_percentage())
-                    }
-
-                    if !judge_output.diff.is_empty() {
-                        diff_scores.push(judge_output.diff.passed_percentage());
-
-                        for assertion in &judge_output.diff.ran {
-                            assertions::display_table_row(
-                                &mut table_rows,
-                                example.repetition,
-                                assertion,
-                            )?;
-                        }
-                    }
-
-                    if !judge_output.thread.is_empty() {
-                        thread_scores.push(judge_output.thread.passed_percentage());
-
-                        for assertion in &judge_output.thread.ran {
-                            assertions::display_table_row(
-                                &mut table_rows,
-                                example.repetition,
-                                assertion,
-                            )?;
-                        }
-                    }
-                }
-            }
-        }
-
-        let mut all_asserts = Vec::new();
-
-        if !table_rows.is_empty() {
-            assertions::print_table_header();
-            print!("{}", table_rows);
-
-            assertions::print_table_divider();
-
-            // One summary line per repetition, followed by an "avg" line over all of them.
-            for (example, result) in results.iter() {
-                if let Ok((run_output, judge_output)) = result {
-                    let asserts = [
-                        run_output.programmatic_assertions.clone(),
-                        judge_output.diff.clone(),
-                        judge_output.thread.clone(),
-                    ];
-                    all_asserts.extend_from_slice(&asserts);
-                    assertions::print_table_round_summary(
-                        &example.repetition.to_string(),
-                        asserts.iter(),
-                    )
-                } else if let Err(err) = result {
-                    let assert = AssertionsReport::error(err.to_string());
-                    all_asserts.push(assert.clone());
-                    assertions::print_table_round_summary(
-                        &example.repetition.to_string(),
-                        [assert].iter(),
-                    )
-                }
-            }
-
-            assertions::print_table_divider();
-
-            assertions::print_table_round_summary("avg", all_asserts.iter());
-
-            assertions::print_table_footer();
-        }
-
-        if !example_cumulative_tool_metrics.is_empty() {
-            println!("{}", &example_cumulative_tool_metrics);
-        }
-    }
-
-    // The aggregate section is skipped when only a single example was run.
-    if results_by_example_name.len() > 1 {
-        print_h1("AGGREGATE");
-
-        if error_count > 0 {
-            println!("\n{error_count} examples failed to run!");
-        }
-
-        // Averages are rounded down to a whole number before printing.
-        let programmatic_score_count = programmatic_scores.len();
-        if programmatic_score_count > 0 {
-            let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
-                / (programmatic_score_count as f32))
-                .floor();
-            println!("Average programmatic score: {average_programmatic_score}%");
-        }
-
-        let diff_score_count = diff_scores.len();
-        if diff_score_count > 0 {
-            let average_diff_score =
-                (diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
-            println!("Average diff score: {average_diff_score}%");
-        }
-
-        let thread_score_count = thread_scores.len();
-
-        if thread_score_count > 0 {
-            let average_thread_score =
-                (thread_scores.into_iter().sum::<f32>() / (thread_score_count as f32)).floor();
-            println!("Average thread score: {average_thread_score}%");
-        }
-
-        println!();
-
-        print_h2("CUMULATIVE TOOL METRICS");
-        println!("{}", cumulative_tool_metrics);
-    }
-
-    let explorer_output_path = run_dir.join("overview.html");
-    // Paths are made relative to the current directory when possible, absolute otherwise.
-    let mut json_paths: Vec<PathBuf> = results_by_example_name
-        .values()
-        .flat_map(|results| {
-            results.iter().map(|(example, _)| {
-                let absolute_path = run_dir.join(example.run_directory.join("last.messages.json"));
-                let cwd = std::env::current_dir().expect("Can't get current dir");
-                pathdiff::diff_paths(&absolute_path, cwd).unwrap_or_else(|| absolute_path.clone())
-            })
-        })
-        .collect::<Vec<_>>();
-    json_paths.sort();
-    if let Err(err) = explorer::generate_explorer_html(&json_paths, &explorer_output_path) {
-        eprintln!("Failed to generate explorer HTML: {}", err);
-    }
-
-    Ok(())
-}

crates/eval/src/example.rs 🔗

@@ -1,561 +0,0 @@
-use std::{
-    error::Error,
-    fmt::{self, Debug},
-    sync::{Arc, Mutex},
-    time::Duration,
-    u32,
-};
-
-use crate::{
-    ToolMetrics,
-    assertions::{AssertionsReport, RanAssertion, RanAssertionResult},
-};
-use acp_thread::UserMessageId;
-use agent::{Thread, ThreadEvent, UserMessageContent};
-use agent_client_protocol as acp;
-use agent_settings::AgentProfileId;
-use anyhow::{Result, anyhow};
-use async_trait::async_trait;
-use buffer_diff::DiffHunkStatus;
-use collections::HashMap;
-use futures::{FutureExt as _, StreamExt, select_biased};
-use gpui::{App, AppContext, AsyncApp, Entity};
-use language_model::Role;
-use util::rel_path::RelPath;
-
-/// Two-minute limit; `ExampleContext::run_turns` races its event-collection task against a
-/// timer of this length.
-pub const THREAD_EVENT_TIMEOUT: Duration = Duration::from_secs(60 * 2);
-
-/// An eval scenario: metadata, a scripted conversation, and optional judge assertions.
-#[async_trait(?Send)]
-pub trait Example {
-    /// Returns the metadata describing this example.
-    fn meta(&self) -> ExampleMetadata;
-    /// Drives the example through the given `ExampleContext`.
-    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()>;
-    /// Judge assertions about the resulting diff; none by default.
-    fn diff_assertions(&self) -> Vec<JudgeAssertion> {
-        Vec::new()
-    }
-    /// Judge assertions about the conversation thread; none by default.
-    fn thread_assertions(&self) -> Vec<JudgeAssertion> {
-        Vec::new()
-    }
-}
-
-/// An assertion returned by `Example::diff_assertions` / `Example::thread_assertions`.
-#[derive(Clone, Debug)]
-pub struct JudgeAssertion {
-    /// Identifier of the assertion.
-    pub id: String,
-    /// Text describing what the assertion checks.
-    pub description: String,
-}
-
-/// Static description of an example, as returned by `Example::meta`.
-#[derive(Clone, Debug)]
-pub struct ExampleMetadata {
-    /// Name of the example.
-    pub name: String,
-    /// Repository URL; `repo_name` derives the repository name from its last path segment.
-    pub url: String,
-    /// Revision of the repository the example targets.
-    pub revision: String,
-    /// Optional language-server configuration for the example.
-    pub language_server: Option<LanguageServer>,
-    /// Optional assertion limit, passed to `AssertionsReport::new` and checked by
-    /// `ExampleContext::log_assertion`.
-    pub max_assertions: Option<usize>,
-    /// Agent profile id for the example.
-    pub profile_id: AgentProfileId,
-    /// NOTE(review): presumably JSON of a thread to resume from; confirm against the runner.
-    pub existing_thread_json: Option<String>,
-    /// NOTE(review): presumably a turn limit applied by the runner; not read in this file.
-    pub max_turns: Option<u32>,
-}
-
-/// Language-server settings carried by `ExampleMetadata::language_server`.
-#[derive(Clone, Debug)]
-pub struct LanguageServer {
-    /// File extension associated with the language server.
-    pub file_extension: String,
-    /// NOTE(review): presumably whether diagnostics present before the run are tolerated;
-    /// confirm against the code that consumes this flag.
-    pub allow_preexisting_diagnostics: bool,
-}
-
-impl ExampleMetadata {
-    /// Returns the last `/`-separated segment of `url` with any trailing `.git` removed.
-    pub fn repo_name(&self) -> String {
-        let last_segment = self.url.rsplit('/').next().unwrap_or("");
-        last_segment.trim_end_matches(".git").to_string()
-    }
-}
-
-/// Error carrying the message of an assertion that did not hold.
-pub struct FailedAssertion(pub String);
-
-impl Error for FailedAssertion {}
-
-impl fmt::Display for FailedAssertion {
-    /// Writes the bare assertion message.
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.write_str(&self.0)
-    }
-}
-
-impl fmt::Debug for FailedAssertion {
-    /// Writes the message prefixed with `Assertion failure: `.
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.write_str("Assertion failure: ")?;
-        f.write_str(&self.0)
-    }
-}
-
-/// State handed to `Example::conversation`: the agent thread under test plus the
-/// assertions and tool metrics recorded while driving it.
-pub struct ExampleContext {
-    /// Metadata of the example being run; its `max_assertions` bounds `log_assertion`.
-    meta: ExampleMetadata,
-    /// Prefix prepended to every line this context prints.
-    log_prefix: String,
-    /// The agent thread that prompts are sent to.
-    agent_thread: Entity<agent::Thread>,
-    /// Async app handle used to update and read entities and to spawn background work.
-    app: AsyncApp,
-    /// Assertions recorded so far through `assert`, `assert_some` and `assert_eq`.
-    pub assertions: AssertionsReport,
-    /// Per-tool success/failure records, filled in by `run_turns`.
-    pub tool_metrics: Arc<Mutex<ToolMetrics>>,
-}
-
-impl ExampleContext {
-    /// Creates a context for one example run, with an empty assertions report (limited by
-    /// `meta.max_assertions`) and default tool metrics.
-    pub fn new(
-        meta: ExampleMetadata,
-        log_prefix: String,
-        agent_thread: Entity<Thread>,
-        app: AsyncApp,
-    ) -> Self {
-        Self {
-            // Built before `meta` is moved into the struct below.
-            assertions: AssertionsReport::new(meta.max_assertions),
-            tool_metrics: Arc::new(Mutex::new(ToolMetrics::default())),
-            meta,
-            log_prefix,
-            agent_thread,
-            app,
-        }
-    }
-
-    /// Records an assertion that passes when `expected` is true.
-    ///
-    /// On failure the returned error is a `FailedAssertion` carrying `message`.
-    pub fn assert(&mut self, expected: bool, message: impl ToString) -> Result<()> {
-        let message = message.to_string();
-        let outcome = match expected {
-            true => Ok(()),
-            false => Err(anyhow::Error::from(FailedAssertion(message.clone()))),
-        };
-        self.log_assertion(outcome, message)
-    }
-
-    /// Records an assertion that passes when `option` is `Some`, returning the inner value.
-    ///
-    /// On `None` the returned error is a `FailedAssertion` carrying `message`.
-    pub fn assert_some<T>(&mut self, option: Option<T>, message: impl ToString) -> Result<T> {
-        let message = message.to_string();
-        let outcome =
-            option.ok_or_else(|| anyhow::Error::from(FailedAssertion(message.clone())));
-        self.log_assertion(outcome, message)
-    }
-
-    /// Records an assertion that passes when `left == right`.
-    ///
-    /// On mismatch a `pretty_assertions` comparison of the two values is printed (with the
-    /// log prefix) and the returned error is a `FailedAssertion` carrying `message`.
-    #[allow(dead_code)]
-    pub fn assert_eq<T: PartialEq + Debug>(
-        &mut self,
-        left: T,
-        right: T,
-        message: impl ToString,
-    ) -> Result<()> {
-        let message = message.to_string();
-        let outcome = match left == right {
-            true => Ok(()),
-            false => {
-                let comparison = pretty_assertions::Comparison::new(&left, &right);
-                println!("{}{}", self.log_prefix, comparison);
-                Err(anyhow::Error::from(FailedAssertion(message.clone())))
-            }
-        };
-        self.log_assertion(outcome, message)
-    }
-
-    /// Records the outcome of one assertion, prints a pass/fail line, and hands `result`
-    /// back to the caller unchanged.
-    ///
-    /// The assertion is stored under `message` as its id, with `passed` set from
-    /// `result.is_ok()`.
-    ///
-    /// # Errors
-    ///
-    /// Fails before recording anything when `max_assertions` is set and the number of
-    /// assertions already run is greater than it; otherwise returns `result` as given.
-    fn log_assertion<T>(&mut self, result: Result<T>, message: String) -> Result<T> {
-        if let Some(max) = self.meta.max_assertions {
-            // NOTE(review): this runs before the new assertion is recorded, so with `<=`
-            // one assertion beyond `max` is still accepted — confirm this is intended.
-            anyhow::ensure!(
-                self.assertions.run_count() <= max,
-                "More assertions were run than the stated max_assertions of {max}"
-            );
-        }
-
-        self.assertions.ran.push(RanAssertion {
-            id: message.clone(),
-            result: Ok(RanAssertionResult {
-                analysis: None,
-                passed: result.is_ok(),
-            }),
-        });
-
-        if result.is_ok() {
-            println!("{}✅ {}", self.log_prefix, message);
-        } else {
-            println!("{}❌ {}", self.log_prefix, message);
-        }
-
-        result
-    }
-
-    /// Sends `prompt` as a user message and collects the response, passing `u32::MAX` as
-    /// the turn limit.
-    pub async fn prompt(&mut self, prompt: impl Into<String>) -> Result<Response> {
-        let unlimited_turns = u32::MAX;
-        self.prompt_with_max_turns(prompt, unlimited_turns).await
-    }
-
-    /// Sends `prompt` as a single text user message and collects the response, stopping
-    /// once `max_turns` tool calls have finished.
-    pub async fn prompt_with_max_turns(
-        &mut self,
-        prompt: impl Into<String>,
-        max_turns: u32,
-    ) -> Result<Response> {
-        let text: String = prompt.into();
-        let content = Some(vec![UserMessageContent::Text(text)]);
-        self.run_turns(content, max_turns).await
-    }
-
-    /// Lets the thread proceed without a new user message, stopping once `max_turns` tool
-    /// calls have finished.
-    pub async fn proceed_with_max_turns(&mut self, max_turns: u32) -> Result<Response> {
-        let no_new_message = None;
-        self.run_turns(no_new_message, max_turns).await
-    }
-
-    /// Starts (or resumes) the agent thread and collects its events into a `Response`.
-    ///
-    /// With `Some(prompt)` the content is sent as a new user message; with `None` the thread
-    /// is told to proceed. Events are consumed on a background task that builds the message
-    /// list, tracks in-flight tool calls by id, records each finished tool call in
-    /// `tool_metrics`, and prints a line per finished tool call.
-    ///
-    /// Collection stops when the event stream ends or when `max_turns` tool calls have
-    /// reached `Completed` or `Failed`. The countdown is decremented without a guard, so
-    /// `max_turns` is expected to be at least 1.
-    ///
-    /// # Errors
-    ///
-    /// Returns an error if the thread cannot be started, if an event is an error, if the
-    /// thread stops for any reason other than `EndTurn`, or if the collecting task has not
-    /// finished within `THREAD_EVENT_TIMEOUT`. The timer is created once per call, so it
-    /// bounds the whole call rather than the gap between two events.
-    ///
-    /// # Panics
-    ///
-    /// The background task panics on a tool call without a tool name in its meta, on a
-    /// tool call that arrives already finished, on a finished update for an unknown tool
-    /// call id, and on any tool authorization request.
-    async fn run_turns(
-        &mut self,
-        prompt: Option<Vec<UserMessageContent>>,
-        max_turns: u32,
-    ) -> Result<Response> {
-        let tool_metrics = self.tool_metrics.clone();
-        let log_prefix = self.log_prefix.clone();
-
-        let mut remaining_turns = max_turns;
-
-        let mut event_stream = self.agent_thread.update(&mut self.app, |thread, cx| {
-            if let Some(prompt) = prompt {
-                let id = UserMessageId::new();
-                thread.send(id, prompt, cx)
-            } else {
-                thread.proceed(cx)
-            }
-        })?;
-
-        let task = self.app.background_spawn(async move {
-            let mut messages = Vec::new();
-            // Tool calls that have started but not yet reported a final status.
-            let mut tool_uses_by_id = HashMap::default();
-            while let Some(event) = event_stream.next().await {
-                match event? {
-                    ThreadEvent::UserMessage(user_message) => {
-                        messages.push(Message {
-                            role: Role::User,
-                            text: user_message.to_markdown(),
-                            tool_use: Vec::new(),
-                        });
-                    }
-                    ThreadEvent::AgentThinking(text) | ThreadEvent::AgentText(text) => {
-                        // Consecutive assistant output is appended to the same message.
-                        if matches!(
-                            messages.last(),
-                            Some(Message {
-                                role: Role::Assistant,
-                                ..
-                            })
-                        ) {
-                            messages.last_mut().unwrap().text.push_str(&text);
-                        } else {
-                            messages.push(Message {
-                                role: Role::Assistant,
-                                text,
-                                tool_use: Vec::new(),
-                            });
-                        }
-                    }
-                    ThreadEvent::ToolCall(tool_call) => {
-                        let meta = tool_call.meta.expect("Missing meta field in tool_call");
-                        let tool_name = meta
-                            .get(acp_thread::TOOL_NAME_META_KEY)
-                            .expect("Missing tool_name field in meta")
-                            .as_str()
-                            .expect("Unknown tool_name content in meta");
-
-                        tool_uses_by_id.insert(
-                            tool_call.tool_call_id,
-                            ToolUse {
-                                name: tool_name.to_string(),
-                                value: tool_call.raw_input.unwrap_or_default(),
-                            },
-                        );
-                        if matches!(
-                            tool_call.status,
-                            acp::ToolCallStatus::Completed | acp::ToolCallStatus::Failed
-                        ) {
-                            panic!("Tool call completed without update");
-                        }
-                    }
-                    ThreadEvent::ToolCallUpdate(tool_call_update) => {
-                        if let acp_thread::ToolCallUpdate::UpdateFields(update) = tool_call_update {
-                            // A later update may replace the input recorded at call time.
-                            if let Some(raw_input) = update.fields.raw_input {
-                                if let Some(tool_use) =
-                                    tool_uses_by_id.get_mut(&update.tool_call_id)
-                                {
-                                    tool_use.value = raw_input;
-                                }
-                            }
-
-                            if matches!(
-                                update.fields.status,
-                                Some(acp::ToolCallStatus::Completed | acp::ToolCallStatus::Failed)
-                            ) {
-                                let succeeded =
-                                    update.fields.status == Some(acp::ToolCallStatus::Completed);
-
-                                let tool_use = tool_uses_by_id
-                                    .remove(&update.tool_call_id)
-                                    .expect("Unrecognized tool call completed");
-
-                                let log_message = if succeeded {
-                                    format!("✔︎ {}", tool_use.name)
-                                } else {
-                                    format!("✖︎ {}", tool_use.name)
-                                };
-                                println!("{log_prefix}{log_message}");
-
-                                tool_metrics
-                                    .lock()
-                                    .unwrap()
-                                    .insert(tool_use.name.clone().into(), succeeded);
-
-                                // The finished tool call is attached to the latest message,
-                                // or to a new empty assistant message if none exists yet.
-                                if let Some(message) = messages.last_mut() {
-                                    message.tool_use.push(tool_use);
-                                } else {
-                                    messages.push(Message {
-                                        role: Role::Assistant,
-                                        text: "".to_string(),
-                                        tool_use: vec![tool_use],
-                                    });
-                                }
-
-                                // Each finished tool call consumes one turn of the budget.
-                                remaining_turns -= 1;
-                                if remaining_turns == 0 {
-                                    return Ok(messages);
-                                }
-                            }
-                        }
-                    }
-                    ThreadEvent::ToolCallAuthorization(_) => panic!(
-                        "{}Bug: Tool confirmation should not be required in eval",
-                        log_prefix
-                    ),
-                    ThreadEvent::Plan(plan) => {
-                        println!("{log_prefix} Got plan: {plan:?}");
-                    }
-                    ThreadEvent::SubagentSpawned(session) => {
-                        println!("{log_prefix} Got subagent spawn: {session:?}");
-                    }
-                    ThreadEvent::Retry(status) => {
-                        println!("{log_prefix} Got retry: {status:?}");
-                    }
-                    ThreadEvent::Stop(stop_reason) => match stop_reason {
-                        acp::StopReason::EndTurn => {}
-                        acp::StopReason::MaxTokens => {
-                            return Err(anyhow!("Exceeded maximum tokens"));
-                        }
-                        acp::StopReason::MaxTurnRequests => {
-                            return Err(anyhow!("Exceeded maximum turn requests"));
-                        }
-                        stop_reason => return Err(anyhow!("{stop_reason:?}")),
-                    },
-                }
-            }
-            Ok(messages)
-        });
-
-        // `select_biased!` polls the task first, so a finished task wins over the timer.
-        select_biased! {
-            result = task.fuse() => {
-                Ok(Response::new(result?))
-            }
-            _ = self.app.background_executor().timer(THREAD_EVENT_TIMEOUT).fuse() => {
-                anyhow::bail!("Agentic loop stalled - waited {THREAD_EVENT_TIMEOUT:?} without any events");
-            }
-        }
-    }
-
-    /// Returns the edits recorded in the thread's action log, keyed by file path.
-    ///
-    /// For every changed buffer, each diff hunk is captured as a `FileEditHunk` holding the
-    /// base text of the hunk, the buffer's current text for the hunk's range, and the
-    /// hunk's status.
-    ///
-    /// # Panics
-    ///
-    /// Panics if a changed buffer has no associated file.
-    pub fn edits(&self) -> HashMap<Arc<RelPath>, FileEdits> {
-        self.agent_thread.read_with(&self.app, |thread, cx| {
-            let changed_buffers = thread.action_log().read(cx).changed_buffers(cx);
-
-            let mut edits_by_path = HashMap::default();
-            for (buffer, diff) in changed_buffers {
-                let snapshot = buffer.read(cx).snapshot();
-                let file = snapshot.file().unwrap();
-                let base_text = diff.read(cx).base_text(cx).text();
-
-                let mut hunks = Vec::new();
-                for hunk in diff.read(cx).snapshot(cx).hunks(&snapshot) {
-                    let current_text: String =
-                        snapshot.text_for_range(hunk.range.clone()).collect();
-                    hunks.push(FileEditHunk {
-                        base_text: base_text[hunk.diff_base_byte_range.clone()].to_string(),
-                        text: current_text,
-                        status: hunk.status(),
-                    });
-                }
-
-                edits_by_path.insert(file.path().clone(), FileEdits { hunks });
-            }
-            edits_by_path
-        })
-    }
-
-    /// Returns another handle to the agent thread entity.
-    pub fn agent_thread(&self) -> Entity<Thread> {
-        Entity::clone(&self.agent_thread)
-    }
-}
-
-/// Lets an `ExampleContext` be used wherever an `AppContext` is expected; every method
-/// forwards to the wrapped `AsyncApp`.
-impl AppContext for ExampleContext {
-    fn new<T: 'static>(
-        &mut self,
-        build_entity: impl FnOnce(&mut gpui::Context<T>) -> T,
-    ) -> Entity<T> {
-        self.app.new(build_entity)
-    }
-
-    fn reserve_entity<T: 'static>(&mut self) -> gpui::Reservation<T> {
-        self.app.reserve_entity()
-    }
-
-    fn insert_entity<T: 'static>(
-        &mut self,
-        reservation: gpui::Reservation<T>,
-        build_entity: impl FnOnce(&mut gpui::Context<T>) -> T,
-    ) -> Entity<T> {
-        self.app.insert_entity(reservation, build_entity)
-    }
-
-    fn update_entity<T, R>(
-        &mut self,
-        handle: &Entity<T>,
-        update: impl FnOnce(&mut T, &mut gpui::Context<T>) -> R,
-    ) -> R
-    where
-        T: 'static,
-    {
-        self.app.update_entity(handle, update)
-    }
-
-    fn as_mut<'a, T>(&'a mut self, handle: &Entity<T>) -> gpui::GpuiBorrow<'a, T>
-    where
-        T: 'static,
-    {
-        self.app.as_mut(handle)
-    }
-
-    fn read_entity<T, R>(&self, handle: &Entity<T>, read: impl FnOnce(&T, &App) -> R) -> R
-    where
-        T: 'static,
-    {
-        self.app.read_entity(handle, read)
-    }
-
-    fn update_window<T, F>(&mut self, window: gpui::AnyWindowHandle, f: F) -> Result<T>
-    where
-        F: FnOnce(gpui::AnyView, &mut gpui::Window, &mut App) -> T,
-    {
-        self.app.update_window(window, f)
-    }
-
-    fn read_window<T, R>(
-        &self,
-        window: &gpui::WindowHandle<T>,
-        read: impl FnOnce(Entity<T>, &App) -> R,
-    ) -> Result<R>
-    where
-        T: 'static,
-    {
-        self.app.read_window(window, read)
-    }
-
-    fn background_spawn<R>(
-        &self,
-        future: impl std::future::Future<Output = R> + Send + 'static,
-    ) -> gpui::Task<R>
-    where
-        R: Send + 'static,
-    {
-        self.app.background_spawn(future)
-    }
-
-    fn read_global<G, R>(&self, callback: impl FnOnce(&G, &App) -> R) -> R
-    where
-        G: gpui::Global,
-    {
-        self.app.read_global(callback)
-    }
-}
-
-/// The messages collected by one `ExampleContext::run_turns` call.
-#[derive(Debug)]
-pub struct Response {
-    /// Messages in the order they were collected.
-    messages: Vec<Message>,
-}
-
-impl Response {
-    /// Wraps the collected `messages`.
-    pub fn new(messages: Vec<Message>) -> Self {
-        Response { messages }
-    }
-
-    /// Asserts, through `cx`, that a tool named `tool_name` was called, and returns the
-    /// tool use that `find_tool_call` selects.
-    pub fn expect_tool_call(
-        &self,
-        tool_name: &'static str,
-        cx: &mut ExampleContext,
-    ) -> Result<&ToolUse> {
-        let found = self.find_tool_call(tool_name);
-        let description = format!("called `{}`", tool_name);
-        cx.assert_some(found, description)
-    }
-
-    /// Searches messages from last to first and returns, from the first message that has
-    /// one, its earliest tool use named `tool_name`.
-    pub fn find_tool_call(&self, tool_name: &str) -> Option<&ToolUse> {
-        for message in self.messages.iter().rev() {
-            let hit = message
-                .tool_use
-                .iter()
-                .find(|tool_use| tool_use.name == tool_name);
-            if hit.is_some() {
-                return hit;
-            }
-        }
-        None
-    }
-
-    /// Iterates over every tool use of every message, in message order.
-    pub fn tool_calls(&self) -> impl Iterator<Item = &ToolUse> {
-        self.messages
-            .iter()
-            .flat_map(|message| message.tool_use.iter())
-    }
-
-    /// Iterates over a copy of each message's text, in message order.
-    pub fn texts(&self) -> impl Iterator<Item = String> {
-        self.messages.iter().map(|message| message.text.to_owned())
-    }
-}
-
-/// One message collected from the thread's event stream.
-#[derive(Debug)]
-pub struct Message {
-    /// Who produced the message.
-    role: Role,
-    /// Message text; consecutive assistant text and thinking events are concatenated here.
-    text: String,
-    /// Tool calls that finished while this was the latest message.
-    tool_use: Vec<ToolUse>,
-}
-
-/// A finished tool call: the tool's name and the raw JSON input it was called with.
-#[derive(Debug)]
-pub struct ToolUse {
-    /// Name of the tool, taken from the tool call's meta.
-    pub name: String,
-    /// Raw input of the call; deserialize it with `parse_input`.
-    value: serde_json::Value,
-}
-
-impl ToolUse {
-    /// Deserializes a copy of the raw input into `Input`.
-    ///
-    /// # Errors
-    ///
-    /// Returns the `serde_json` error, wrapped in `anyhow`, if the value does not match
-    /// `Input`.
-    pub fn parse_input<Input>(&self) -> Result<Input>
-    where
-        Input: for<'de> serde::Deserialize<'de>,
-    {
-        let raw_input = self.value.clone();
-        match serde_json::from_value::<Input>(raw_input) {
-            Ok(input) => Ok(input),
-            Err(err) => Err(anyhow!(err)),
-        }
-    }
-}
-
-/// The diff hunks recorded for one file, as produced by `ExampleContext::edits`.
-#[derive(Debug, Eq, PartialEq)]
-pub struct FileEdits {
-    /// Hunks in the order the diff snapshot yielded them.
-    pub hunks: Vec<FileEditHunk>,
-}
-
-/// One diff hunk captured by `ExampleContext::edits`.
-#[derive(Debug, Eq, PartialEq)]
-pub struct FileEditHunk {
-    /// Slice of the diff's base text covered by the hunk.
-    pub base_text: String,
-    /// Current buffer text for the hunk's range.
-    pub text: String,
-    /// Status reported by the hunk.
-    pub status: DiffHunkStatus,
-}
-
-impl FileEdits {
-    /// Returns true if some hunk with status `DiffHunkStatus::added_none()` and an empty
-    /// base text contains `line` in its current text.
-    pub fn has_added_line(&self, line: &str) -> bool {
-        for hunk in &self.hunks {
-            let is_pure_addition =
-                hunk.status == DiffHunkStatus::added_none() && hunk.base_text.is_empty();
-            if is_pure_addition && hunk.text.contains(line) {
-                return true;
-            }
-        }
-        false
-    }
-}

crates/eval/src/examples/add_arg_to_trait_method.rs 🔗

@@ -1,115 +0,0 @@
-use agent_settings::AgentProfileId;
-use anyhow::Result;
-use async_trait::async_trait;
-use util::rel_path::RelPath;
-
-use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion, LanguageServer};
-
-pub struct AddArgToTraitMethod;
-
-#[async_trait(?Send)]
-impl Example for AddArgToTraitMethod {
-    fn meta(&self) -> ExampleMetadata {
-        ExampleMetadata {
-            name: "add_arg_to_trait_method".to_string(),
-            url: "https://github.com/zed-industries/zed.git".to_string(),
-            revision: "f69aeb6311dde3c0b8979c293d019d66498d54f2".to_string(),
-            language_server: Some(LanguageServer {
-                file_extension: "rs".to_string(),
-                allow_preexisting_diagnostics: false,
-            }),
-            max_assertions: None,
-            profile_id: AgentProfileId::default(),
-            existing_thread_json: None,
-            max_turns: None,
-        }
-    }
-
-    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
-        const FILENAME: &str = "assistant_tool.rs";
-        let _ = cx.prompt(format!(
-            r#"
-            Add a `window: Option<gpui::AnyWindowHandle>` argument to the `Tool::run` trait method in {FILENAME},
-            and update all the implementations of the trait and call sites accordingly.
-            "#
-        )).await?;
-
-        // Adds ignored argument to all but `batch_tool`
-
-        let add_ignored_window_paths = &[
-            "code_action_tool",
-            "code_symbols_tool",
-            "contents_tool",
-            "copy_path_tool",
-            "create_directory_tool",
-            "create_file_tool",
-            "delete_path_tool",
-            "diagnostics_tool",
-            "edit_file_tool",
-            "fetch_tool",
-            "grep_tool",
-            "list_directory_tool",
-            "move_path_tool",
-            "now_tool",
-            "open_tool",
-            "path_search_tool",
-            "read_file_tool",
-            "rename_tool",
-            "symbol_info_tool",
-            "terminal_tool",
-            "web_search_tool",
-        ];
-
-        let edits = cx.edits();
-
-        for tool_name in add_ignored_window_paths {
-            let path_str = format!("crates/assistant_tools/src/{}.rs", tool_name);
-            let edits = edits.get(RelPath::unix(&path_str).unwrap());
-
-            let ignored = edits.is_some_and(|edits| {
-                edits.has_added_line("        _window: Option<gpui::AnyWindowHandle>,\n")
-            });
-            let uningored = edits.is_some_and(|edits| {
-                edits.has_added_line("        window: Option<gpui::AnyWindowHandle>,\n")
-            });
-
-            cx.assert(ignored || uningored, format!("Argument:   {}", tool_name))
-                .ok();
-
-            cx.assert(ignored, format!("`_` prefix: {}", tool_name))
-                .ok();
-        }
-
-        // Adds unignored argument to `batch_tool`
-
-        let batch_tool_edits =
-            edits.get(RelPath::unix("crates/assistant_tools/src/batch_tool.rs").unwrap());
-
-        cx.assert(
-            batch_tool_edits.is_some_and(|edits| {
-                edits.has_added_line("        window: Option<gpui::AnyWindowHandle>,\n")
-            }),
-            "Argument:   batch_tool",
-        )
-        .ok();
-
-        Ok(())
-    }
-
-    fn diff_assertions(&self) -> Vec<JudgeAssertion> {
-        vec![
-            JudgeAssertion {
-                id: "batch tool passes window to each".to_string(),
-                description:
-                    "batch_tool is modified to pass a clone of the window to each tool it calls."
-                        .to_string(),
-            },
-            JudgeAssertion {
-                id: "tool tests updated".to_string(),
-                description:
-                    "tool tests are updated to pass the new `window` argument (`None` is ok)."
-                        .to_string(),
-            },
-        ]
-    }
-}

crates/eval/src/examples/code_block_citations.rs 🔗

@@ -1,218 +0,0 @@
-use agent_settings::AgentProfileId;
-use anyhow::Result;
-use async_trait::async_trait;
-use markdown::PathWithRange;
-
-use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion, LanguageServer};
-
-pub struct CodeBlockCitations;
-
-const FENCE: &str = "```";
-
-#[async_trait(?Send)]
-impl Example for CodeBlockCitations {
-    fn meta(&self) -> ExampleMetadata {
-        ExampleMetadata {
-            name: "code_block_citations".to_string(),
-            url: "https://github.com/zed-industries/zed.git".to_string(),
-            revision: "f69aeb6311dde3c0b8979c293d019d66498d54f2".to_string(),
-            language_server: Some(LanguageServer {
-                file_extension: "rs".to_string(),
-                allow_preexisting_diagnostics: false,
-            }),
-            max_assertions: None,
-            profile_id: AgentProfileId::default(),
-            existing_thread_json: None,
-            max_turns: None,
-        }
-    }
-
-    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
-        const FILENAME: &str = "assistant_tool.rs";
-
-        // Verify that the messages all have the correct formatting.
-        let texts: Vec<String> = cx
-            .prompt(format!(
-                r#"
-                Show me the method bodies of all the methods of the `Tool` trait in {FILENAME}.
-
-                Please show each method in a separate code snippet.
-                "#
-            ))
-            .await?
-            .texts()
-            .collect();
-        let closing_fence = format!("\n{FENCE}");
-
-        for text in texts.iter() {
-            let mut text = text.as_str();
-
-            while let Some(index) = text.find(FENCE) {
-                // Advance text past the opening backticks.
-                text = &text[index + FENCE.len()..];
-
-                // Find the closing backticks.
-                let content_len = text.find(&closing_fence);
-
-                // Verify the citation format - e.g. ```path/to/foo.txt#L123-456
-                if let Some(citation_len) = text.find('\n') {
-                    let citation = &text[..citation_len];
-
-                    if let Ok(()) =
-                        cx.assert(citation.contains("/"), format!("Slash in {citation:?}",))
-                    {
-                        let path_range = PathWithRange::new(citation);
-                        let path = cx.agent_thread().update(cx, |thread, cx| {
-                            thread
-                                .project()
-                                .read(cx)
-                                .find_project_path(path_range.path.as_ref(), cx)
-                        });
-
-                        if let Ok(path) = cx.assert_some(path, format!("Valid path: {citation:?}"))
-                        {
-                            let buffer_text = {
-                                let buffer = cx
-                                    .agent_thread()
-                                    .update(cx, |thread, cx| {
-                                        thread
-                                            .project()
-                                            .update(cx, |project, cx| project.open_buffer(path, cx))
-                                    })
-                                    .await
-                                    .ok();
-
-                                let Ok(buffer_text) = cx.assert_some(
-                                    buffer.map(|buffer| {
-                                        buffer.read_with(cx, |buffer, _| buffer.text())
-                                    }),
-                                    "Reading buffer text succeeded",
-                                ) else {
-                                    continue;
-                                };
-                                buffer_text
-                            };
-
-                            if let Some(content_len) = content_len {
-                                // + 1 because there's a newline character after the citation.
-                                let start_index = citation.len() + 1;
-                                let end_index = content_len.saturating_sub(start_index);
-
-                                if cx
-                                    .assert(
-                                        start_index <= end_index,
-                                        "Code block had a valid citation",
-                                    )
-                                    .is_ok()
-                                {
-                                    let content = &text[start_index..end_index];
-
-                                    // deindent (trim the start of each line) because sometimes the model
-                                    // chooses to deindent its code snippets for the sake of readability,
-                                    // which in markdown is not only reasonable but usually desirable.
-                                    cx.assert(
-                                        deindent(&buffer_text)
-                                            .trim()
-                                            .contains(deindent(&content).trim()),
-                                        "Code block content was found in file",
-                                    )
-                                    .ok();
-
-                                    if let Some(range) = path_range.range {
-                                        let start_line_index = range.start.line.saturating_sub(1);
-                                        let line_count =
-                                            range.end.line.saturating_sub(start_line_index);
-                                        let mut snippet = buffer_text
-                                            .lines()
-                                            .skip(start_line_index as usize)
-                                            .take(line_count as usize)
-                                            .collect::<Vec<&str>>()
-                                            .join("\n");
-
-                                        if let Some(start_col) = range.start.col {
-                                            snippet = snippet[start_col as usize..].to_string();
-                                        }
-
-                                        if let Some(end_col) = range.end.col {
-                                            let last_line = snippet.lines().last().unwrap();
-                                            snippet = snippet[..snippet.len() - last_line.len()
-                                                + end_col as usize]
-                                                .to_string();
-                                        }
-
-                                        // deindent (trim the start of each line) because sometimes the model
-                                        // chooses to deindent its code snippets for the sake of readability,
-                                        // which in markdown is not only reasonable but usually desirable.
-                                        cx.assert_eq(
-                                            deindent(snippet.as_str()).trim(),
-                                            deindent(content).trim(),
-                                            format!(
-                                                "Code block was at {:?}-{:?}",
-                                                range.start, range.end
-                                            ),
-                                        )
-                                        .ok();
-                                    }
-                                }
-                            }
-                        }
-                    }
-                } else {
-                    cx.assert(
-                        false,
-                        format!("Opening {FENCE} did not have a newline anywhere after it."),
-                    )
-                    .ok();
-                }
-
-                if let Some(content_len) = content_len {
-                    // Advance past the closing backticks
-                    text = &text[content_len + FENCE.len()..];
-                } else {
-                    // There were no closing backticks associated with these opening backticks.
-                    cx.assert(
-                        false,
-                        "Code block opening had matching closing backticks.".to_string(),
-                    )
-                    .ok();
-
-                    // There are no more code blocks to parse, so we're done.
-                    break;
-                }
-            }
-        }
-
-        Ok(())
-    }
-
-    fn thread_assertions(&self) -> Vec<JudgeAssertion> {
-        vec![
-            JudgeAssertion {
-                id: "trait method bodies are shown".to_string(),
-                description:
-                    "All method bodies of the Tool trait are shown."
-                        .to_string(),
-            },
-            JudgeAssertion {
-                id: "code blocks used".to_string(),
-                description:
-                   "All code snippets are rendered inside markdown code blocks (as opposed to any other formatting besides code blocks)."
-                        .to_string(),
-            },
-            JudgeAssertion {
-              id: "code blocks use backticks".to_string(),
-              description:
-                  format!("All markdown code blocks use backtick fences ({FENCE}) rather than indentation.")
-            }
-        ]
-    }
-}
-
-fn deindent(as_str: impl AsRef<str>) -> String {
-    as_str
-        .as_ref()
-        .lines()
-        .map(|line| line.trim_start())
-        .collect::<Vec<&str>>()
-        .join("\n")
-}

crates/eval/src/examples/comment_translation.rs 🔗

@@ -1,60 +0,0 @@
-use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};
-use agent::{EditFileMode, EditFileToolInput};
-use agent_settings::AgentProfileId;
-use anyhow::Result;
-use async_trait::async_trait;
-
-pub struct CommentTranslation;
-
-#[async_trait(?Send)]
-impl Example for CommentTranslation {
-    fn meta(&self) -> ExampleMetadata {
-        ExampleMetadata {
-            name: "comment_translation".to_string(),
-            url: "https://github.com/servo/font-kit.git".to_string(),
-            revision: "504d084e29bce4f60614bc702e91af7f7d9e60ad".to_string(),
-            language_server: None,
-            max_assertions: Some(1),
-            profile_id: AgentProfileId::default(),
-            existing_thread_json: None,
-            max_turns: None,
-        }
-    }
-
-    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
-        let response = cx.prompt(
-            r#"
-                Edit the following files and translate all their comments to italian, in this exact order:
-
-                - font-kit/src/family.rs
-                - font-kit/src/canvas.rs
-                - font-kit/src/error.rs
-            "#
-        ).await?;
-
-        let mut create_or_overwrite_count = 0;
-        for tool_call in response.tool_calls() {
-            if tool_call.name == "edit_file" {
-                let input = tool_call.parse_input::<EditFileToolInput>()?;
-                if !matches!(input.mode, EditFileMode::Edit) {
-                    create_or_overwrite_count += 1;
-                }
-            }
-        }
-
-        cx.assert_eq(create_or_overwrite_count, 0, "no_creation_or_overwrite")?;
-
-        Ok(())
-    }
-
-    fn diff_assertions(&self) -> Vec<JudgeAssertion> {
-        vec![JudgeAssertion {
-            id: "comments_translated".to_string(),
-            description: concat!(
-                "- Only `family.rs`, `canvas.rs` and `error.rs` should have changed.\n",
-                "- Their doc comments should have been all translated to Italian."
-            )
-            .into(),
-        }]
-    }
-}

crates/eval/src/examples/file_change_notification.rs 🔗

@@ -1,74 +0,0 @@
-use agent_settings::AgentProfileId;
-use anyhow::Result;
-use async_trait::async_trait;
-
-use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};
-
-pub struct FileChangeNotificationExample;
-
-#[async_trait(?Send)]
-impl Example for FileChangeNotificationExample {
-    fn meta(&self) -> ExampleMetadata {
-        ExampleMetadata {
-            name: "file_change_notification".to_string(),
-            url: "https://github.com/octocat/hello-world".to_string(),
-            revision: "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d".to_string(),
-            language_server: None,
-            max_assertions: None,
-            profile_id: AgentProfileId::default(),
-            existing_thread_json: None,
-            max_turns: Some(3),
-        }
-    }
-
-    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
-        // Track README so that the model gets notified of its changes
-        let project_path = cx.agent_thread().read_with(cx, |thread, cx| {
-            thread
-                .project()
-                .read(cx)
-                .find_project_path("README", cx)
-                .expect("README file should exist in this repo")
-        });
-
-        let buffer = {
-            cx.agent_thread()
-                .update(cx, |thread, cx| {
-                    thread
-                        .project()
-                        .update(cx, |project, cx| project.open_buffer(project_path, cx))
-                })
-                .await?
-        };
-
-        cx.agent_thread().update(cx, |thread, cx| {
-            thread.action_log().update(cx, |action_log, cx| {
-                action_log.buffer_read(buffer.clone(), cx);
-            });
-        });
-
-        // Start conversation (specific message is not important)
-        cx.prompt_with_max_turns("Find all files in this repo", 1)
-            .await?;
-
-        // Edit the README buffer - the model should get a notification on next turn
-        buffer.update(cx, |buffer, cx| {
-            buffer.edit([(0..buffer.len(), "Surprise!")], None, cx);
-        });
-
-        // Run for some more turns.
-        // The model shouldn't thank us for letting it know about the file change.
-        cx.proceed_with_max_turns(3).await?;
-
-        Ok(())
-    }
-
-    fn thread_assertions(&self) -> Vec<JudgeAssertion> {
-        vec![JudgeAssertion {
-            id: "change-file-notification".into(),
-            description:
-                "Agent should not acknowledge or mention anything about files that have been changed"
-                    .into(),
-        }]
-    }
-}

crates/eval/src/examples/file_search.rs 🔗

@@ -1,55 +0,0 @@
-use agent::FindPathToolInput;
-use agent_settings::AgentProfileId;
-use anyhow::Result;
-use async_trait::async_trait;
-use regex::Regex;
-
-use crate::example::{Example, ExampleContext, ExampleMetadata};
-
-pub struct FileSearchExample;
-
-#[async_trait(?Send)]
-impl Example for FileSearchExample {
-    fn meta(&self) -> ExampleMetadata {
-        ExampleMetadata {
-            name: "file_search".to_string(),
-            url: "https://github.com/zed-industries/zed.git".to_string(),
-            revision: "03ecb88fe30794873f191ddb728f597935b3101c".to_string(),
-            language_server: None,
-            max_assertions: Some(3),
-            profile_id: AgentProfileId::default(),
-            existing_thread_json: None,
-            max_turns: None,
-        }
-    }
-
-    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
-        const FILENAME: &str = "find_replace_file_tool.rs";
-
-        let prompt = format!(
-            r#"
-        Look at the `{FILENAME}`. I want to implement a card for it. The card should implement the `Render` trait.
-
-        The card should show a diff. It should be a beautifully presented diff. The card "box" should look like what we show for
-        markdown codeblocks (look at `MarkdownElement`). I want to see a red background for lines that were deleted and a green
-        background for lines that were added. We should have a div per diff line.
-        "#
-        );
-
-        let response = cx.prompt_with_max_turns(prompt, 1).await?;
-        let tool_use = response.expect_tool_call("find_path", cx)?;
-        let input = tool_use.parse_input::<FindPathToolInput>()?;
-
-        let glob = input.glob;
-        cx.assert(glob.ends_with(FILENAME), "glob ends with file name")?;
-
-        let without_filename = glob.replace(FILENAME, "");
-        let matches = Regex::new("(\\*\\*|zed)/(\\*\\*?/)?")
-            .unwrap()
-            .is_match(&without_filename);
-
-        cx.assert(matches, "glob starts with `**` or project")?;
-
-        Ok(())
-    }
-}

crates/eval/src/examples/find_and_replace_diff_card.toml 🔗

@@ -1,43 +0,0 @@
-url = "https://github.com/zed-industries/zed.git"
-revision = "38fcadf9481d018543c65f36ac3bafeba190179b"
-language_extension = "rs"
-
-prompt = """
-Look at the `find_replace_file_tool.rs`. I want to implement a card for it.
-The card should implement the `Render` trait.
-
-The card should show a diff. It should be a beautifully presented diff.
-The card "box" should look like what we show for markdown codeblocks (look at `MarkdownElement`).
-I want to see a red background for lines that were deleted and a green background for lines
-that were added. We should have a div per diff line.
-"""
-
-[diff_assertions]
-
-modify_find_and_replace_tool = """
-The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct.
-The struct should contain an `output` field that is the same as the task we were returning before,
-and a new `card` field that contains a view for the card.
-"""
-
-card_implementation = """
-The card should be a view that displays a diff.
-Each line in the diff should be colored according to whether it was added, removed or unchanged.
-"""
-
-[thread_assertions]
-
-path_search = """
-The first tool call should be to path search including "find_replace_file_tool.rs" in the string.
-(*Not* grep, for example, or reading the file based on a guess at the path.)
-This is because we gave the model a filename and it needs to turn that into a real path.
-"""
-
-read_file_from_path_search = """
-After obtaining the correct path of "zed/crates/assistant_tools/src/find_replace_file_tool.rs", it should read the contents of that path.
-"""
-
-symbol_search = """
-When trying to find information about the Render trait, it should *not* begin with a path search, because it doesn't yet have any information
-on what path the Render trait might be in.
-"""

crates/eval/src/examples/grep_params_escapement.rs 🔗

@@ -1,59 +0,0 @@
-use agent::GrepToolInput;
-use agent_settings::AgentProfileId;
-use anyhow::Result;
-use async_trait::async_trait;
-
-use crate::example::{Example, ExampleContext, ExampleMetadata};
-
-pub struct GrepParamsEscapementExample;
-
-/*
-
-This eval checks that the model doesn't use HTML escapement for characters like `<` and
-`>` in tool parameters.
-
-                      original     +system_prompt change    +tool description
-  claude-opus-4        89%          92%                     97%+
-  claude-sonnet-4      100%
-  gpt-5-mini           100%
-  gemini-2.5-pro                    98%
-
-*/
-
-#[async_trait(?Send)]
-impl Example for GrepParamsEscapementExample {
-    fn meta(&self) -> ExampleMetadata {
-        ExampleMetadata {
-            name: "grep_params_escapement".to_string(),
-            url: "https://github.com/octocat/hello-world".to_string(),
-            revision: "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d".to_string(),
-            language_server: None,
-            max_assertions: Some(1),
-            profile_id: AgentProfileId::default(),
-            existing_thread_json: None,
-            max_turns: Some(2),
-        }
-    }
-
-    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
-        let response = cx
-            .prompt_with_max_turns("Search for files containing the characters `>` or `<`", 2)
-            .await?;
-        let grep_input = response
-            .find_tool_call("grep")
-            .and_then(|tool_use| tool_use.parse_input::<GrepToolInput>().ok());
-
-        cx.assert_some(grep_input.as_ref(), "`grep` tool should be called")?;
-
-        cx.assert(
-            !contains_html_entities(&grep_input.unwrap().regex),
-            "Tool parameters should not be escaped",
-        )
-    }
-}
-
-fn contains_html_entities(pattern: &str) -> bool {
-    regex::Regex::new(r"&[a-zA-Z]+;|&#[0-9]+;|&#x[0-9a-fA-F]+;")
-        .unwrap()
-        .is_match(pattern)
-}

crates/eval/src/examples/hallucinated_tool_calls.toml 🔗

@@ -1,13 +0,0 @@
-url = "https://github.com/jlowin/fastmcp"
-revision = "a2c1e14e5d83af1c32b76280ab368df199c4e860"
-language_extension = "py"
-
-prompt = "Write a LICENSE file just saying 'Apache 2.0' and nothing else"
-
-profile_name = "ask"
-
-[thread_assertions]
-
-no_edit_attempts = """The agent should not claim that it edited or created the file. It should not pretend making any changes."""
-
-mention_insufficient_tools = """Agent should mention that it doesn't have relevant tools needed to make the change."""

crates/eval/src/examples/mod.rs 🔗

@@ -1,173 +0,0 @@
-use agent_settings::AgentProfileId;
-use anyhow::Result;
-use async_trait::async_trait;
-use serde::Deserialize;
-use std::collections::BTreeMap;
-use std::fs;
-use std::{
-    path::{Path, PathBuf},
-    rc::Rc,
-};
-use util::serde::default_true;
-
-use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};
-
-mod add_arg_to_trait_method;
-mod code_block_citations;
-mod comment_translation;
-mod file_change_notification;
-mod file_search;
-mod grep_params_escapement;
-mod overwrite_file;
-mod planets;
-
-pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
-    let mut threads: Vec<Rc<dyn Example>> = vec![
-        Rc::new(file_search::FileSearchExample),
-        Rc::new(add_arg_to_trait_method::AddArgToTraitMethod),
-        Rc::new(code_block_citations::CodeBlockCitations),
-        Rc::new(planets::Planets),
-        Rc::new(comment_translation::CommentTranslation),
-        Rc::new(overwrite_file::FileOverwriteExample),
-        Rc::new(file_change_notification::FileChangeNotificationExample),
-        Rc::new(grep_params_escapement::GrepParamsEscapementExample),
-    ];
-
-    for example_path in list_declarative_examples(examples_dir).unwrap() {
-        threads.push(Rc::new(DeclarativeExample::load(&example_path).unwrap()));
-    }
-
-    threads
-}
-
-struct DeclarativeExample {
-    metadata: ExampleMetadata,
-    prompt: String,
-    diff_assertions: Vec<JudgeAssertion>,
-    thread_assertions: Vec<JudgeAssertion>,
-}
-
-impl DeclarativeExample {
-    pub fn load(example_path: &Path) -> Result<Self> {
-        let name = Self::name_from_path(example_path);
-        let base: ExampleToml = toml::from_str(&fs::read_to_string(&example_path)?)?;
-        let example_dir = example_path.parent().unwrap();
-
-        let language_server = if base.require_lsp {
-            Some(crate::example::LanguageServer {
-                file_extension: base
-                    .language_extension
-                    .expect("Language extension is required when require_lsp = true"),
-                allow_preexisting_diagnostics: base.allow_preexisting_diagnostics,
-            })
-        } else {
-            None
-        };
-
-        let profile_id = if let Some(profile_name) = base.profile_name {
-            AgentProfileId(profile_name.into())
-        } else {
-            AgentProfileId::default()
-        };
-
-        let existing_thread_json = if let Some(path) = base.existing_thread_path {
-            let content = fs::read_to_string(example_dir.join(&path))
-                .unwrap_or_else(|_| panic!("Failed to read existing thread file: {}", path));
-            Some(content)
-        } else {
-            None
-        };
-
-        let metadata = ExampleMetadata {
-            name,
-            url: base.url,
-            revision: base.revision,
-            language_server,
-            max_assertions: None,
-            profile_id,
-            existing_thread_json,
-            max_turns: base.max_turns,
-        };
-
-        Ok(DeclarativeExample {
-            metadata,
-            prompt: base.prompt,
-            thread_assertions: base
-                .thread_assertions
-                .into_iter()
-                .map(|(id, description)| JudgeAssertion { id, description })
-                .collect(),
-            diff_assertions: base
-                .diff_assertions
-                .into_iter()
-                .map(|(id, description)| JudgeAssertion { id, description })
-                .collect(),
-        })
-    }
-
-    pub fn name_from_path(path: &Path) -> String {
-        path.file_stem().unwrap().to_string_lossy().into_owned()
-    }
-}
-
-#[derive(Clone, Debug, Deserialize)]
-pub struct ExampleToml {
-    pub url: String,
-    pub revision: String,
-    pub language_extension: Option<String>,
-    #[expect(
-        unused,
-        reason = "This field was found to be unused with serde library bump; it's left as is due to insufficient context on PO's side, but it *may* be fine to remove"
-    )]
-    pub insert_id: Option<String>,
-    #[serde(default = "default_true")]
-    pub require_lsp: bool,
-    #[serde(default)]
-    pub allow_preexisting_diagnostics: bool,
-    pub prompt: String,
-    #[serde(default)]
-    pub profile_name: Option<String>,
-    #[serde(default)]
-    pub diff_assertions: BTreeMap<String, String>,
-    #[serde(default)]
-    pub thread_assertions: BTreeMap<String, String>,
-    #[serde(default)]
-    pub existing_thread_path: Option<String>,
-    #[serde(default)]
-    pub max_turns: Option<u32>,
-}
-
-#[async_trait(?Send)]
-impl Example for DeclarativeExample {
-    fn meta(&self) -> ExampleMetadata {
-        self.metadata.clone()
-    }
-
-    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
-        let max_turns = self.metadata.max_turns.unwrap_or(1000);
-        let _ = cx.prompt_with_max_turns(&self.prompt, max_turns).await;
-        Ok(())
-    }
-
-    fn diff_assertions(&self) -> Vec<JudgeAssertion> {
-        self.diff_assertions.clone()
-    }
-
-    fn thread_assertions(&self) -> Vec<JudgeAssertion> {
-        self.thread_assertions.clone()
-    }
-}
-
-fn list_declarative_examples(examples_dir: &Path) -> Result<Vec<PathBuf>> {
-    let path = std::fs::canonicalize(examples_dir).unwrap();
-    let entries = std::fs::read_dir(path).unwrap();
-    let mut result_paths = Vec::new();
-    for entry in entries {
-        let entry = entry?;
-        let path = entry.path();
-        if path.extension() == Some("toml".as_ref()) {
-            result_paths.push(path);
-        }
-    }
-    Ok(result_paths)
-}

crates/eval/src/examples/no_tools_enabled.toml 🔗

@@ -1,19 +0,0 @@
-url = "https://github.com/zed-industries/zed"
-revision = "main"
-require_lsp = false
-prompt = """
-I need to explore the codebase to understand what files are available in the project. What can you tell me about the structure of the codebase?
-
-Please find all uses of the 'find_path' function in the src directory.
-
-Also, can you tell me what the capital of France is? And how does garbage collection work in programming languages?
-"""
-
-profile_name = "minimal"
-
-[thread_assertions]
-no_hallucinated_tool_calls = """The agent should not hallucinate tool calls - for example, by writing markdown code blocks that simulate commands like `find`, `grep`, `ls`, etc. - since no tools are available. However, it is totally fine if the agent describes to the user what should be done, e.g. telling the user \"You can run `find` to...\" etc."""
-
-doesnt_hallucinate_file_paths = """The agent should not make up file paths or pretend to know the structure of the project when tools are not available."""
-
-correctly_answers_general_questions = """The agent should correctly answer general knowledge questions about the capital of France and garbage collection without asking for more context, demonstrating it can still be helpful with areas it knows about."""

crates/eval/src/examples/overwrite_file.rs 🔗

@@ -1,51 +0,0 @@
-use agent::{EditFileMode, EditFileToolInput};
-use agent_settings::AgentProfileId;
-use anyhow::Result;
-use async_trait::async_trait;
-
-use crate::example::{Example, ExampleContext, ExampleMetadata};
-
-pub struct FileOverwriteExample;
-
-/*
-This eval tests a fix for a destructive behavior of the `edit_file` tool.
-Previously, it would rewrite existing files too aggressively, which often
-resulted in content loss.
-
-Model           | Pass rate
-----------------|----------
-Sonnet 3.7      | 100%
-Gemini 2.5 Pro  |  80%
-*/
-
-#[async_trait(?Send)]
-impl Example for FileOverwriteExample {
-    fn meta(&self) -> ExampleMetadata {
-        let thread_json = include_str!("threads/overwrite-file.json");
-
-        ExampleMetadata {
-            name: "file_overwrite".to_string(),
-            url: "https://github.com/zed-industries/zed.git".to_string(),
-            revision: "023a60806a8cc82e73bd8d88e63b4b07fc7a0040".to_string(),
-            language_server: None,
-            max_assertions: Some(1),
-            profile_id: AgentProfileId::default(),
-            existing_thread_json: Some(thread_json.to_string()),
-            max_turns: None,
-        }
-    }
-
-    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
-        let response = cx.proceed_with_max_turns(1).await?;
-        let tool_use = response.expect_tool_call("edit_file", cx)?;
-        let input = tool_use.parse_input::<EditFileToolInput>()?;
-        let file_overwritten = match input.mode {
-            EditFileMode::Edit => false,
-            EditFileMode::Create | EditFileMode::Overwrite => {
-                input.path.ends_with("src/language_model_selector.rs")
-            }
-        };
-
-        cx.assert(!file_overwritten, "File should be edited, not overwritten")
-    }
-}

crates/eval/src/examples/planets.rs 🔗

@@ -1,75 +0,0 @@
-use agent::{AgentTool, OpenTool, TerminalTool};
-use agent_settings::AgentProfileId;
-use anyhow::Result;
-use async_trait::async_trait;
-
-use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};
-
-pub struct Planets;
-
-#[async_trait(?Send)]
-impl Example for Planets {
-    fn meta(&self) -> ExampleMetadata {
-        ExampleMetadata {
-            name: "planets".to_string(),
-            url: "https://github.com/roc-lang/roc".to_string(), // This commit in this repo is just the Apache2 license,
-            revision: "59e49c75214f60b4dc4a45092292061c8c26ce27".to_string(), // so effectively a blank project.
-            language_server: None,
-            max_assertions: None,
-            profile_id: AgentProfileId::default(),
-            existing_thread_json: None,
-            max_turns: None,
-        }
-    }
-
-    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
-        let response = cx
-            .prompt(
-                r#"
-            Make a plain JavaScript web page which renders an animated 3D solar system.
-            Let me drag to rotate the camera around.
-            Do not use npm.
-            "#,
-            )
-            .await?;
-        let mut open_tool_uses = 0;
-        let mut terminal_tool_uses = 0;
-
-        for tool_use in response.tool_calls() {
-            if tool_use.name == OpenTool::NAME {
-                open_tool_uses += 1;
-            } else if tool_use.name == TerminalTool::NAME {
-                terminal_tool_uses += 1;
-            }
-        }
-
-        // The open tool should only be used when requested, which it was not.
-        cx.assert_eq(open_tool_uses, 0, "`open` tool was not used")
-            .ok();
-        // No reason to use the terminal if not using npm.
-        cx.assert_eq(terminal_tool_uses, 0, "`terminal` tool was not used")
-            .ok();
-
-        Ok(())
-    }
-
-    fn diff_assertions(&self) -> Vec<JudgeAssertion> {
-        vec![
-            JudgeAssertion {
-                id: "animated solar system".to_string(),
-                description: "This page should render a solar system, and it should be animated."
-                    .to_string(),
-            },
-            JudgeAssertion {
-                id: "drag to rotate camera".to_string(),
-                description: "The user can drag to rotate the camera around.".to_string(),
-            },
-            JudgeAssertion {
-                id: "plain JavaScript".to_string(),
-                description:
-                    "The code base uses plain JavaScript and no npm, along with HTML and CSS."
-                        .to_string(),
-            },
-        ]
-    }
-}

crates/eval/src/examples/threads/overwrite-file.json 🔗

@@ -1,262 +0,0 @@
-{
-  "completion_mode": "normal",
-  "cumulative_token_usage": {
-    "cache_creation_input_tokens": 18383,
-    "cache_read_input_tokens": 97250,
-    "input_tokens": 45,
-    "output_tokens": 776
-  },
-  "detailed_summary_state": "NotGenerated",
-  "exceeded_window_error": null,
-  "initial_project_snapshot": {
-    "timestamp": "2025-05-08T14:31:16.701157512Z",
-    "unsaved_buffer_paths": [],
-    "worktree_snapshots": [
-      {
-        "git_state": {
-          "current_branch": null,
-          "diff": "diff --git a/crates/language_model_selector/src/language_model_selector.rs b/crates/language_model_selector/src/language_model_selector.rs\nindex 6775bee98a..e25c9e1415 100644\n--- a/crates/language_model_selector/src/language_model_selector.rs\n+++ b/crates/language_model_selector/src/language_model_selector.rs\n@@ -410,7 +410,8 @@ impl ModelMatcher {\n     }\n \n     pub fn is_match(self: &Self, info: &ModelInfo) -> bool {\n-        self.matched_ids.contains(&info.model.id().0)\n+        let q = (info.model.provider_id(), info.model.id());\n+        self.matched_models.contains(&q)\n     }\n }\n \n",
-          "head_sha": "9245656485e58a5d6d717d82209bc8c57cb9c539",
-          "remote_url": "git@github.com:zed-industries/zed.git"
-        },
-        "worktree_path": "/home/silver/develop/zed"
-      }
-    ]
-  },
-  "messages": [
-    {

crates/eval/src/examples/tree_sitter_drop_emscripten_dep.toml 🔗

@@ -1,53 +0,0 @@
-url = "https://github.com/tree-sitter/tree-sitter.git"
-revision = "635c49909ce4aa7f58a9375374f91b1b434f6f9c"
-language_extension = "rs"
-
-prompt = """
-Change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
-Use `ureq` to download the SDK for the current platform and architecture.
-Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
-Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
-that's inside of the archive.
-Don't re-download the SDK if that executable already exists.
-
-Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{language_name}
-
-Here are the available wasi-sdk assets:
-- wasi-sdk-25.0-x86_64-macos.tar.gz
-- wasi-sdk-25.0-arm64-macos.tar.gz
-- wasi-sdk-25.0-x86_64-linux.tar.gz
-- wasi-sdk-25.0-arm64-linux.tar.gz
-- wasi-sdk-25.0-x86_64-linux.tar.gz
-- wasi-sdk-25.0-arm64-linux.tar.gz
-- wasi-sdk-25.0-x86_64-windows.tar.gz
-"""
-
-[diff_assertions]
-
-modify_function = """
-The patch modifies the `compile_parser_to_wasm` function, removing logic for running `emscripten`,
-and adding logic to download `wasi-sdk`.
-"""
-
-use_listed_assets = """
-The patch implements logic for selecting from the assets listed in the prompt by detecting the
-current platform and architecture.
-"""
-
-add_deps = """
-The patch adds a dependency for `ureq` to the Cargo.toml, and adds an import to the top of `loader/lib.rs`
-If the patch uses any other dependencies (such as `tar` or `flate2`), it also correctly adds them
-to the Cargo.toml and imports them.
-"""
-
-[thread_assertions]
-
-find_specified_function = """
-The agent finds the specified function `compile_parser_to_wasm` in a logical way.
-It does not begin by guessing any paths to files in the codebase, but rather searches for the function by name.
-"""
-
-no_syntax_errors = """
-As it edits the file, the agent never introduces syntax errors. It's ok if there are other compile errors,
-but it should not introduce glaring issues like mismatched curly braces.
-"""

crates/eval/src/explorer.html 🔗

@@ -1,949 +0,0 @@
-<!doctype html>
-<html lang="en">
-    <head>
-        <meta charset="UTF-8" />
-        <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-        <title>Eval Explorer</title>
-        <style>
-            :root {
-                /* Light theme (default) */
-                --bg-color: #ffffff;
-                --text-color: #333333;
-                --header-bg: #f8f8f8;
-                --border-color: #eaeaea;
-                --code-bg: #f5f5f5;
-                --link-color: #0066cc;
-                --button-bg: #f5f5f5;
-                --button-border: #ddd;
-                --button-active-bg: #0066cc;
-                --button-active-color: white;
-                --button-active-border: #0055aa;
-                --preview-bg: #f5f5f5;
-                --table-line: #f0f0f0;
-            }
-
-            /* Dark theme */
-            [data-theme="dark"] {
-                --bg-color: #1e1e1e;
-                --text-color: #e0e0e0;
-                --header-bg: #2d2d2d;
-                --border-color: #444444;
-                --code-bg: #2a2a2a;
-                --link-color: #4da6ff;
-                --button-bg: #333333;
-                --button-border: #555555;
-                --button-active-bg: #0066cc;
-                --button-active-color: white;
-                --button-active-border: #0055aa;
-                --preview-bg: #2a2a2a;
-                --table-line: #333333;
-            }
-
-            /* Apply theme variables */
-            body {
-                font-family: monospace;
-                line-height: 1.6;
-                margin: 0;
-                padding: 20px;
-                color: var(--text-color);
-                max-width: 1200px;
-                margin: 0 auto;
-                background-color: var(--bg-color);
-            }
-            h1 {
-                margin-bottom: 20px;
-                border-bottom: 1px solid var(--border-color);
-                padding-bottom: 10px;
-                font-family: monospace;
-            }
-            table {
-                width: 100%;
-                border-collapse: collapse;
-                margin-bottom: 20px;
-                table-layout: fixed; /* Ensure fixed width columns */
-            }
-            th,
-            td {
-                padding: 10px;
-                text-align: left;
-                border-bottom: 1px dotted var(--border-color);
-                vertical-align: top;
-                word-wrap: break-word; /* Ensure long content wraps */
-                overflow-wrap: break-word;
-            }
-            th {
-                background-color: var(--header-bg);
-                font-weight: 600;
-            }
-            .collapsible {
-                cursor: pointer;
-                color: var(--link-color);
-                text-decoration: underline;
-            }
-            .hidden {
-                display: none;
-            }
-            .tool-name {
-                font-weight: bold;
-            }
-            .tool-params {
-                padding-left: 20px;
-                color: #666;
-            }
-            pre {
-                background-color: var(--code-bg);
-                padding: 10px;
-                border-radius: 5px;
-                overflow-x: auto;
-                max-height: 200px;
-                margin: 10px 0;
-                font-family: monospace;
-                width: 100%;
-                box-sizing: border-box;
-                white-space: pre-wrap; /* Ensure text wraps */
-                color: var(--text-color);
-            }
-            code {
-                font-family: monospace;
-            }
-
-            /* Column sizing */
-            .turn-column {
-                width: 3%;
-                max-width: 3%;
-            }
-            .text-column {
-                width: 22%;
-                max-width: 22%;
-            }
-            .tool-column {
-                width: 38%;
-                max-width: 38%;
-            }
-            .result-column {
-                width: 37%;
-                max-width: 37%;
-                overflow-x: auto;
-            }
-
-            /* Content formatting */
-            .text-content {
-                font-family:
-                    system-ui,
-                    -apple-system,
-                    BlinkMacSystemFont,
-                    "Segoe UI",
-                    Roboto,
-                    Oxygen,
-                    Ubuntu,
-                    Cantarell,
-                    "Open Sans",
-                    "Helvetica Neue",
-                    sans-serif;
-                font-size: 0.7rem;
-            }
-            .action-container .action-preview,
-            .action-container .action-full {
-                margin-bottom: 5px;
-            }
-            .preview-content {
-                white-space: pre-wrap;
-                margin-bottom: 5px;
-                background-color: var(--preview-bg);
-                padding: 10px;
-                border-radius: 5px;
-                font-family: monospace;
-                width: 100%;
-                box-sizing: border-box;
-                overflow-wrap: break-word;
-                color: var(--text-color);
-            }
-            .show-more {
-                color: var(--link-color);
-                cursor: pointer;
-                text-decoration: none;
-                display: block;
-                margin-top: 5px;
-            }
-            .more-inline {
-                color: var(--link-color);
-                cursor: pointer;
-                text-decoration: none;
-                display: inline;
-                margin-left: 5px;
-            }
-
-            /* Compact mode styles */
-            .compact-mode td {
-                padding: 5px; /* Reduced padding in compact mode */
-            }
-
-            .compact-mode .preview-content {
-                padding: 2px;
-                margin-bottom: 2px;
-            }
-
-            .compact-mode pre {
-                padding: 5px;
-                margin: 5px 0;
-                white-space: pre; /* Don't wrap code in compact mode */
-                overflow-x: auto; /* Add horizontal scrollbar */
-            }
-
-            .compact-mode .result-column pre,
-            .compact-mode .result-column .preview-content {
-                max-width: 100%;
-                overflow-x: auto;
-                white-space: pre;
-            }
-
-            /* Make action containers more compact */
-            .compact-mode .action-container {
-                margin-bottom: 2px;
-            }
-
-            /* Reduce space between turns */
-            .compact-mode tr {
-                border-bottom: 1px solid var(--table-line);
-            }
-
-            /* Tool params more compact */
-            .compact-mode .tool-params {
-                padding-left: 10px;
-                margin-top: 2px;
-            }
-
-            hr {
-                margin: 10px 0;
-                border: 0;
-                height: 1px;
-                background-color: var(--border-color);
-            }
-
-            /* View switcher */
-            .view-switcher {
-                display: flex;
-                gap: 10px;
-                margin-bottom: 20px;
-                align-items: center;
-            }
-
-            .view-button {
-                background-color: var(--button-bg);
-                border: 1px solid var(--button-border);
-                border-radius: 4px;
-                padding: 5px 15px;
-                cursor: pointer;
-                font-family: monospace;
-                font-size: 0.9rem;
-                transition: all 0.2s ease;
-                color: var(--text-color);
-            }
-
-            .view-button:hover {
-                background-color: var(--button-border);
-            }
-
-            .view-button.active {
-                background-color: var(--button-active-bg);
-                color: var(--button-active-color);
-                border-color: var(--button-active-border);
-            }
-
-            /* Navigation bar styles */
-            .thread-navigation {
-                display: flex;
-                align-items: center;
-                margin-bottom: 20px;
-                padding: 10px 0;
-                border-bottom: 1px solid var(--border-color);
-            }
-
-            .nav-button {
-                background-color: var(--button-bg);
-                border: 1px solid var(--button-border);
-                border-radius: 4px;
-                padding: 5px 15px;
-                cursor: pointer;
-                font-family: monospace;
-                font-size: 0.9rem;
-                transition: all 0.2s ease;
-                color: var(--text-color);
-            }
-
-            .nav-button:hover:not(:disabled) {
-                background-color: var(--button-border);
-            }
-
-            .nav-button:disabled {
-                opacity: 0.5;
-                cursor: not-allowed;
-            }
-
-            .thread-indicator {
-                margin: 0 15px;
-                font-size: 1rem;
-                flex-grow: 1;
-                text-align: center;
-            }
-
-            #thread-id {
-                font-weight: bold;
-            }
-
-            /* Theme switcher */
-            .theme-switcher {
-                margin-left: auto;
-                display: flex;
-                align-items: center;
-            }
-
-            .theme-button {
-                background-color: var(--button-bg);
-                border: 1px solid var(--button-border);
-                border-radius: 4px;
-                padding: 5px 10px;
-                cursor: pointer;
-                font-size: 0.9rem;
-                transition: all 0.2s ease;
-                color: var(--text-color);
-                display: flex;
-                align-items: center;
-            }
-
-            .theme-button:hover {
-                background-color: var(--button-border);
-            }
-
-            .theme-icon {
-                margin-right: 5px;
-                font-size: 1rem;
-            }
-        </style>
-    </head>
-    <body>
-        <h1 id="current-filename">Thread Explorer</h1>
-        <div class="view-switcher">
-            <button id="full-view" class="view-button active" onclick="switchView('full')">Full View</button>
-            <button id="compact-view" class="view-button" onclick="switchView('compact')">Compact View</button>
-            <button
-                id="export-button"
-                class="view-button"
-                onclick="exportThreadAsJson()"
-                title="Export current thread as JSON"
-            >
-                Export
-            </button>
-            <div class="theme-switcher">
-                <button id="theme-toggle" class="theme-button" onclick="toggleTheme()">
-                    <span id="theme-icon" class="theme-icon">☀️</span>
-                    <span id="theme-text">Light</span>
-                </button>
-            </div>
-        </div>
-        <div class="thread-navigation">
-            <button
-                id="prev-thread"
-                class="nav-button"
-                onclick="previousThread()"
-                title="Previous thread (Ctrl+←, k, or h)"
-                disabled
-            >
-                &larr; Previous
-            </button>
-            <div class="thread-indicator">
-                Thread <span id="current-thread-index">1</span> of <span id="total-threads">1</span>:
-                <span id="thread-id">Default Thread</span>
-            </div>
-            <button
-                id="next-thread"
-                class="nav-button"
-                onclick="nextThread()"
-                title="Next thread (Ctrl+→, j, or l)"
-                disabled
-            >
-                Next &rarr;
-            </button>
-        </div>
-        <table id="thread-table">
-            <thead>
-                <tr>
-                    <th class="turn-column">Turn</th>
-                    <th class="text-column">Text</th>
-                    <th class="tool-column">Tool</th>
-                    <th class="result-column">Result</th>
-                </tr>
-            </thead>
-            <tbody id="thread-body">
-                <!-- Content will be filled dynamically -->
-            </tbody>
-        </table>
-
-        <script>
-            // View mode - 'full' or 'compact'
-            let viewMode = "full";
-
-            // Theme mode - 'light', 'dark', or 'system'
-            let themeMode = localStorage.getItem("theme") || "system";
-
-            // Function to apply theme
-            function applyTheme(theme) {
-                const themeIcon = document.getElementById("theme-icon");
-                const themeText = document.getElementById("theme-text");
-
-                if (theme === "dark") {
-                    document.documentElement.setAttribute("data-theme", "dark");
-                    themeIcon.textContent = "🌙";
-                    themeText.textContent = "Dark";
-                } else {
-                    document.documentElement.removeAttribute("data-theme");
-                    themeIcon.textContent = "☀️";
-                    themeText.textContent = "Light";
-                }
-            }
-
-            // Function to toggle between light and dark themes
-            function toggleTheme() {
-                // If currently system or light, switch to dark
-                if (themeMode === "system") {
-                    const systemDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
-                    themeMode = systemDark ? "light" : "dark";
-                } else {
-                    themeMode = themeMode === "light" ? "dark" : "light";
-                }
-
-                // Save preference
-                localStorage.setItem("theme", themeMode);
-
-                // Apply theme
-                applyTheme(themeMode);
-            }
-
-            // Initialize theme based on system or saved preference
-            function initTheme() {
-                if (themeMode === "system") {
-                    // Use system preference
-                    const systemDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
-                    applyTheme(systemDark ? "dark" : "light");
-
-                    // Listen for system theme changes
-                    window.matchMedia("(prefers-color-scheme: dark)").addEventListener("change", (e) => {
-                        if (themeMode === "system") {
-                            applyTheme(e.matches ? "dark" : "light");
-                        }
-                    });
-                } else {
-                    // Use saved preference
-                    applyTheme(themeMode);
-                }
-            }
-
-            // Function to switch between view modes
-            function switchView(mode) {
-                viewMode = mode;
-
-                // Update button states
-                document.getElementById("full-view").classList.toggle("active", mode === "full");
-                document.getElementById("compact-view").classList.toggle("active", mode === "compact");
-
-                // Add or remove compact-mode class on the body
-                document.body.classList.toggle("compact-mode", mode === "compact");
-
-                // Re-render the thread with the new view mode
-                renderThread();
-            }
-
-            // Function to export the current thread as a JSON file
-            function exportThreadAsJson() {
-                // Clone the thread to avoid modifying the original
-                const threadToExport = JSON.parse(JSON.stringify(thread));
-
-                // Create a Blob with the JSON data
-                const blob = new Blob([JSON.stringify(threadToExport, null, 2)], { type: "application/json" });
-
-                // Create a download link
-                const url = URL.createObjectURL(blob);
-                const a = document.createElement("a");
-                a.href = url;
-
-                // Generate filename based on thread ID or index
-                const filename =
-                    threadToExport.thread_id || threadToExport.filename || `thread-${currentThreadIndex + 1}.json`;
-                a.download = filename.endsWith(".json") ? filename : `${filename}.json`;
-
-                // Trigger the download
-                document.body.appendChild(a);
-                a.click();
-
-                // Clean up
-                setTimeout(() => {
-                    document.body.removeChild(a);
-                    URL.revokeObjectURL(url);
-                }, 0);
-            }
-            // Default dummy thread data for preview purposes
-            let dummyThread = {
-                messages: [
-                    {
-                        role: "system",
-                        content: [{ Text: "System prompt..." }],
-                    },
-                    {
-                        role: "user",
-                        content: [{ Text: "Fix the bug: kwargs not passed..." }],
-                    },
-                    {
-                        role: "assistant",
-                        content: [
-                            { Text: "I'll help you fix that bug." },
-                            {
-                                ToolUse: {
-                                    name: "list_directory",
-                                    input: { path: "fastmcp" },
-                                    is_input_complete: true,
-                                },
-                            },
-                        ],
-                    },
-                    {
-                        role: "user",
-                        content: [
-                            {
-                                ToolResult: {
-                                    tool_name: "list_directory",
-                                    is_error: false,
-                                    content:
-                                        "fastmcp/src\nfastmcp/tests\nfastmcp/README.md\nfastmcp/pyproject.toml\nfastmcp/.gitignore\nfastmcp/setup.py\nfastmcp/examples\nfastmcp/LICENSE",
-                                },
-                            },
-                        ],
-                    },
-                    {
-                        role: "assistant",
-                        content: [
-                            { Text: "Let's examine the code." },
-                            {
-                                ToolUse: {
-                                    name: "read_file",
-                                    input: {
-                                        path: "fastmcp/main.py",
-                                        start_line: 253,
-                                        end_line: 360,
-                                    },
-                                    is_input_complete: true,
-                                },
-                            },
-                        ],
-                    },
-                    {
-                        role: "user",
-                        content: [
-                            {
-                                ToolResult: {
-                                    tool_name: "read_file",
-                                    is_error: false,
-                                    content:
-                                        "def run_application(app, **kwargs):\n    return anyio.run(app, **kwargs)\n\nasync def start_server():\n    # More code...\n    # Multiple lines of code that would be displayed\n    # when clicking on the show more link\n    app = create_app()\n    await run_app(app)\n\ndef main():\n    # Initialize everything\n    anyio.run(start_server)\n    # Even more code here\n    # that would be shown when the user\n    # expands the content",
-                                },
-                            },
-                        ],
-                    },
-                    {
-                        role: "assistant",
-                        content: [
-                            { Text: "I found the issue." },
-                            {
-                                ToolUse: {
-                                    name: "edit_file",
-                                    input: {
-                                        path: "fastmcp/core.py",
-                                        old_string: "def start_server(app):\n    anyio.run(app)",
-                                        new_string: "def start_server(app, **kwargs):\n    anyio.run(app, **kwargs)",
-                                        display_description: "Fix kwargs passing to anyio.run",
-                                    },
-                                    is_input_complete: true,
-                                },
-                            },
-                        ],
-                    },
-                    {
-                        role: "user",
-                        content: [
-                            {
-                                ToolResult: {
-                                    tool_name: "edit_file",
-                                    is_error: false,
-                                    content: "Made edit to fastmcp/core.py",
-                                },
-                            },
-                        ],
-                    },
-                    {
-                        role: "assistant",
-                        content: [
-                            { Text: "Let's check if there are any errors." },
-                            {
-                                ToolUse: {
-                                    name: "diagnostics",
-                                    input: {},
-                                    is_input_complete: true,
-                                },
-                            },
-                        ],
-                    },
-                    {
-                        role: "user",
-                        content: [
-                            {
-                                ToolResult: {
-                                    tool_name: "diagnostics",
-                                    is_error: false,
-                                    content: "No errors found",
-                                },
-                            },
-                        ],
-                    },
-                ],
-            };
-
-            // The actual thread data will be injected here when opened by eval
-            let threadsData = window.threadsData || { threads: [dummyThread] };
-
-            // Initialize thread variables
-            let threads = threadsData.threads;
-            let currentThreadIndex = 0;
-            let thread = threads[currentThreadIndex];
-
-            // Function to navigate to the previous thread
-            function previousThread() {
-                if (currentThreadIndex > 0) {
-                    currentThreadIndex--;
-                    switchToThread(currentThreadIndex);
-                }
-            }
-
-            // Function to navigate to the next thread
-            function nextThread() {
-                if (currentThreadIndex < threads.length - 1) {
-                    currentThreadIndex++;
-                    switchToThread(currentThreadIndex);
-                }
-            }
-
-            // Function to switch to a specific thread by index
-            function switchToThread(index) {
-                if (index >= 0 && index < threads.length) {
-                    currentThreadIndex = index;
-                    thread = threads[currentThreadIndex];
-                    updateNavigationButtons();
-                    renderThread();
-                }
-            }
-
-            // Function to update the navigation buttons state
-            function updateNavigationButtons() {
-                document.getElementById("prev-thread").disabled = currentThreadIndex <= 0;
-                document.getElementById("next-thread").disabled = currentThreadIndex >= threads.length - 1;
-                document.getElementById("current-thread-index").textContent = currentThreadIndex + 1;
-                document.getElementById("total-threads").textContent = threads.length;
-            }
-
-            function renderThread() {
-                const tbody = document.getElementById("thread-body");
-                tbody.innerHTML = ""; // Clear existing content
-
-                // Set thread name if available
-                const threadId = thread.thread_id || `Thread ${currentThreadIndex + 1}`;
-                document.getElementById("thread-id").textContent = threadId;
-
-                // Set filename in the header if available
-                const filename = thread.filename || `Thread ${currentThreadIndex + 1}`;
-                document.getElementById("current-filename").textContent = filename;
-
-                // Skip system message
-                const nonSystemMessages = thread.messages.filter((msg) => msg.role !== "system");
-
-                let turnNumber = 0;
-                processMessages(nonSystemMessages, tbody, turnNumber);
-            }
-
-            function processMessages(messages, tbody) {
-                let turnNumber = 0;
-
-                for (let i = 0; i < messages.length; i++) {
-                    const msg = messages[i];
-
-                    if (isUserQuery(msg)) {
-                        // User message starts a new turn
-                        turnNumber++;
-                        renderUserMessage(msg, turnNumber, tbody);
-                    } else if (msg.role === "assistant") {
-                        // Each assistant message is one turn
-                        turnNumber++;
-
-                        // Collect all text content and tool uses for this turn
-                        let assistantText = "";
-                        let toolUses = [];
-
-                        // First, collect all text content
-                        for (const content of msg.content) {
-                            if (content.hasOwnProperty("Text")) {
-                                if (assistantText) {
-                                    assistantText += "<br><br>" + formatContent(content.Text);
-                                } else {
-                                    assistantText = formatContent(content.Text);
-                                }
-                            } else if (content.hasOwnProperty("ToolUse")) {
-                                toolUses.push(content.ToolUse);
-                            }
-                        }
-
-                        // Create a single row for this turn with text and tools
-                        const row = document.createElement("tr");
-                        row.id = `assistant-turn-${turnNumber}`;
-
-                        // Start with the turn number and assistant text
-                        row.innerHTML = `
-                            <td class="text-content">${turnNumber}</td>
-                            <td class="text-content"><!--Assistant: <br/ -->${assistantText}</td>
-                            <td id="tools-${turnNumber}"></td>
-                            <td id="results-${turnNumber}"></td>
-                        `;
-
-                        tbody.appendChild(row);
-
-                        // Add all tool calls to the tools cell
-                        const toolsCell = document.getElementById(`tools-${turnNumber}`);
-                        const resultsCell = document.getElementById(`results-${turnNumber}`);
-
-                        // Process all tools and their results
-                        for (let j = 0; j < toolUses.length; j++) {
-                            const toolUse = toolUses[j];
-                            const toolCall = formatToolCall(toolUse.name, toolUse.input);
-
-                            // Add the tool call to the tools cell
-                            if (j > 0) toolsCell.innerHTML += "<hr>";
-                            toolsCell.innerHTML += toolCall;
-
-                            // Look for corresponding tool result
-                            if (hasMatchingToolResult(messages, i, toolUse.name)) {
-                                const resultMsg = messages[i + 1];
-                                const toolResult = findToolResult(resultMsg, toolUse.name);
-
-                                if (toolResult) {
-                                    // Add the result to the results cell
-                                    if (j > 0) resultsCell.innerHTML += "<hr>";
-
-                                    // Create a container for the result
-                                    const resultDiv = document.createElement("div");
-                                    resultDiv.className = "tool-result";
-
-                                    // Format and display the tool result
-                                    formatToolResultInline(toolResult.content.Text, resultDiv);
-                                    resultsCell.appendChild(resultDiv);
-
-                                    // Skip the result message in the next iteration
-                                    if (j === toolUses.length - 1) {
-                                        i++;
-                                    }
-                                }
-                            }
-                        }
-                    } else if (msg.role === "user" && msg.content.some((c) => c.hasOwnProperty("ToolResult"))) {
-                        // Skip tool result messages as they are handled with their corresponding tool use
-                        continue;
-                    }
-                }
-            }
-
-            function isUserQuery(message) {
-                return message.role === "user" && !message.content.some((c) => c.hasOwnProperty("ToolResult"));
-            }
-
-            function renderUserMessage(message, turnNumber, tbody) {
-                const row = document.createElement("tr");
-                row.innerHTML = `
-                    <td>${turnNumber}</td>
-                    <td class="text-content"><b>[User]:</b><br/> ${formatContent(message.content[0].Text)}</td>
-                    <td></td>
-                    <td></td>
-                `;
-                tbody.appendChild(row);
-            }
-
-            function hasMatchingToolResult(messages, currentIndex, toolName) {
-                return (
-                    currentIndex + 1 < messages.length &&
-                    messages[currentIndex + 1].role === "user" &&
-                    messages[currentIndex + 1].content.some(
-                        (c) => c.hasOwnProperty("ToolResult") && c.ToolResult.tool_name === toolName,
-                    )
-                );
-            }
-
-            function findToolResult(resultMessage, toolName) {
-                const toolResultContent = resultMessage.content.find(
-                    (c) => c.hasOwnProperty("ToolResult") && c.ToolResult.tool_name === toolName,
-                );
-
-                return toolResultContent ? toolResultContent.ToolResult : null;
-            }
-            function formatToolCall(name, input) {
-                // In compact mode, format tool calls on a single line
-                if (viewMode === "compact") {
-                    const params = [];
-                    const fullParams = [];
-
-                    // Process all parameters
-                    for (const [key, value] of Object.entries(input)) {
-                        if (value !== null && value !== undefined) {
-                            // Store full parameter for expanded view
-                            let fullValue = typeof value === "string" ? `"${value}"` : value;
-                            fullParams.push([key, fullValue]);
-
-                            // Abbreviated value for compact view
-                            let displayValue = fullValue;
-                            if (typeof value === "string" && value.length > 30) {
-                                displayValue = `"${value.substring(0, 30)}..."`;
-                            }
-                            params.push(`${key}=${displayValue}`);
-                        }
-                    }
-
-                    const paramString = params.join(", ");
-                    const fullLine = `<span class="tool-name">${name}</span>(${paramString})`;
-
-                    // If the line is too long, add a [more] link
-                    if (fullLine.length > 80 || params.length > 1) {
-                        // Create a container with the compact and full views
-                        const compactView = `<span class="tool-name">${name}</span>(${params[0]}, <span class="more-inline" onclick="toggleActionVisibility(this)">[...]</span>)`;
-
-                        // For the full view, use the original untruncated values
-                        let result = `<span class="tool-name">${name}</span>(`;
-                        const formattedParams = fullParams
-                            .map((p) => `&nbsp;&nbsp;&nbsp;&nbsp;${p[0]}=${p[1]}`)
-                            .join(",<br/>");
-                        const fullView = `${result}<br/>${formattedParams}<br/>)`;
-
-                        return `<div class="action-container">
-                            <div class="action-preview">${compactView}</div>
-                            <div class="action-full hidden">${fullView}</div>
-                        </div>`;
-                    }
-
-                    return fullLine;
-                }
-
-                // Regular (full) view formatting with multiple lines
-                let result = `<span class="tool-name">${name}</span>(`;
-                const params = [];
-                for (const [key, value] of Object.entries(input)) {
-                    if (value !== null && value !== undefined) {
-                        // Format different types of values
-                        let formattedValue = typeof value === "string" ? `"${value}"` : value;
-                        params.push([key, formattedValue]);
-                    }
-                }
-
-                if (params.length === 0) {
-                    return `${result})`;
-                } else if (params.length === 1) {
-                    // For single parameter, just show the value without the parameter name
-                    return `${result}${params[0][1]})`;
-                } else {
-                    // Format parameters
-                    const formattedParams = params.map((p) => `&nbsp;&nbsp;&nbsp;&nbsp;${p[0]}=${p[1]}`).join(",<br/>");
-                    return `${result}<br/>${formattedParams}<br/>)`;
-                }
-            }
-
-            function toggleActionVisibility(element, remainingLines) {
-                const container = element.closest(".action-container");
-                const preview = container.querySelector(".action-preview");
-                const full = container.querySelector(".action-full");
-
-                // Once expanded, keep it expanded
-                full.classList.remove("hidden");
-                preview.classList.add("hidden");
-            }
-
-            function formatToolResultInline(content, targetElement) {
-                // Count lines
-                const lines = content.split("\n");
-
-                // In compact mode, show only 1 line with [more] link
-                if (viewMode === "compact" && lines.length > 1) {
-                    // Create container
-                    const container = document.createElement("div");
-
-                    // Preview content
-                    const previewDiv = document.createElement("div");
-                    previewDiv.className = "preview-content";
-
-                    // Add the first line of content plus [more] link
-                    const previewContent = lines[0];
-                    previewDiv.innerHTML =
-                        escapeHtml(previewContent) +
-                        ` <span class="more-inline" onclick="toggleResultVisibility(this)">[...]</span>`;
-
-                    // Full content (initially hidden)
-                    const contentDiv = document.createElement("pre");
-                    contentDiv.className = "hidden";
-                    contentDiv.innerHTML = escapeHtml(content);
-
-                    container.appendChild(previewDiv);
-                    container.appendChild(contentDiv);
-                    targetElement.appendChild(container);
-                } else {
-                    // For full view or short results, display everything
-                    const preElement = document.createElement("pre");
-                    preElement.textContent = content;
-                    targetElement.appendChild(preElement);
-                }
-            }
-
-            function toggleResultVisibility(element, remainingLines) {
-                const container = element.parentElement.parentElement;
-                const preview = container.querySelector(".preview-content");
-                const full = container.querySelector("pre");
-
-                // Once expanded, keep it expanded
-                full.classList.remove("hidden");
-                preview.classList.add("hidden");
-            }
-
-            function formatContent(text) {
-                return escapeHtml(text);
-            }
-
-            function escapeHtml(text) {
-                const div = document.createElement("div");
-                div.textContent = text;
-                return div.innerHTML;
-            }
-
-            // Keyboard navigation handler
-            document.addEventListener("keydown", function (event) {
-                // previous thread
-                if ((event.ctrlKey && event.key === "ArrowLeft") || event.key === "h" || event.key === "k") {
-                    if (!document.getElementById("prev-thread").disabled) {
-                        previousThread();
-                    }
-                }
-                // next thread
-                else if ((event.ctrlKey && event.key === "ArrowRight") || event.key === "j" || event.key === "l") {
-                    if (!document.getElementById("next-thread").disabled) {
-                        nextThread();
-                    }
-                }
-            });
-
-            // Initialize the page
-            document.addEventListener("DOMContentLoaded", function () {
-                initTheme();
-                updateNavigationButtons();
-                renderThread();
-            });
-        </script>
-    </body>
-</html>

crates/eval/src/explorer.rs 🔗

@@ -1,182 +0,0 @@
-use anyhow::{Context as _, Result};
-use clap::Parser;
-use serde_json::{Value, json};
-use std::fs;
-use std::path::{Path, PathBuf};
-
-// Command-line arguments of the explorer generator.
-// NOTE(review): with clap's derive API the `///` comments on the fields below are
-// presumably picked up as `--help` text, so treat them as user-facing strings.
-#[derive(Parser, Debug)]
-#[clap(about = "Generate HTML explorer from JSON thread files")]
-struct Args {
-    /// Paths to JSON files or directories. If a directory is provided,
-    /// it will be searched for 'last.messages.json' files up to 2 levels deep.
-    #[clap(long, required = true, num_args = 1..)]
-    input: Vec<PathBuf>,
-
-    /// Path where the output HTML file will be written
-    #[clap(long)]
-    output: PathBuf,
-}
-
-/// Collects every file named `target_filename` below `dir_path` into `found_files`.
-///
-/// `current_depth` is the depth of `dir_path` itself; directories deeper than
-/// `max_depth` are not read. Any failure to read a directory or one of its entries
-/// aborts the search with an error naming the directory.
-#[allow(dead_code)]
-fn find_target_files_recursive(
-    dir_path: &Path,
-    target_filename: &str,
-    current_depth: u8,
-    max_depth: u8,
-    found_files: &mut Vec<PathBuf>,
-) -> Result<()> {
-    if current_depth > max_depth {
-        return Ok(());
-    }
-
-    let entries = fs::read_dir(dir_path)
-        .with_context(|| format!("Failed to read directory: {}", dir_path.display()))?;
-
-    for entry in entries {
-        let entry = entry.with_context(|| {
-            format!("Failed to read directory entry in: {}", dir_path.display())
-        })?;
-        let candidate = entry.path();
-
-        if candidate.is_dir() {
-            find_target_files_recursive(
-                &candidate,
-                target_filename,
-                current_depth + 1,
-                max_depth,
-                found_files,
-            )?;
-            continue;
-        }
-
-        // Names that are not valid UTF-8 never match.
-        let name_matches = candidate.file_name().and_then(|name| name.to_str())
-            == Some(target_filename);
-        if candidate.is_file() && name_matches {
-            found_files.push(candidate);
-        }
-    }
-
-    Ok(())
-}
-
-/// Renders the explorer page for `input_paths` and writes it to `output_path`.
-///
-/// Each input file is parsed as JSON and tagged with a `filename` entry; a root
-/// that is not an object is wrapped as `original_data` first. The combined data is
-/// spliced into the `src/explorer.html` template located under the crate's
-/// `CARGO_MANIFEST_DIR`. A missing parent directory of `output_path` is created.
-/// Returns the generated HTML.
-///
-/// # Errors
-///
-/// Fails when the output directory cannot be created, the template or an input
-/// file cannot be read, an input is not valid JSON, injection into the template
-/// fails, or the output cannot be written.
-pub fn generate_explorer_html(input_paths: &[PathBuf], output_path: &PathBuf) -> Result<String> {
-    match output_path.parent() {
-        Some(output_dir) if !output_dir.exists() => {
-            fs::create_dir_all(output_dir).with_context(|| {
-                format!("Failed to create output directory: {}", output_dir.display())
-            })?;
-        }
-        _ => {}
-    }
-
-    let template_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("src/explorer.html");
-    let template_content = fs::read_to_string(&template_path).with_context(|| {
-        format!(
-            "Template file not found or couldn't be read: {}",
-            template_path.display()
-        )
-    })?;
-
-    if input_paths.is_empty() {
-        println!(
-            "No input JSON files found to process. Explorer will be generated with template defaults or empty data."
-        );
-    }
-
-    let mut threads = Vec::with_capacity(input_paths.len());
-    for input_path in input_paths {
-        let raw = fs::read_to_string(input_path)
-            .with_context(|| format!("Failed to read file: {}", input_path.display()))?;
-        let parsed = raw.parse::<Value>().with_context(|| {
-            format!("Failed to parse JSON from file: {}", input_path.display())
-        })?;
-
-        let thread_data = match parsed {
-            Value::Object(mut fields) => {
-                fields.insert(
-                    "filename".to_string(),
-                    json!(input_path.display().to_string()),
-                );
-                Value::Object(fields)
-            }
-            other => {
-                eprintln!("Warning: JSON data in {} is not a root object. Wrapping it to include filename.", input_path.display());
-                json!({
-                    "original_data": other,
-                    "filename": input_path.display().to_string()
-                })
-            }
-        };
-        threads.push(thread_data);
-    }
-
-    let all_threads_data = json!({ "threads": threads });
-    let html_content = inject_thread_data(template_content, all_threads_data)?;
-    fs::write(output_path, &html_content)
-        .with_context(|| format!("Failed to write output: {}", output_path.display()))?;
-
-    println!(
-        "Saved data from {} resolved file(s) ({} threads) to {}",
-        input_paths.len(),
-        threads.len(),
-        output_path.display()
-    );
-    Ok(html_content)
-}
-
-/// Replaces the template's `threadsData` placeholder statement with the given data.
-///
-/// The first occurrence of the exact placeholder statement is swapped for
-/// `let threadsData = <json>;`, where `<json>` is the pretty-printed `threads_data`.
-///
-/// # Errors
-///
-/// Fails when the template does not contain the placeholder, or when the data
-/// cannot be serialized.
-fn inject_thread_data(template: String, threads_data: Value) -> Result<String> {
-    let injection_marker = "let threadsData = window.threadsData || { threads: [dummyThread] };";
-    if !template.contains(injection_marker) {
-        anyhow::bail!(
-            "Could not find the thread injection point in the template. Expected: '{}'",
-            injection_marker
-        );
-    }
-
-    // The JSON ends up inside an inline <script>, so thread content must not be able
-    // to close the element or open an HTML comment (`</script>`, `</SCRIPT >`,
-    // `<!--`). In serialized JSON a `<` can only occur inside a string literal, where
-    // `\u003c` denotes the same character, so escaping every `<` keeps the data
-    // unchanged while hiding it from the HTML parser.
-    let threads_json_string = serde_json::to_string_pretty(&threads_data)
-        .context("Failed to serialize threads data to JSON")?
-        .replace('<', r"\u003c");
-
-    let script_injection = format!("let threadsData = {};", threads_json_string);
-    let final_html = template.replacen(injection_marker, &script_injection, 1);
-
-    Ok(final_html)
-}
-
-/// Entry point of the explorer generator.
-///
-/// Resolves every `--input` argument to a list of JSON files: a file is taken
-/// as-is, a directory is searched for `last.messages.json` within
-/// `MAX_SEARCH_DEPTH` levels, and a path that does not exist is skipped with a
-/// warning. The list is sorted and de-duplicated, then passed to
-/// `generate_explorer_html`.
-#[cfg(not(any(test, doctest)))]
-#[allow(dead_code)]
-fn main() -> Result<()> {
-    let args = Args::parse();
-
-    const DEFAULT_FILENAME: &str = "last.messages.json";
-    const MAX_SEARCH_DEPTH: u8 = 2;
-
-    let mut resolved_input_files: Vec<PathBuf> = Vec::new();
-
-    for input_path_arg in &args.input {
-        if !input_path_arg.exists() {
-            eprintln!(
-                "Warning: Input path {} does not exist. Skipping.",
-                input_path_arg.display()
-            );
-            continue;
-        }
-
-        if input_path_arg.is_dir() {
-            find_target_files_recursive(
-                input_path_arg,
-                DEFAULT_FILENAME,
-                0, // starting depth
-                MAX_SEARCH_DEPTH,
-                &mut resolved_input_files,
-            )
-            .with_context(|| {
-                format!(
-                    "Error searching for '{}' files in directory: {}",
-                    DEFAULT_FILENAME,
-                    input_path_arg.display()
-                )
-            })?;
-        } else if input_path_arg.is_file() {
-            resolved_input_files.push(input_path_arg.clone());
-        }
-    }
-
-    resolved_input_files.sort_unstable();
-    resolved_input_files.dedup();
-
-    // Only report missing inputs when nothing was actually resolved.
-    if resolved_input_files.is_empty() {
-        println!("No input paths provided/found.");
-    }
-
-    generate_explorer_html(&resolved_input_files, &args.output).map(|_| ())
-}

crates/eval/src/ids.rs 🔗

@@ -1,29 +0,0 @@
-use anyhow::{Context as _, Result};
-use std::fs;
-use std::path::{Path, PathBuf};
-use uuid::Uuid;
-
-/// Returns the ID stored at `path`, trimmed of surrounding whitespace.
-///
-/// When the file cannot be read or holds only whitespace, a new UUID v4 is
-/// generated, written to `path` (creating parent directories as needed) and
-/// returned.
-///
-/// # Errors
-///
-/// Fails when `path` has no parent, or when creating the directory or writing the
-/// file fails.
-pub fn get_or_create_id(path: &Path) -> Result<String> {
-    let stored = fs::read_to_string(path)
-        .ok()
-        .map(|contents| contents.trim().to_string())
-        .filter(|id| !id.is_empty());
-    if let Some(id) = stored {
-        return Ok(id);
-    }
-
-    let fresh_id = Uuid::new_v4().to_string();
-    let parent_dir = path.parent().context("invalid id path")?;
-    fs::create_dir_all(parent_dir)?;
-    fs::write(path, &fresh_id)?;
-    Ok(fresh_id)
-}
-
-/// Path of the `zed-eval-system-id` file inside the local data directory reported
-/// by `dirs`, or inside `.` when none is reported.
-pub fn eval_system_id_path() -> PathBuf {
-    let base_dir = match dirs::data_local_dir() {
-        Some(dir) => dir,
-        None => PathBuf::from("."),
-    };
-    base_dir.join("zed-eval-system-id")
-}
-
-/// Path of the `zed-eval-installation-id` file inside the local data directory
-/// reported by `dirs`, or inside `.` when none is reported.
-pub fn eval_installation_id_path() -> PathBuf {
-    let base_dir = match dirs::data_local_dir() {
-        Some(dir) => dir,
-        None => PathBuf::from("."),
-    };
-    base_dir.join("zed-eval-installation-id")
-}

crates/eval/src/instance.rs 🔗

@@ -1,1446 +0,0 @@
-use agent::ContextServerRegistry;
-use agent_client_protocol as acp;
-use anyhow::{Context as _, Result, anyhow, bail};
-use client::proto::LspWorkProgress;
-use futures::channel::mpsc;
-use futures::future::Shared;
-use futures::{FutureExt as _, StreamExt as _, future};
-use gpui::{App, AppContext as _, AsyncApp, Entity, Task};
-use handlebars::Handlebars;
-use language::{Buffer, DiagnosticSeverity, OffsetRangeExt as _};
-use language_model::{
-    LanguageModel, LanguageModelCompletionEvent, LanguageModelRegistry, LanguageModelRequest,
-    LanguageModelRequestMessage, LanguageModelToolResultContent, MessageContent, Role, TokenUsage,
-};
-use project::{DiagnosticSummary, Project, ProjectPath, lsp_store::OpenLspBufferHandle};
-use prompt_store::{ProjectContext, WorktreeContext};
-use rand::{distr, prelude::*};
-use serde::{Deserialize, Serialize};
-use std::{
-    fmt::Write as _,
-    fs::{self, File},
-    io::Write as _,
-    path::{Path, PathBuf},
-    rc::Rc,
-    sync::{Arc, Mutex},
-    time::Duration,
-};
-use unindent::Unindent as _;
-use util::{ResultExt as _, command::new_command, markdown::MarkdownCodeBlock};
-
-use crate::{
-    AgentAppState, ToolMetrics,
-    assertions::{AssertionsReport, RanAssertion, RanAssertionResult},
-    example::{Example, ExampleContext, FailedAssertion, JudgeAssertion},
-};
-
-/// Clone URL of the Zed repository. `ExampleInstance::setup` compares an example's
-/// URL against it to decide whether to copy the local `.rules` file into the worktree.
-pub const ZED_REPO_URL: &str = "https://github.com/zed-industries/zed.git";
-
-/// One repetition of an [`Example`], together with the directories it runs in.
-#[derive(Clone)]
-pub struct ExampleInstance {
-    /// The example definition this instance runs.
-    pub thread: Rc<dyn Example>,
-    /// Example name, taken from `thread.meta().name`.
-    pub name: String,
-    /// `<run_dir>/<name>/<repetition>`; created by `setup`.
-    pub run_directory: PathBuf,
-    /// Prefix for this instance's log lines; empty until `set_log_prefix_style` is called.
-    pub log_prefix: String,
-    /// The repetition number for this example (0-based)
-    /// When running multiple repetitions of the same example, each instance is assigned a unique repetition number.
-    /// This affects the worktree path and log prefix to avoid clobbering results between runs.
-    pub repetition: usize,
-    /// Local repository path, derived from `repos_dir` and the example's URL.
-    pub repo_path: PathBuf,
-    /// Root directory under which this instance's worktree lives; `worktree_path`
-    /// joins it with the worktree name and the repository name.
-    /// NOTE(review): this comment used to describe a requests/responses directory,
-    /// which does not match how the field is used in the visible code — confirm.
-    worktrees_dir: PathBuf,
-}
-
-/// Serializable result of `ExampleInstance::run`.
-// NOTE(review): field meanings below are read off the names and types only; the
-// code that fills them is outside this view — confirm before relying on them.
-#[derive(Debug, Serialize, Clone)]
-pub struct RunOutput {
-    pub repository_diff: String,
-    pub diagnostic_summary_before: DiagnosticSummary,
-    pub diagnostic_summary_after: DiagnosticSummary,
-    // `None` presumably means no diagnostics were reported — TODO confirm.
-    pub diagnostics_before: Option<String>,
-    pub diagnostics_after: Option<String>,
-    pub token_usage: TokenUsage,
-    pub tool_metrics: ToolMetrics,
-    pub thread_markdown: String,
-    pub programmatic_assertions: AssertionsReport,
-}
-
-/// A repository diff paired with one assertion about it.
-// NOTE(review): presumably the input handed to the judge for diff assertions — confirm.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct JudgeDiffInput {
-    pub repository_diff: String,
-    pub assertion: String,
-}
-
-/// A thread's messages (as one string) paired with one assertion about them.
-// NOTE(review): presumably the input handed to the judge for thread assertions — confirm.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct JudgeThreadInput {
-    pub messages: String,
-    pub assertion: String,
-}
-
-/// Assertion reports for the thread and for the diff, kept separately.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct JudgeOutput {
-    pub thread: AssertionsReport,
-    pub diff: AssertionsReport,
-}
-
-impl ExampleInstance {
-    /// Creates the instance of `thread` for the given `repetition`.
-    ///
-    /// The run directory is `<run_dir>/<name>/<repetition>` and the repository path
-    /// is derived from `repos_dir` and the example's URL. The log prefix starts empty.
-    pub fn new(
-        thread: Rc<dyn Example>,
-        repos_dir: &Path,
-        run_dir: &Path,
-        worktrees_dir: &Path,
-        repetition: usize,
-    ) -> Self {
-        let name = thread.meta().name;
-        let repo_path = repo_path_for_url(repos_dir, &thread.meta().url);
-
-        let mut run_directory = run_dir.join(&name);
-        run_directory.push(repetition.to_string());
-
-        Self {
-            thread,
-            name,
-            run_directory,
-            log_prefix: String::new(),
-            repetition,
-            repo_path,
-            worktrees_dir: worktrees_dir.to_owned(),
-        }
-    }
-
-    /// URL of the example's repository, as given by its metadata.
-    pub fn repo_url(&self) -> String {
-        let meta = self.thread.meta();
-        meta.url
-    }
-
-    /// Git revision of the example, as given by its metadata.
-    pub fn revision(&self) -> String {
-        let meta = self.thread.meta();
-        meta.revision
-    }
-
-    /// Name of this instance's worktree: `<name>-<repetition>`.
-    pub fn worktree_name(&self) -> String {
-        let Self {
-            name, repetition, ..
-        } = self;
-        format!("{}-{}", name, repetition)
-    }
-
-    /// Sets the log prefix to `color`, then the worktree name left-aligned and padded
-    /// to `name_width`, then the `ESC[0m` sequence and a ` | ` separator.
-    pub fn set_log_prefix_style(&mut self, color: &str, name_width: usize) {
-        let label = self.worktree_name();
-        self.log_prefix = format!("{}{:<width$}\x1b[0m | ", color, label, width = name_width);
-    }
-
-    /// Makes sure the example's revision is available in the local repository.
-    ///
-    /// Nothing is fetched when `git rev-parse <revision>^{commit}` succeeds;
-    /// otherwise the revision is fetched from `origin` with depth 1.
-    pub async fn fetch(&mut self) -> Result<()> {
-        let meta = self.thread.meta();
-        let commit_spec = format!("{}^{{commit}}", &meta.revision);
-
-        let already_present = run_git(&self.repo_path, &["rev-parse", &commit_spec])
-            .await
-            .is_ok();
-        if already_present {
-            return Ok(());
-        }
-
-        println!("{}Fetching revision {}", self.log_prefix, &meta.revision);
-        run_git(
-            &self.repo_path,
-            &["fetch", "--depth", "1", "origin", &meta.revision],
-        )
-        .await?;
-        Ok(())
-    }
-
-    /// Prepares the worktree and run directory for this instance.
-    ///
-    /// An existing worktree is cleaned, hard-reset and checked out at the example's
-    /// revision; otherwise a new worktree is added at that revision. For the Zed
-    /// repository, the `.rules` file from the current directory is copied into the
-    /// worktree. Finally the run directory is created.
-    pub async fn setup(&mut self) -> Result<()> {
-        let worktree_path = self.worktree_path();
-        let meta = self.thread.meta();
-
-        if !worktree_path.is_dir() {
-            println!("{}Creating worktree", self.log_prefix);
-
-            let worktree_arg = worktree_path.to_string_lossy().into_owned();
-            let add_args = ["worktree", "add", "-f", &worktree_arg, &meta.revision];
-            run_git(&self.repo_path, &add_args).await?;
-        } else {
-            println!("{}Resetting existing worktree", self.log_prefix);
-
-            // TODO: consider including "-x" to remove ignored files. The downside of this is that
-            // it will also remove build artifacts, and so prevent incremental reuse there.
-            run_git(&worktree_path, &["clean", "--force", "-d"]).await?;
-            run_git(&worktree_path, &["reset", "--hard", "HEAD"]).await?;
-            run_git(&worktree_path, &["checkout", &meta.revision]).await?;
-        }
-
-        if meta.url == ZED_REPO_URL {
-            let rules_destination = worktree_path.join(".rules");
-            let rules_contents = std::fs::read(".rules")?;
-            std::fs::write(rules_destination, rules_contents)?;
-        }
-
-        std::fs::create_dir_all(&self.run_directory)?;
-
-        Ok(())
-    }
-
-    /// Location of this instance's worktree:
-    /// `<worktrees_dir>/<worktree_name>/<repo_name>`.
-    pub fn worktree_path(&self) -> PathBuf {
-        let mut path = self.worktrees_dir.join(self.worktree_name());
-        path.push(self.thread.meta().repo_name());
-        path
-    }
-
-    pub fn run(&self, app_state: Arc<AgentAppState>, cx: &mut App) -> Task<Result<RunOutput>> {
-        let project = Project::local(
-            app_state.client.clone(),
-            app_state.node_runtime.clone(),
-            app_state.user_store.clone(),
-            app_state.languages.clone(),
-            app_state.fs.clone(),
-            None,
-            project::LocalProjectFlags {
-                init_worktree_trust: false,
-                ..Default::default()
-            },
-            cx,
-        );
-
-        let worktree = project.update(cx, |project, cx| {
-            project.create_worktree(self.worktree_path(), true, cx)
-        });
-
-        let meta = self.thread.meta();
-        let this = self.clone();
-
-        cx.spawn(async move |cx| {
-            let worktree = worktree.await?;
-
-            // Wait for worktree scan to finish before choosing a file to open.
-            worktree
-                .update(cx, |worktree, _cx| {
-                    worktree.as_local().unwrap().scan_complete()
-                })
-                .await;
-
-            struct LanguageServerState {
-                _lsp_open_handle: OpenLspBufferHandle,
-                language_file_buffer: Entity<Buffer>,
-            }
-
-            let mut diagnostics_before = None;
-            let mut diagnostic_summary_before = DiagnosticSummary::default();
-
-            let lsp = if let Some(language_server) = &meta.language_server {
-                // Open a file that matches the language to cause LSP to start.
-                let language_file = worktree
-                    .read_with(cx, |worktree, _cx| {
-                        worktree
-                            .files(false, 0)
-                            .find_map(|e| {
-                                if e.path.clone().extension()
-                                    == Some(&language_server.file_extension)
-                                {
-                                    Some(ProjectPath {
-                                        worktree_id: worktree.id(),
-                                        path: e.path.clone(),
-                                    })
-                                } else {
-                                    None
-                                }
-                            })
-                            .context("Failed to find a file for example language")
-                    })?;
-
-                let open_language_file_buffer_task = project.update(cx, |project, cx| {
-                    project.open_buffer(language_file.clone(), cx)
-                });
-
-                let language_file_buffer = open_language_file_buffer_task.await?;
-
-                let lsp_open_handle = project.update(cx, |project, cx| {
-                    project.register_buffer_with_language_servers(&language_file_buffer, cx)
-                });
-
-                wait_for_lang_server(&project, &language_file_buffer, this.log_prefix.clone(), cx).await?;
-
-                diagnostic_summary_before = project.read_with(cx, |project, cx| {
-                    project.diagnostic_summary(false, cx)
-                });
-
-                diagnostics_before = query_lsp_diagnostics(project.clone(), cx).await?;
-                if diagnostics_before.is_some() && language_server.allow_preexisting_diagnostics {
-                    anyhow::bail!("Example has pre-existing diagnostics. If you want to run this example regardless, set `allow_preexisting_diagnostics` to `true` in `base.toml`");
-                }
-
-                Some(LanguageServerState {
-                    _lsp_open_handle: lsp_open_handle,
-                    language_file_buffer,
-                })
-            } else {
-                None
-            };
-
-            anyhow::ensure!(std::env::var("ZED_EVAL_SETUP_ONLY").is_err(), "Setup only mode");
-
-            let last_diff_file_path = this.run_directory.join("last.diff");
-
-            // Write an empty "last.diff" so that it can be opened in Zed for convenient view of the
-            // history using undo/redo.
-            std::fs::write(&last_diff_file_path, "")?;
-
-            let thread = cx.update(|cx| {
-                //todo: Do we want to load rules files here?
-                let worktrees = project.read(cx).visible_worktrees(cx).map(|worktree| {
-                    let root_name = worktree.read(cx).root_name_str().into();
-                    let abs_path = worktree.read(cx).abs_path();
-
-                    WorktreeContext {
-                        root_name,
-                        abs_path,
-                        rules_file: None,
-                    }
-                }).collect::<Vec<_>>();
-                let project_context = cx.new(|_cx| ProjectContext::new(worktrees, vec![]));
-                let context_server_registry = cx.new(|cx| ContextServerRegistry::new(project.read(cx).context_server_store(), cx));
-
-                let thread = if let Some(json) = &meta.existing_thread_json {
-                    let session_id = acp::SessionId::new(
-                        rand::rng()
-                            .sample_iter(&distr::Alphanumeric)
-                            .take(7)
-                            .map(char::from)
-                            .collect::<String>(),
-                    );
-
-                    let db_thread = agent::DbThread::from_json(json.as_bytes()).expect("Can't read serialized thread");
-                    cx.new(|cx| agent::Thread::from_db(session_id, db_thread, project.clone(), project_context, context_server_registry, agent::Templates::new(), cx))
-                } else {
-                    cx.new(|cx| agent::Thread::new(project.clone(), project_context, context_server_registry, agent::Templates::new(), None, cx))
-                };
-
-                thread.update(cx, |thread, cx| {
-                    thread.add_default_tools(Rc::new(EvalThreadEnvironment {
-                        project: project.clone(),
-                    }), cx);
-                    thread.set_profile(meta.profile_id.clone(), cx);
-                    thread.set_model(
-                        LanguageModelInterceptor::new(
-                            LanguageModelRegistry::read_global(cx).default_model().expect("Missing model").model.clone(),
-                            this.run_directory.clone(),
-                            last_diff_file_path.clone(),
-                            this.run_directory.join("last.messages.json"),
-                            this.worktree_path(),
-                            this.repo_url(),
-                        ),
-                        cx,
-                    );
-                });
-
-                thread
-            });
-
-            let mut example_cx = ExampleContext::new(
-                meta.clone(),
-                this.log_prefix.clone(),
-                thread.clone(),
-                cx.clone(),
-            );
-            let result = this.thread.conversation(&mut example_cx).await;
-
-            if let Err(err) = result
-                && !err.is::<FailedAssertion>() {
-                    return Err(err);
-                }
-
-            println!("{}Stopped", this.log_prefix);
-
-            println!("{}Getting repository diff", this.log_prefix);
-            let repository_diff = Self::repository_diff(this.worktree_path(), &this.repo_url()).await?;
-
-            std::fs::write(last_diff_file_path, &repository_diff)?;
-
-
-            let mut diagnostics_after = None;
-            let mut diagnostic_summary_after = Default::default();
-
-            if let Some(language_server_state) = lsp {
-                wait_for_lang_server(&project, &language_server_state.language_file_buffer, this.log_prefix.clone(), cx).await?;
-
-                println!("{}Getting diagnostics", this.log_prefix);
-                diagnostics_after = cx
-                    .update(|cx| {
-                        let project = project.clone();
-                        cx.spawn(async move |cx| query_lsp_diagnostics(project, cx).await)
-                    })
-                    .await?;
-                println!("{}Got diagnostics", this.log_prefix);
-
-                diagnostic_summary_after = project.read_with(cx, |project, cx| {
-                    project.diagnostic_summary(false, cx)
-                });
-
-            }
-
-            if let Some(diagnostics_before) = &diagnostics_before {
-                fs::write(this.run_directory.join("diagnostics_before.txt"), diagnostics_before)?;
-            }
-
-            if let Some(diagnostics_after) = &diagnostics_after {
-                fs::write(this.run_directory.join("diagnostics_after.txt"), diagnostics_after)?;
-            }
-
-            Ok(thread.update(cx, |thread, _cx| {
-                RunOutput {
-                    repository_diff,
-                    diagnostic_summary_before,
-                    diagnostic_summary_after,
-                    diagnostics_before,
-                    diagnostics_after,
-                    token_usage: thread.latest_request_token_usage().unwrap(),
-                    tool_metrics: example_cx.tool_metrics.lock().unwrap().clone(),
-                    thread_markdown: thread.to_markdown(),
-                    programmatic_assertions: example_cx.assertions,
-                }
-            }))
-        })
-    }
-
-    /// Stages every change in the repository and returns the resulting staged diff.
-    ///
-    /// When `repository_url` equals `ZED_REPO_URL`, the `.rules` path is excluded
-    /// from the diff through a git pathspec.
-    async fn repository_diff(repository_path: PathBuf, repository_url: &str) -> Result<String> {
-        run_git(&repository_path, &["add", "."]).await?;
-
-        let diff_args: &[&str] = if repository_url == ZED_REPO_URL {
-            &["diff", "--staged", ":(exclude).rules"]
-        } else {
-            &["diff", "--staged"]
-        };
-        run_git(&repository_path, diff_args).await
-    }
-
-    /// Runs the diff judge and the thread judge concurrently and writes both raw
-    /// responses to `judge.md` in the run directory.
-    ///
-    /// # Panics
-    ///
-    /// Panics if `judge.md` cannot be created. A failure while writing the
-    /// responses is handed to `log_err` instead of being propagated.
-    pub async fn judge(
-        &self,
-        model: Arc<dyn LanguageModel>,
-        run_output: &RunOutput,
-        cx: &AsyncApp,
-    ) -> JudgeOutput {
-        let judge_file_path = self.run_directory.join("judge.md");
-        let mut judge_file = File::create(judge_file_path).expect("failed to create judge.md");
-
-        let ((diff_response, diff), (thread_response, thread)) = futures::join!(
-            self.judge_diff(model.clone(), run_output, cx),
-            self.judge_thread(model.clone(), run_output, cx),
-        );
-
-        writeln!(
-            &mut judge_file,
-            "# Judgment\n\n## Thread\n\n{thread_response}\n\n## Diff\n\n{diff_response}",
-        )
-        .log_err();
-
-        JudgeOutput { thread, diff }
-    }
-
-    /// Judges the repository diff against the example's diff assertions.
-    ///
-    /// Returns the concatenated judge responses and the assertion report. When the
-    /// example has no diff assertions, returns a placeholder message and an empty
-    /// report without contacting the model.
-    async fn judge_diff(
-        &self,
-        model: Arc<dyn LanguageModel>,
-        run_output: &RunOutput,
-        cx: &AsyncApp,
-    ) -> (String, AssertionsReport) {
-        const TEMPLATE_NAME: &str = "judge_diff_prompt";
-
-        let assertions = self.thread.diff_assertions();
-        if assertions.is_empty() {
-            let message = "No diff assertions".to_string();
-            return (message, AssertionsReport::default());
-        }
-
-        println!("{}Running diff judge", self.log_prefix);
-
-        let mut handlebars = Handlebars::new();
-        handlebars
-            .register_template_string(TEMPLATE_NAME, include_str!("judge_diff_prompt.hbs"))
-            .unwrap();
-
-        // Each assertion is rendered into its own prompt together with the full diff.
-        let render_prompt = |assertion: String| {
-            let input = JudgeDiffInput {
-                repository_diff: run_output.repository_diff.clone(),
-                assertion,
-            };
-            handlebars.render(TEMPLATE_NAME, &input).unwrap()
-        };
-
-        let (responses, report) = self
-            .judge_assertions(model, assertions, render_prompt, cx)
-            .await;
-
-        println!(
-            "{}Judge - Diff score: {}%",
-            self.log_prefix,
-            report.passed_percentage()
-        );
-
-        (responses, report)
-    }
-
-    /// Judges the thread transcript against the example's thread assertions.
-    ///
-    /// Returns the concatenated judge responses and the assertion report. When the
-    /// example has no thread assertions, returns a placeholder message and an empty
-    /// report without contacting the model.
-    async fn judge_thread(
-        &self,
-        model: Arc<dyn LanguageModel>,
-        run_output: &RunOutput,
-        cx: &AsyncApp,
-    ) -> (String, AssertionsReport) {
-        const TEMPLATE_NAME: &str = "judge_thread_prompt";
-
-        let assertions = self.thread.thread_assertions();
-        if assertions.is_empty() {
-            let message = "No thread assertions".to_string();
-            return (message, AssertionsReport::default());
-        }
-
-        let mut handlebars = Handlebars::new();
-        handlebars
-            .register_template_string(TEMPLATE_NAME, include_str!("judge_thread_prompt.hbs"))
-            .unwrap();
-
-        // Each assertion is rendered into its own prompt together with the thread markdown.
-        let render_prompt = |assertion: String| {
-            let input = JudgeThreadInput {
-                messages: run_output.thread_markdown.clone(),
-                assertion,
-            };
-            handlebars.render(TEMPLATE_NAME, &input).unwrap()
-        };
-
-        let (responses, report) = self
-            .judge_assertions(model, assertions, render_prompt, cx)
-            .await;
-
-        println!(
-            "{}Judge - Thread score: {}%",
-            self.log_prefix,
-            report.passed_percentage()
-        );
-
-        (responses, report)
-    }
-
-    /// Sends one judge request per assertion, all concurrently, and collects the results.
-    ///
-    /// `to_prompt` turns an assertion description into the prompt text sent to `model`.
-    ///
-    /// Returns the raw responses concatenated into one string (a `# <id>` heading
-    /// followed by the response for every assertion) and a report holding every
-    /// assertion that was run. A failed request or an unparsable response is stored
-    /// as an `Err` result for that assertion; it does not abort the other requests.
-    async fn judge_assertions(
-        &self,
-        model: Arc<dyn LanguageModel>,
-        assertions: Vec<JudgeAssertion>,
-        to_prompt: impl Fn(String) -> String,
-        cx: &AsyncApp,
-    ) -> (String, AssertionsReport) {
-        let assertions = assertions.into_iter().map(|assertion| {
-            let request = LanguageModelRequest {
-                thread_id: None,
-                prompt_id: None,
-                intent: None,
-                messages: vec![LanguageModelRequestMessage {
-                    role: Role::User,
-                    content: vec![MessageContent::Text(to_prompt(assertion.description))],
-                    cache: false,
-                    reasoning_details: None,
-                }],
-                temperature: None,
-                tools: Vec::new(),
-                tool_choice: None,
-                stop: Vec::new(),
-                thinking_allowed: true,
-                thinking_effort: None,
-                speed: None,
-            };
-
-            let model = model.clone();
-            let log_prefix = self.log_prefix.clone();
-            async move {
-                let (response, result) =
-                    match send_language_model_request(model, request, cx).await {
-                        Ok(response) => {
-                            let parsed =
-                                parse_assertion_result(&response).map_err(|err| err.to_string());
-                            (response, parsed)
-                        }
-                        Err(err) => (err.to_string(), Err(err.to_string())),
-                    };
-
-                // Show the check mark only when the judge actually reported a pass.
-                // Request failures, unparsable responses and `passed == false` are all
-                // logged as failures so the console matches the reported score.
-                let passed = matches!(&result, Ok(outcome) if outcome.passed);
-                if passed {
-                    println!("{}βœ… {}", log_prefix, assertion.id);
-                } else {
-                    println!("{}❌ {}", log_prefix, assertion.id);
-                }
-
-                (
-                    response,
-                    RanAssertion {
-                        id: assertion.id,
-                        result,
-                    },
-                )
-            }
-        });
-
-        let mut responses = String::new();
-        let mut report = AssertionsReport::default();
-
-        for (response, assertion) in future::join_all(assertions).await {
-            writeln!(&mut responses, "# {}", assertion.id).unwrap();
-            writeln!(&mut responses, "{}\n\n", response).unwrap();
-            report.ran.push(assertion);
-        }
-
-        (responses, report)
-    }
-}
-
-/// Thread environment for eval runs; holds the project its terminals are created in.
-struct EvalThreadEnvironment {
-    project: Entity<Project>,
-}
-
-/// Handle to a single `acp_thread::Terminal` entity created during an eval run.
-struct EvalTerminalHandle {
-    terminal: Entity<acp_thread::Terminal>,
-}
-
-/// Every method forwards to the wrapped terminal entity and always returns `Ok`.
-impl agent::TerminalHandle for EvalTerminalHandle {
-    /// Returns a clone of the terminal's identifier.
-    fn id(&self, cx: &AsyncApp) -> Result<acp::TerminalId> {
-        let terminal_id = self
-            .terminal
-            .read_with(cx, |terminal, _| terminal.id().clone());
-        Ok(terminal_id)
-    }
-
-    /// Returns the terminal's shared exit-status task.
-    fn wait_for_exit(&self, cx: &AsyncApp) -> Result<Shared<Task<acp::TerminalExitStatus>>> {
-        let exit_task = self
-            .terminal
-            .read_with(cx, |terminal, _| terminal.wait_for_exit());
-        Ok(exit_task)
-    }
-
-    /// Returns the output the terminal currently reports.
-    fn current_output(&self, cx: &AsyncApp) -> Result<acp::TerminalOutputResponse> {
-        let output = self
-            .terminal
-            .read_with(cx, |terminal, cx| terminal.current_output(cx));
-        Ok(output)
-    }
-
-    /// Asks the terminal to kill itself.
-    fn kill(&self, cx: &AsyncApp) -> Result<()> {
-        cx.update(|cx| {
-            self.terminal.update(cx, |term, cx| {
-                term.kill(cx);
-            });
-        });
-        Ok(())
-    }
-
-    /// Reports the terminal's `was_stopped_by_user` flag.
-    fn was_stopped_by_user(&self, cx: &AsyncApp) -> Result<bool> {
-        let stopped = self
-            .terminal
-            .read_with(cx, |terminal, _| terminal.was_stopped_by_user());
-        Ok(stopped)
-    }
-}
-
-impl agent::ThreadEnvironment for EvalThreadEnvironment {
-    /// Spawns a task that creates a terminal for `command` in this environment's
-    /// project and resolves to a handle for it.
-    ///
-    /// `cwd` is passed both to the terminal entity and to the `acp_thread::Terminal`
-    /// wrapper. `output_byte_limit`, when present, is forwarded as a `usize`. The
-    /// terminal gets a freshly generated UUID as its id.
-    fn create_terminal(
-        &self,
-        command: String,
-        cwd: Option<PathBuf>,
-        output_byte_limit: Option<u64>,
-        cx: &mut AsyncApp,
-    ) -> Task<Result<Rc<dyn agent::TerminalHandle>>> {
-        let project = self.project.clone();
-        cx.spawn(async move |cx| {
-            let languages = project.read_with(cx, |project, _| project.languages().clone());
-            let terminal_id = acp::TerminalId::new(uuid::Uuid::new_v4().to_string());
-            let byte_limit = output_byte_limit.map(|limit| limit as usize);
-
-            let terminal_entity =
-                acp_thread::create_terminal_entity(command, &[], vec![], cwd.clone(), &project, cx)
-                    .await?;
-            let terminal = cx.new(|cx| {
-                acp_thread::Terminal::new(
-                    terminal_id,
-                    "",
-                    cwd,
-                    byte_limit,
-                    terminal_entity,
-                    languages,
-                    cx,
-                )
-            });
-
-            let handle: Rc<dyn agent::TerminalHandle> = Rc::new(EvalTerminalHandle { terminal });
-            Ok(handle)
-        })
-    }
-
-    /// Subagents are not supported by the eval environment; calling this panics.
-    fn create_subagent(
-        &self,
-        _label: String,
-        _cx: &mut App,
-    ) -> Result<Rc<dyn agent::SubagentHandle>> {
-        unimplemented!()
-    }
-}
-
-/// Wraps a language model and records every request/response exchange, plus the
-/// repository diff at that moment, as files on disk.
-struct LanguageModelInterceptor {
-    // The wrapped model; all trait methods forward to it.
-    model: Arc<dyn LanguageModel>,
-    // Running counter used to number the per-request output files.
-    request_count: Arc<Mutex<usize>>,
-    // Last diff written to disk, used to skip writing an unchanged diff.
-    previous_diff: Arc<Mutex<String>>,
-    // Directory that receives the numbered `*.messages.md` / `*.diff` files and `tools.md`.
-    example_output_dir: PathBuf,
-    // File overwritten with the most recent diff (or diff error).
-    last_diff_file_path: PathBuf,
-    // File overwritten with the most recent request/response dialog as JSON.
-    messages_json_file_path: PathBuf,
-    // Repository whose staged diff is captured after each response.
-    repository_path: PathBuf,
-    // Passed to `ExampleInstance::repository_diff` along with `repository_path`.
-    repository_url: String,
-}
-
-impl LanguageModelInterceptor {
-    /// Builds an interceptor around `model` with a zeroed request counter and an
-    /// empty previous diff, and returns it behind an `Arc`.
-    fn new(
-        model: Arc<dyn LanguageModel>,
-        example_output_dir: PathBuf,
-        last_diff_file_path: PathBuf,
-        messages_json_file_path: PathBuf,
-        repository_path: PathBuf,
-        repository_url: String,
-    ) -> Arc<Self> {
-        let interceptor = Self {
-            model,
-            request_count: Arc::new(Mutex::new(0)),
-            previous_diff: Arc::new(Mutex::new(String::new())),
-            example_output_dir,
-            last_diff_file_path,
-            messages_json_file_path,
-            repository_path,
-            repository_url,
-        };
-        Arc::new(interceptor)
-    }
-}
-
-/// `LanguageModel` implementation that forwards everything to the wrapped model.
-/// Only `stream_completion` adds behavior: it records each exchange to disk.
-impl language_model::LanguageModel for LanguageModelInterceptor {
-    fn id(&self) -> language_model::LanguageModelId {
-        self.model.id()
-    }
-
-    fn name(&self) -> language_model::LanguageModelName {
-        self.model.name()
-    }
-
-    fn provider_id(&self) -> language_model::LanguageModelProviderId {
-        self.model.provider_id()
-    }
-
-    fn provider_name(&self) -> language_model::LanguageModelProviderName {
-        self.model.provider_name()
-    }
-
-    fn telemetry_id(&self) -> String {
-        self.model.telemetry_id()
-    }
-
-    fn supports_images(&self) -> bool {
-        self.model.supports_images()
-    }
-
-    fn supports_tools(&self) -> bool {
-        self.model.supports_tools()
-    }
-
-    fn supports_tool_choice(&self, choice: language_model::LanguageModelToolChoice) -> bool {
-        self.model.supports_tool_choice(choice)
-    }
-
-    fn max_token_count(&self) -> u64 {
-        self.model.max_token_count()
-    }
-
-    fn count_tokens(
-        &self,
-        request: LanguageModelRequest,
-        cx: &App,
-    ) -> future::BoxFuture<'static, Result<u64>> {
-        self.model.count_tokens(request, cx)
-    }
-
-    /// Streams a completion from the wrapped model, passing every event through
-    /// unchanged while recording the exchange.
-    ///
-    /// Each event is appended to an in-memory list. When a `Stop` event or an error
-    /// arrives, the request counter is incremented to `n` and the following files
-    /// are written:
-    ///
-    /// - `<n>.messages.md` and `last.messages.md` in the output directory: the
-    ///   request messages followed by the response events, as markdown;
-    /// - `messages_json_file_path`: the combined dialog as pretty-printed JSON;
-    /// - `<n>.diff` and `last_diff_file_path`: the repository diff if it changed since
-    ///   the previous write, or the error text if computing the diff failed;
-    /// - `tools.md`: the request's tools, on the first counted request only.
-    ///
-    /// # Panics
-    ///
-    /// Panics if any of these files cannot be written or if a mutex is poisoned.
-    fn stream_completion(
-        &self,
-        request: LanguageModelRequest,
-        cx: &AsyncApp,
-    ) -> future::BoxFuture<
-        'static,
-        Result<
-            futures::stream::BoxStream<
-                'static,
-                Result<LanguageModelCompletionEvent, language_model::LanguageModelCompletionError>,
-            >,
-            language_model::LanguageModelCompletionError,
-        >,
-    > {
-        let stream = self.model.stream_completion(request.clone(), cx);
-        // Clone everything the returned `'static` future needs, since it cannot borrow `self`.
-        let request_count = self.request_count.clone();
-        let previous_diff = self.previous_diff.clone();
-        let example_output_dir = self.example_output_dir.clone();
-        let last_diff_file_path = self.last_diff_file_path.clone();
-        let messages_json_file_path = self.messages_json_file_path.clone();
-        let repository_path = self.repository_path.clone();
-        let repository_url = self.repository_url.clone();
-
-        Box::pin(async move {
-            let stream = stream.await?;
-
-            // Events seen so far for this request, shared across the per-event futures.
-            let response_events = Arc::new(Mutex::new(Vec::new()));
-            let request_clone = request.clone();
-
-            let wrapped_stream = stream.then(move |event| {
-                // The closure runs once per event, so each per-event future gets its own clones.
-                let response_events = response_events.clone();
-                let request = request_clone.clone();
-                let request_count = request_count.clone();
-                let previous_diff = previous_diff.clone();
-                let example_output_dir = example_output_dir.clone();
-                let last_diff_file_path = last_diff_file_path.clone();
-                let messages_json_file_path = messages_json_file_path.clone();
-                let repository_path = repository_path.clone();
-                let repository_url = repository_url.clone();
-
-                async move {
-                    // Record a copy of the event; errors are stored as their string form.
-                    let event_result = match &event {
-                        Ok(ev) => Ok(ev.clone()),
-                        Err(err) => Err(err.to_string()),
-                    };
-                    response_events.lock().unwrap().push(event_result);
-
-                    // Flush to disk only when the response ends, either normally or with an error.
-                    let should_execute = matches!(
-                        &event,
-                        Ok(LanguageModelCompletionEvent::Stop { .. }) | Err(_)
-                    );
-
-                    if should_execute {
-                        // The guard is confined to this block, so it is released before any `.await`.
-                        let current_request_count = {
-                            let mut count = request_count.lock().unwrap();
-                            *count += 1;
-                            *count
-                        };
-
-                        let messages_file_path =
-                            example_output_dir.join(format!("{current_request_count}.messages.md"));
-                        let diff_file_path =
-                            example_output_dir.join(format!("{current_request_count}.diff"));
-                        let last_messages_file_path = example_output_dir.join("last.messages.md");
-
-                        let collected_events = response_events.lock().unwrap().clone();
-                        let request_markdown = RequestMarkdown::new(&request);
-                        let response_events_markdown =
-                            response_events_to_markdown(&collected_events);
-                        let dialog = ThreadDialog::new(&request, &collected_events);
-                        // A serialization failure results in an empty JSON file rather than a panic.
-                        let dialog_json =
-                            serde_json::to_string_pretty(&dialog.to_combined_request())
-                                .unwrap_or_default();
-
-                        let messages = format!(
-                            "{}\n\n{}",
-                            request_markdown.messages, response_events_markdown
-                        );
-                        fs::write(&messages_file_path, messages.clone())
-                            .expect("failed to write messages file");
-                        fs::write(&last_messages_file_path, messages)
-                            .expect("failed to write last messages file");
-                        fs::write(&messages_json_file_path, dialog_json)
-                            .expect("failed to write last.messages.json");
-
-                        // Capture the repository diff as it stands after this response.
-                        let diff_result =
-                            ExampleInstance::repository_diff(repository_path, &repository_url)
-                                .await;
-
-                        match diff_result {
-                            Ok(diff) => {
-                                // Skip the write when nothing changed since the last recorded diff.
-                                let prev_diff = previous_diff.lock().unwrap().clone();
-                                if diff != prev_diff {
-                                    fs::write(&diff_file_path, &diff)
-                                        .expect("failed to write diff file");
-                                    fs::write(&last_diff_file_path, &diff)
-                                        .expect("failed to write last diff file");
-                                    *previous_diff.lock().unwrap() = diff;
-                                }
-                            }
-                            Err(err) => {
-                                // Persist the error text in place of the diff so the failure is visible.
-                                let error_message = format!("{err:?}");
-                                fs::write(&diff_file_path, &error_message)
-                                    .expect("failed to write diff error to file");
-                                fs::write(&last_diff_file_path, &error_message)
-                                    .expect("failed to write last diff file");
-                            }
-                        }
-
-                        // The tool list is written once, on the first counted request.
-                        if current_request_count == 1 {
-                            let tools_file_path = example_output_dir.join("tools.md");
-                            fs::write(tools_file_path, request_markdown.tools)
-                                .expect("failed to write tools file");
-                        }
-                    }
-
-                    // Hand the original event on to the consumer untouched.
-                    event
-                }
-            });
-
-            Ok(Box::pin(wrapped_stream)
-                as futures::stream::BoxStream<
-                    'static,
-                    Result<
-                        LanguageModelCompletionEvent,
-                        language_model::LanguageModelCompletionError,
-                    >,
-                >)
-        })
-    }
-}
-
-/// Returns a task that resolves once the project emits `DiskBasedDiagnosticsFinished`.
-///
-/// If the `ZED_EVAL_SKIP_LS` environment variable is set, the task is ready
-/// immediately with `Ok(())`. Otherwise the task fails with an error if no such
-/// event is observed within five minutes.
-///
-/// While waiting, language-server work-progress messages are printed with
-/// `log_prefix`, and `buffer` is saved whenever a language server is added (and once
-/// up front if one is already running for the buffer).
-pub fn wait_for_lang_server(
-    project: &Entity<Project>,
-    buffer: &Entity<Buffer>,
-    log_prefix: String,
-    cx: &mut AsyncApp,
-) -> Task<Result<()>> {
-    if std::env::var("ZED_EVAL_SKIP_LS").is_ok() {
-        return Task::ready(Ok(()));
-    }
-
-    println!("{}⏡ Waiting for language server", log_prefix);
-
-    // Capacity-1 channel used only as a "diagnostics finished" signal.
-    let (mut tx, mut rx) = mpsc::channel(1);
-
-    let lsp_store = project.read_with(cx, |project, _| project.lsp_store());
-
-    let has_lang_server = buffer.update(cx, |buffer, cx| {
-        lsp_store.update(cx, |lsp_store, cx| {
-            lsp_store
-                .running_language_servers_for_local_buffer(buffer, cx)
-                .next()
-                .is_some()
-        })
-    });
-
-    // NOTE(review): the save presumably prompts an already-running server to
-    // (re)publish diagnostics; confirm against the language server integration.
-    if has_lang_server {
-        project
-            .update(cx, |project, cx| project.save_buffer(buffer.clone(), cx))
-            .detach();
-    }
-
-    // Kept until the spawned task below finishes; it drops them explicitly.
-    let subscriptions =
-        [
-            // Echo language-server progress messages to stdout.
-            cx.subscribe(&lsp_store, {
-                let log_prefix = log_prefix.clone();
-                move |_, event, _| {
-                    if let project::LspStoreEvent::LanguageServerUpdate {
-                        message:
-                            client::proto::update_language_server::Variant::WorkProgress(
-                                LspWorkProgress {
-                                    message: Some(message),
-                                    ..
-                                },
-                            ),
-                        ..
-                    } = event
-                    {
-                        println!("{}⟲ {message}", log_prefix)
-                    }
-                }
-            }),
-            // Save the buffer when a server appears, and signal when diagnostics finish.
-            cx.subscribe(project, {
-                let buffer = buffer.clone();
-                move |project, event, cx| match event {
-                    project::Event::LanguageServerAdded(_, _, _) => {
-                        let buffer = buffer.clone();
-                        project
-                            .update(cx, |project, cx| project.save_buffer(buffer, cx))
-                            .detach();
-                    }
-                    project::Event::DiskBasedDiagnosticsFinished { .. } => {
-                        // A failed send is ignored: one signal is enough to wake the waiter.
-                        tx.try_send(()).ok();
-                    }
-                    _ => {}
-                }
-            }),
-        ];
-
-    cx.spawn(async move |cx| {
-        let timeout = cx.background_executor().timer(Duration::new(60 * 5, 0));
-        let result = futures::select! {
-            _ = rx.next() => {
-                println!("{}βš‘ Language server idle", log_prefix);
-                anyhow::Ok(())
-            },
-            _ = timeout.fuse() => {
-                anyhow::bail!("LSP wait timed out after 5 minutes");
-            }
-        };
-        drop(subscriptions);
-        result
-    })
-}
-
-pub async fn query_lsp_diagnostics(
-    project: Entity<Project>,
-    cx: &mut AsyncApp,
-) -> Result<Option<String>> {
-    let paths_with_diagnostics = project.update(cx, |project, cx| {
-        project
-            .diagnostic_summaries(true, cx)
-            .filter(|(_, _, summary)| summary.error_count > 0 || summary.warning_count > 0)
-            .map(|(project_path, _, _)| project_path)
-            .collect::<Vec<_>>()
-    });
-
-    if paths_with_diagnostics.is_empty() {
-        return Ok(None);
-    }
-
-    let mut output = String::new();
-    for project_path in paths_with_diagnostics {
-        let buffer = project
-            .update(cx, |project, cx| project.open_buffer(project_path, cx))
-            .await?;
-        let snapshot = buffer.read_with(cx, |buffer, _cx| buffer.snapshot());
-
-        for (_, group) in snapshot.diagnostic_groups(None) {
-            let entry = &group.entries[group.primary_ix];
-            let range = entry.range.to_point(&snapshot);
-            let severity = match entry.diagnostic.severity {
-                DiagnosticSeverity::ERROR => "error",
-                DiagnosticSeverity::WARNING => "warning",
-                _ => continue,
-            };
-
-            writeln!(
-                output,
-                "{} at line {}: {}",
-                severity,
-                range.start.row + 1,
-                entry.diagnostic.message
-            )?;
-        }
-    }
-    anyhow::Ok(Some(output))
-}
-
-/// Parses a judge response into a `RanAssertionResult`.
-///
-/// The response must contain an `<analysis>` tag and a `<passed>` tag whose content
-/// is `true` or `false`, compared case-insensitively.
-///
-/// # Errors
-///
-/// Returns an error when either tag is missing or when `<passed>` holds any other
-/// value.
-fn parse_assertion_result(response: &str) -> Result<RanAssertionResult> {
-    let analysis = get_tag("analysis", response)?;
-    let passed = match get_tag("passed", response)?.to_lowercase().as_str() {
-        "true" => true,
-        "false" => false,
-        value => bail!("invalid judge `passed` tag: {value}"),
-    };
-    Ok(RanAssertionResult {
-        analysis: Some(analysis),
-        passed,
-    })
-}
-
-/// Extracts the text between the first `<name>` and the next `</name>` in `response`.
-///
-/// The extracted content is trimmed and then unindented.
-///
-/// # Errors
-///
-/// Returns an error when the start tag is missing, or when no end tag follows it.
-fn get_tag(name: &'static str, response: &str) -> Result<String> {
-    let start_tag = format!("<{}>", name);
-    let end_tag = format!("</{}>", name);
-
-    // `with_context` builds the error message only on the failure path.
-    let start_ix = response
-        .find(&start_tag)
-        .with_context(|| format!("{} start tag not found", name))?;
-    let content_start_ix = start_ix + start_tag.len();
-
-    // The end tag is searched only after the start tag, so its offset is relative.
-    let end_ix = content_start_ix
-        + response[content_start_ix..]
-            .find(&end_tag)
-            .with_context(|| format!("{} end tag not found", name))?;
-
-    let content = response[content_start_ix..end_ix].trim().unindent();
-
-    anyhow::Ok(content)
-}
-
-/// Maps a repository URL to its checkout directory inside `repos_dir`.
-///
-/// Any leading `https://` is stripped and every non-alphanumeric character of the
-/// rest is replaced with `-`, so the result is a single path component.
-pub fn repo_path_for_url(repos_dir: &Path, repo_url: &str) -> PathBuf {
-    let repo_name = repo_url
-        .trim_start_matches("https://")
-        .replace(|c: char| !c.is_alphanumeric(), "-");
-    repos_dir.join(repo_name)
-}
-
-/// Runs `git` with `args` inside `repo_path` and returns its trimmed standard output.
-///
-/// # Errors
-///
-/// Fails if the command cannot be run, if it exits unsuccessfully (the error
-/// includes the status, stderr and stdout), or if stdout is not valid UTF-8.
-pub async fn run_git(repo_path: &Path, args: &[&str]) -> Result<String> {
-    let output = new_command("git")
-        .current_dir(repo_path)
-        .args(args)
-        .output()
-        .await?;
-
-    if !output.status.success() {
-        anyhow::bail!(
-            "`git {}` within `{}` failed with status: {}\nstderr:\n{}\nstdout:\n{}",
-            args.join(" "),
-            repo_path.display(),
-            output.status,
-            String::from_utf8_lossy(&output.stderr),
-            String::from_utf8_lossy(&output.stdout),
-        );
-    }
-
-    let stdout = String::from_utf8(output.stdout)?;
-    Ok(stdout.trim().to_string())
-}
-
-/// Appends the markdown heading for `role` to `buf`.
-///
-/// Assistant headings carry the current value of `assistant_message_number`,
-/// which is then incremented; system and user headings leave it untouched.
-fn push_role(role: &Role, buf: &mut String, assistant_message_number: &mut u32) {
-    match role {
-        Role::System => buf.push_str("# βš™οΈ SYSTEM\n\n"),
-        Role::User => buf.push_str("# πŸ‘€ USER\n\n"),
-        Role::Assistant => {
-            buf.push_str(&format!("# πŸ€– ASSISTANT {assistant_message_number}\n\n"));
-            *assistant_message_number += 1;
-        }
-    }
-}
-
-/// Sends `request` to `model` and returns the complete response text.
-///
-/// The text stream is drained chunk by chunk and concatenated.
-///
-/// # Errors
-///
-/// Fails if the request cannot be started or if any chunk of the stream is an error.
-pub async fn send_language_model_request(
-    model: Arc<dyn LanguageModel>,
-    request: LanguageModelRequest,
-    cx: &AsyncApp,
-) -> anyhow::Result<String> {
-    let mut stream = model
-        .stream_completion_text(request, cx)
-        .await
-        .map_err(|err| anyhow!("Failed to get response from language model. Error was: {err}"))?;
-
-    let mut full_response = String::new();
-    while let Some(chunk_result) = stream.stream.next().await {
-        let chunk = match chunk_result {
-            Ok(chunk) => chunk,
-            Err(err) => anyhow::bail!("Error receiving response from language model: {err}"),
-        };
-        full_response.push_str(&chunk);
-    }
-
-    Ok(full_response)
-}
-
-/// Markdown rendering of a `LanguageModelRequest`, split into its two parts.
-pub struct RequestMarkdown {
-    /// One section per tool: name, description and input schema as a JSON code block.
-    pub tools: String,
-    /// The request's messages, each introduced by a role heading.
-    pub messages: String,
-}
-
-impl RequestMarkdown {
-    pub fn new(request: &LanguageModelRequest) -> Self {
-        let mut tools = String::new();
-        let mut messages = String::new();
-        let mut assistant_message_number: u32 = 1;
-
-        // Print the tools
-        if !request.tools.is_empty() {
-            for tool in &request.tools {
-                write!(&mut tools, "# {}\n\n", tool.name).unwrap();
-                write!(&mut tools, "{}\n\n", tool.description).unwrap();
-                writeln!(
-                    &mut tools,
-                    "{}",
-                    MarkdownCodeBlock {
-                        tag: "json",
-                        text: &format!("{:#}", tool.input_schema)
-                    }
-                )
-                .unwrap();
-            }
-        }
-
-        // Print the messages
-        for message in &request.messages {
-            push_role(&message.role, &mut messages, &mut assistant_message_number);
-
-            for content in &message.content {
-                match content {
-                    MessageContent::Text(text) => {
-                        messages.push_str(text);
-                        messages.push_str("\n\n");
-                    }
-                    MessageContent::Image(_) => {
-                        messages.push_str("[IMAGE DATA]\n\n");
-                    }
-                    MessageContent::Thinking { text, signature } => {
-                        messages.push_str("**Thinking**:\n\n");
-                        if let Some(sig) = signature {
-                            messages.push_str(&format!("Signature: {}\n\n", sig));
-                        }
-                        messages.push_str(text);
-                        messages.push_str("\n");
-                    }
-                    MessageContent::RedactedThinking(items) => {
-                        messages.push_str(&format!(
-                            "**Redacted Thinking**: {} item(s)\n\n",
-                            items.len()
-                        ));
-                    }
-                    MessageContent::ToolUse(tool_use) => {
-                        messages.push_str(&format!(
-                            "**Tool Use**: {} (ID: {})\n",
-                            tool_use.name, tool_use.id
-                        ));
-                        messages.push_str(&format!(
-                            "{}\n",
-                            MarkdownCodeBlock {
-                                tag: "json",
-                                text: &format!("{:#}", tool_use.input)
-                            }
-                        ));
-                    }
-                    MessageContent::ToolResult(tool_result) => {
-                        messages.push_str(&format!(
-                            "**Tool Result**: {} (ID: {})\n\n",
-                            tool_result.tool_name, tool_result.tool_use_id
-                        ));
-                        if tool_result.is_error {
-                            messages.push_str("**ERROR:**\n");
-                        }
-
-                        match &tool_result.content {
-                            LanguageModelToolResultContent::Text(text) => {
-                                writeln!(messages, "{text}\n").ok();
-                            }
-                            LanguageModelToolResultContent::Image(image) => {
-                                writeln!(messages, "![Image](data:base64,{})\n", image.source).ok();
-                            }
-                        }
-
-                        if let Some(output) = tool_result.output.as_ref() {
-                            writeln!(
-                                messages,
-                                "**Debug Output**:\n\n```json\n{}\n```\n",
-                                serde_json::to_string_pretty(output).unwrap()
-                            )
-                            .unwrap();
-                        }
-                    }
-                }
-            }
-        }
-
-        Self { tools, messages }
-    }
-}
-
-pub fn response_events_to_markdown(
-    response_events: &[std::result::Result<LanguageModelCompletionEvent, String>],
-) -> String {
-    let mut response = String::new();
-    // Print the response events if any
-    response.push_str("# Response\n\n");
-    let mut text_buffer = String::new();
-    let mut thinking_buffer = String::new();
-
-    let flush_buffers =
-        |output: &mut String, text_buffer: &mut String, thinking_buffer: &mut String| {
-            if !text_buffer.is_empty() {
-                output.push_str(&format!("**Text**:\n{}\n\n", text_buffer));
-                text_buffer.clear();
-            }
-            if !thinking_buffer.is_empty() {
-                output.push_str(&format!("**Thinking**:\n{}\n\n", thinking_buffer));
-                thinking_buffer.clear();
-            }
-        };
-
-    for event in response_events {
-        match event {
-            Ok(LanguageModelCompletionEvent::Text(text)) => {
-                text_buffer.push_str(text);
-            }
-            Ok(LanguageModelCompletionEvent::Thinking { text, .. }) => {
-                thinking_buffer.push_str(text);
-            }
-            Ok(LanguageModelCompletionEvent::RedactedThinking { .. }) => {}
-            Ok(LanguageModelCompletionEvent::Stop(reason)) => {
-                flush_buffers(&mut response, &mut text_buffer, &mut thinking_buffer);
-                response.push_str(&format!("**Stop**: {:?}\n\n", reason));
-            }
-            Ok(LanguageModelCompletionEvent::ToolUse(tool_use)) => {
-                flush_buffers(&mut response, &mut text_buffer, &mut thinking_buffer);
-                response.push_str(&format!(
-                    "**Tool Use**: {} (ID: {})\n",
-                    tool_use.name, tool_use.id
-                ));
-                response.push_str(&format!(
-                    "{}\n",
-                    MarkdownCodeBlock {
-                        tag: "json",
-                        text: &format!("{:#}", tool_use.input)
-                    }
-                ));
-            }
-            Ok(
-                LanguageModelCompletionEvent::UsageUpdate(_)
-                | LanguageModelCompletionEvent::StartMessage { .. }
-                | LanguageModelCompletionEvent::Queued { .. }
-                | LanguageModelCompletionEvent::Started
-                | LanguageModelCompletionEvent::ReasoningDetails(_),
-            ) => {}
-            Ok(LanguageModelCompletionEvent::ToolUseJsonParseError {
-                json_parse_error, ..
-            }) => {
-                flush_buffers(&mut response, &mut text_buffer, &mut thinking_buffer);
-                response.push_str(&format!(
-                    "**Error**: parse error in tool use JSON: {}\n\n",
-                    json_parse_error
-                ));
-            }
-            Err(error) => {
-                flush_buffers(&mut response, &mut text_buffer, &mut thinking_buffer);
-                response.push_str(&format!("**Error**: {}\n\n", error));
-            }
-        }
-    }
-
-    flush_buffers(&mut response, &mut text_buffer, &mut thinking_buffer);
-
-    response
-}
-
-#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
-pub struct ThreadDialog {
-    pub request: LanguageModelRequest,
-    pub response_events: Vec<std::result::Result<LanguageModelCompletionEvent, String>>,
-}
-
-impl ThreadDialog {
-    pub fn new(
-        request: &LanguageModelRequest,
-        response_events: &[std::result::Result<LanguageModelCompletionEvent, String>],
-    ) -> Self {
-        Self {
-            request: request.clone(),
-            response_events: response_events.to_vec(),
-        }
-    }
-
-    /// Represents all request and response messages in a unified format.
-    ///
-    /// Specifically, it appends the assistant's response (derived from response events)
-    /// as a new message to existing messages in the request.
-    pub fn to_combined_request(&self) -> LanguageModelRequest {
-        let mut request = self.request.clone();
-        if let Some(assistant_message) = self.response_events_to_message() {
-            request.messages.push(assistant_message);
-        }
-        request
-    }
-    fn response_events_to_message(&self) -> Option<LanguageModelRequestMessage> {
-        let response_events = &self.response_events;
-        let mut content: Vec<MessageContent> = Vec::new();
-        let mut current_text = String::new();
-
-        let flush_text = |text: &mut String, content: &mut Vec<MessageContent>| {
-            if !text.is_empty() {
-                content.push(MessageContent::Text(std::mem::take(text)));
-            }
-        };
-
-        for event in response_events {
-            match event {
-                Ok(LanguageModelCompletionEvent::Text(text)) => {
-                    current_text.push_str(text);
-                }
-
-                Ok(LanguageModelCompletionEvent::ToolUse(tool_use)) => {
-                    flush_text(&mut current_text, &mut content);
-                    if tool_use.is_input_complete {
-                        content.push(MessageContent::ToolUse(tool_use.clone()));
-                    }
-                }
-                Ok(LanguageModelCompletionEvent::Thinking { text, signature }) => {
-                    flush_text(&mut current_text, &mut content);
-                    content.push(MessageContent::Thinking {
-                        text: text.clone(),
-                        signature: signature.clone(),
-                    });
-                }
-
-                // Skip these
-                Ok(LanguageModelCompletionEvent::UsageUpdate(_))
-                | Ok(LanguageModelCompletionEvent::RedactedThinking { .. })
-                | Ok(LanguageModelCompletionEvent::StartMessage { .. })
-                | Ok(LanguageModelCompletionEvent::ReasoningDetails(_))
-                | Ok(LanguageModelCompletionEvent::Stop(_))
-                | Ok(LanguageModelCompletionEvent::Queued { .. })
-                | Ok(LanguageModelCompletionEvent::Started) => {}
-
-                Ok(LanguageModelCompletionEvent::ToolUseJsonParseError {
-                    json_parse_error,
-                    ..
-                }) => {
-                    flush_text(&mut current_text, &mut content);
-                    content.push(MessageContent::Text(format!(
-                        "ERROR: parse error in tool use JSON: {}",
-                        json_parse_error
-                    )));
-                }
-
-                Err(error) => {
-                    flush_text(&mut current_text, &mut content);
-                    content.push(MessageContent::Text(format!("ERROR: {}", error)));
-                }
-            }
-        }
-
-        flush_text(&mut current_text, &mut content);
-
-        if !content.is_empty() {
-            Some(LanguageModelRequestMessage {
-                role: Role::Assistant,
-                content,
-                cache: false,
-                reasoning_details: None,
-            })
-        } else {
-            None
-        }
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use super::*;
-
-    #[test]
-    fn test_parse_judge_output() {
-        let response = r#"
-            <analysis>The model did a good job but there were still compilations errors.</analysis>
-            <passed>true</passed>
-        "#
-        .unindent();
-
-        let output = parse_assertion_result(&response).unwrap();
-        assert_eq!(
-            output.analysis,
-            Some("The model did a good job but there were still compilations errors.".into())
-        );
-        assert!(output.passed);
-
-        let response = r#"
-            Text around ignored
-
-            <analysis>
-                Failed to compile:
-                - Error 1
-                - Error 2
-            </analysis>
-
-            <passed>false</passed>
-        "#
-        .unindent();
-
-        let output = parse_assertion_result(&response).unwrap();
-        assert_eq!(
-            output.analysis,
-            Some("Failed to compile:\n- Error 1\n- Error 2".into())
-        );
-        assert!(!output.passed);
-    }
-}

crates/eval/src/judge_diff_prompt.hbs 🔗

@@ -1,25 +0,0 @@
-You are an expert software developer. Your task is to evaluate a diff produced by an AI agent
-in response to a prompt. Here is the prompt and the diff:
-
-<prompt>
-{{{prompt}}}
-</prompt>
-
-<diff>
-{{{repository_diff}}}
-</diff>
-
-Evaluate whether or not the diff passes the following assertion:
-
-<assertion>
-{{assertion}}
-</assertion>
-
-Analyze the diff hunk by hunk, and structure your answer in the following XML format:
-
-```
-<analysis>{YOUR ANALYSIS HERE}</analysis>
-<passed>{PASSED_ASSERTION}</passed>
-```
-
-Where `PASSED_ASSERTION` is either `true` or `false`.

crates/eval/src/judge_thread_prompt.hbs 🔗

@@ -1,21 +0,0 @@
-You are an expert software developer.
-Your task is to evaluate an AI agent's messages and tool calls in this conversation:
-
-<messages>
-{{{messages}}}
-</messages>
-
-Evaluate whether or not the sequence of messages passes the following assertion:
-
-<assertion>
-{{{assertion}}}
-</assertion>
-
-Analyze the messages one by one, and structure your answer in the following XML format:
-
-```
-<analysis>{YOUR ANALYSIS HERE}</analysis>
-<passed>{PASSED_ASSERTION}</passed>
-```
-
-Where `PASSED_ASSERTION` is either `true` or `false`.

crates/eval/src/tool_metrics.rs 🔗

@@ -1,106 +0,0 @@
-use collections::HashMap;
-use serde::{Deserialize, Serialize};
-use std::{fmt::Display, sync::Arc};
-
-#[derive(Debug, Default, Clone, Serialize, Deserialize)]
-pub struct ToolMetrics {
-    pub use_counts: HashMap<Arc<str>, u32>,
-    pub failure_counts: HashMap<Arc<str>, u32>,
-}
-
-impl ToolMetrics {
-    pub fn insert(&mut self, tool_name: Arc<str>, succeeded: bool) {
-        *self.use_counts.entry(tool_name.clone()).or_insert(0) += 1;
-        if !succeeded {
-            *self.failure_counts.entry(tool_name).or_insert(0) += 1;
-        }
-    }
-
-    pub fn merge(&mut self, other: &ToolMetrics) {
-        for (tool_name, use_count) in &other.use_counts {
-            *self.use_counts.entry(tool_name.clone()).or_insert(0) += use_count;
-        }
-        for (tool_name, failure_count) in &other.failure_counts {
-            *self.failure_counts.entry(tool_name.clone()).or_insert(0) += failure_count;
-        }
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.use_counts.is_empty() && self.failure_counts.is_empty()
-    }
-}
-
-impl Display for ToolMetrics {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let mut failure_rates: Vec<(Arc<str>, f64)> = Vec::new();
-
-        for (tool_name, use_count) in &self.use_counts {
-            let failure_count = self.failure_counts.get(tool_name).cloned().unwrap_or(0);
-            if *use_count > 0 {
-                let failure_rate = failure_count as f64 / *use_count as f64;
-                failure_rates.push((tool_name.clone(), failure_rate));
-            }
-        }
-
-        // Sort by failure rate descending
-        failure_rates.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
-
-        // Table dimensions
-        let tool_width = 30;
-        let count_width = 10;
-        let rate_width = 10;
-
-        // Write table top border
-        writeln!(
-            f,
-            "┌{}┬{}┬{}┬{}┐",
-            "─".repeat(tool_width),
-            "─".repeat(count_width),
-            "─".repeat(count_width),
-            "─".repeat(rate_width)
-        )?;
-
-        // Write header row
-        writeln!(
-            f,
-            "│{:^30}│{:^10}│{:^10}│{:^10}│",
-            "Tool", "Uses", "Failures", "Rate"
-        )?;
-
-        // Write header-data separator
-        writeln!(
-            f,
-            "├{}┼{}┼{}┼{}┤",
-            "─".repeat(tool_width),
-            "─".repeat(count_width),
-            "─".repeat(count_width),
-            "─".repeat(rate_width)
-        )?;
-
-        // Write data rows
-        for (tool_name, failure_rate) in failure_rates {
-            let use_count = self.use_counts.get(&tool_name).cloned().unwrap_or(0);
-            let failure_count = self.failure_counts.get(&tool_name).cloned().unwrap_or(0);
-            writeln!(
-                f,
-                "│{:<30}│{:^10}│{:^10}│{:^10}│",
-                tool_name,
-                use_count,
-                failure_count,
-                format!("{}%", (failure_rate * 100.0).round())
-            )?;
-        }
-
-        // Write table bottom border
-        writeln!(
-            f,
-            "└{}┴{}┴{}┴{}┘",
-            "─".repeat(tool_width),
-            "─".repeat(count_width),
-            "─".repeat(count_width),
-            "─".repeat(rate_width)
-        )?;
-
-        Ok(())
-    }
-}

tooling/xtask/src/tasks/workflows.rs 🔗

@@ -206,7 +206,6 @@ pub fn run_workflows(args: GenerateWorkflowArgs) -> Result<()> {
         WorkflowFile::zed(publish_extension_cli::publish_extension_cli),
         WorkflowFile::zed(release::release),
         WorkflowFile::zed(release_nightly::release_nightly),
-        WorkflowFile::zed(run_agent_evals::run_agent_evals),
         WorkflowFile::zed(run_agent_evals::run_cron_unit_evals),
         WorkflowFile::zed(run_agent_evals::run_unit_evals),
         WorkflowFile::zed(run_bundling::run_bundling),

tooling/xtask/src/tasks/workflows/run_agent_evals.rs 🔗

@@ -3,32 +3,10 @@ use serde_json::json;
 
 use crate::tasks::workflows::{
     runners::{self, Platform},
-    steps::{self, FluentBuilder as _, NamedJob, named, setup_cargo_config},
+    steps::{self, FluentBuilder as _, NamedJob, named},
     vars::{self, WorkflowInput},
 };
 
-pub(crate) fn run_agent_evals() -> Workflow {
-    let agent_evals = agent_evals();
-    let model_name = WorkflowInput::string("model_name", None);
-
-    named::workflow()
-        .on(Event::default().workflow_dispatch(
-            WorkflowDispatch::default().add_input(model_name.name, model_name.input()),
-        ))
-        .concurrency(vars::one_workflow_per_non_main_branch())
-        .add_env(("CARGO_TERM_COLOR", "always"))
-        .add_env(("CARGO_INCREMENTAL", 0))
-        .add_env(("RUST_BACKTRACE", 1))
-        .add_env(("ANTHROPIC_API_KEY", vars::ANTHROPIC_API_KEY))
-        .add_env(("OPENAI_API_KEY", vars::OPENAI_API_KEY))
-        .add_env(("GOOGLE_AI_API_KEY", vars::GOOGLE_AI_API_KEY))
-        .add_env(("GOOGLE_CLOUD_PROJECT", vars::GOOGLE_CLOUD_PROJECT))
-        .add_env(("ZED_CLIENT_CHECKSUM_SEED", vars::ZED_CLIENT_CHECKSUM_SEED))
-        .add_env(("ZED_EVAL_TELEMETRY", 1))
-        .add_env(("MODEL_NAME", model_name.to_string()))
-        .add_job(agent_evals.name, agent_evals.job)
-}
-
 pub(crate) fn run_unit_evals() -> Workflow {
     let model_name = WorkflowInput::string("model_name", None);
     let commit_sha = WorkflowInput::string("commit_sha", None);
@@ -59,29 +37,6 @@ fn add_api_keys(step: Step<Run>) -> Step<Run> {
         .add_env(("GOOGLE_CLOUD_PROJECT", vars::GOOGLE_CLOUD_PROJECT))
 }
 
-fn agent_evals() -> NamedJob {
-    fn run_eval() -> Step<Run> {
-        named::bash(
-            "cargo run --package=eval -- --repetitions=8 --concurrency=1 --model \"${MODEL_NAME}\"",
-        )
-    }
-
-    named::job(
-        Job::default()
-            .runs_on(runners::LINUX_DEFAULT)
-            .timeout_minutes(60_u32 * 10)
-            .add_step(steps::checkout_repo())
-            .add_step(steps::cache_rust_dependencies_namespace())
-            .map(steps::install_linux_dependencies)
-            .add_step(setup_cargo_config(Platform::Linux))
-            .add_step(steps::setup_sccache(Platform::Linux))
-            .add_step(steps::script("cargo build --package=eval"))
-            .add_step(add_api_keys(run_eval()))
-            .add_step(steps::show_sccache_stats(Platform::Linux))
-            .add_step(steps::cleanup_cargo_config(Platform::Linux)),
-    )
-}
-
 pub(crate) fn run_cron_unit_evals() -> Workflow {
     let unit_evals = cron_unit_evals();
 

typos.toml 🔗

@@ -49,8 +49,6 @@ extend-exclude = [
     "docs/theme/c15t@*.js",
     # Spellcheck triggers on `|Fixe[sd]|` regex part.
     "script/danger/dangerfile.ts",
-    # Eval examples for prompts and criteria
-    "crates/eval/src/examples/",
     # File type extensions are not typos
     "crates/zed/resources/windows/zed.iss",
     # typos-cli doesn't understand our `vˇariable` markup