From 8e19c7474e38cceab8b6adb331a82af0e2c0e01e Mon Sep 17 00:00:00 2001 From: Ben Brandt Date: Mon, 30 Mar 2026 15:33:29 +0200 Subject: [PATCH] eval: Remove deprecated eval crate and workflow (#52733) This is replaced by eval_cli Release Notes: - N/A --- .github/CODEOWNERS.hold | 1 - .github/workflows/run_agent_evals.yml | 71 - .zed/settings.json | 2 - Cargo.lock | 56 - Cargo.toml | 1 - crates/agent/Cargo.toml | 1 - crates/agent/src/thread.rs | 8 - crates/eval/.gitignore | 3 - crates/eval/Cargo.toml | 70 - crates/eval/LICENSE-GPL | 1 - crates/eval/README.md | 27 - crates/eval/build.rs | 14 - crates/eval/docs/explorer.md | 27 - crates/eval/runner_settings.json | 7 - crates/eval/src/assertions.rs | 170 -- crates/eval/src/eval.rs | 742 --------- crates/eval/src/example.rs | 561 ------- .../src/examples/add_arg_to_trait_method.rs | 115 -- .../eval/src/examples/code_block_citations.rs | 218 --- .../eval/src/examples/comment_translation.rs | 60 - .../src/examples/file_change_notification.rs | 74 - crates/eval/src/examples/file_search.rs | 55 - .../examples/find_and_replace_diff_card.toml | 43 - .../src/examples/grep_params_escapement.rs | 59 - .../src/examples/hallucinated_tool_calls.toml | 13 - crates/eval/src/examples/mod.rs | 173 -- .../eval/src/examples/no_tools_enabled.toml | 19 - crates/eval/src/examples/overwrite_file.rs | 51 - crates/eval/src/examples/planets.rs | 75 - .../src/examples/threads/overwrite-file.json | 262 --- .../tree_sitter_drop_emscripten_dep.toml | 53 - crates/eval/src/explorer.html | 949 ----------- crates/eval/src/explorer.rs | 182 --- crates/eval/src/ids.rs | 29 - crates/eval/src/instance.rs | 1446 ----------------- crates/eval/src/judge_diff_prompt.hbs | 25 - crates/eval/src/judge_thread_prompt.hbs | 21 - crates/eval/src/tool_metrics.rs | 106 -- tooling/xtask/src/tasks/workflows.rs | 1 - .../src/tasks/workflows/run_agent_evals.rs | 47 +- typos.toml | 2 - 41 files changed, 1 insertion(+), 5839 deletions(-) delete mode 100644 .github/workflows/run_agent_evals.yml delete mode 100644 crates/eval/.gitignore delete mode 100644 crates/eval/Cargo.toml delete mode 120000 crates/eval/LICENSE-GPL delete mode 100644 crates/eval/README.md delete mode 100644 crates/eval/build.rs delete mode 100644 crates/eval/docs/explorer.md delete mode 100644 crates/eval/runner_settings.json delete mode 100644 crates/eval/src/assertions.rs delete mode 100644 crates/eval/src/eval.rs delete mode 100644 crates/eval/src/example.rs delete mode 100644 crates/eval/src/examples/add_arg_to_trait_method.rs delete mode 100644 crates/eval/src/examples/code_block_citations.rs delete mode 100644 crates/eval/src/examples/comment_translation.rs delete mode 100644 crates/eval/src/examples/file_change_notification.rs delete mode 100644 crates/eval/src/examples/file_search.rs delete mode 100644 crates/eval/src/examples/find_and_replace_diff_card.toml delete mode 100644 crates/eval/src/examples/grep_params_escapement.rs delete mode 100644 crates/eval/src/examples/hallucinated_tool_calls.toml delete mode 100644 crates/eval/src/examples/mod.rs delete mode 100644 crates/eval/src/examples/no_tools_enabled.toml delete mode 100644 crates/eval/src/examples/overwrite_file.rs delete mode 100644 crates/eval/src/examples/planets.rs delete mode 100644 crates/eval/src/examples/threads/overwrite-file.json delete mode 100644 crates/eval/src/examples/tree_sitter_drop_emscripten_dep.toml delete mode 100644 crates/eval/src/explorer.html delete mode 100644 crates/eval/src/explorer.rs delete mode 100644 crates/eval/src/ids.rs delete mode 100644 crates/eval/src/instance.rs delete mode 100644 crates/eval/src/judge_diff_prompt.hbs delete mode 100644 crates/eval/src/judge_thread_prompt.hbs delete mode 100644 crates/eval/src/tool_metrics.rs diff --git a/.github/CODEOWNERS.hold b/.github/CODEOWNERS.hold index 3d315b36401b2e27e29a2377aeabab8c09c75d39..3b7cbc644768f82646591619e49c4b6a0d6de200 100644 --- a/.github/CODEOWNERS.hold +++ b/.github/CODEOWNERS.hold @@ -48,7 +48,6 @@ /crates/edit_prediction_context/ @zed-industries/ai-team /crates/edit_prediction_types/ @zed-industries/ai-team /crates/edit_prediction_ui/ @zed-industries/ai-team -/crates/eval/ @zed-industries/ai-team /crates/eval_utils/ @zed-industries/ai-team /crates/google_ai/ @zed-industries/ai-team /crates/language_model/ @zed-industries/ai-team diff --git a/.github/workflows/run_agent_evals.yml b/.github/workflows/run_agent_evals.yml deleted file mode 100644 index 83fd91b037fd982a25845b10aaff561b42af5fc5..0000000000000000000000000000000000000000 --- a/.github/workflows/run_agent_evals.yml +++ /dev/null @@ -1,71 +0,0 @@ -# Generated from xtask::workflows::run_agent_evals -# Rebuild with `cargo xtask workflows`. -name: run_agent_evals -env: - CARGO_TERM_COLOR: always - CARGO_INCREMENTAL: '0' - RUST_BACKTRACE: '1' - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - GOOGLE_AI_API_KEY: ${{ secrets.GOOGLE_AI_API_KEY }} - GOOGLE_CLOUD_PROJECT: ${{ secrets.GOOGLE_CLOUD_PROJECT }} - ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }} - ZED_EVAL_TELEMETRY: '1' - MODEL_NAME: ${{ inputs.model_name }} -on: - workflow_dispatch: - inputs: - model_name: - description: model_name - required: true - type: string -jobs: - agent_evals: - runs-on: namespace-profile-16x32-ubuntu-2204 - steps: - - name: steps::checkout_repo - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd - with: - clean: false - - name: steps::cache_rust_dependencies_namespace - uses: namespacelabs/nscloud-cache-action@a90bb5d4b27522ce881c6e98eebd7d7e6d1653f9 - with: - cache: rust - path: ~/.rustup - - name: steps::setup_linux - run: ./script/linux - - name: steps::download_wasi_sdk - run: ./script/download-wasi-sdk - - name: steps::setup_cargo_config - run: | - mkdir -p ./../.cargo - cp ./.cargo/ci-config.toml ./../.cargo/config.toml - - name: steps::setup_sccache - run: ./script/setup-sccache - env: - R2_ACCOUNT_ID: ${{ secrets.R2_ACCOUNT_ID }} - R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }} - R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }} - SCCACHE_BUCKET: sccache-zed - - name: cargo build --package=eval - run: cargo build --package=eval - - name: run_agent_evals::agent_evals::run_eval - run: cargo run --package=eval -- --repetitions=8 --concurrency=1 --model "${MODEL_NAME}" - env: - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - GOOGLE_AI_API_KEY: ${{ secrets.GOOGLE_AI_API_KEY }} - GOOGLE_CLOUD_PROJECT: ${{ secrets.GOOGLE_CLOUD_PROJECT }} - - name: steps::show_sccache_stats - run: sccache --show-stats || true - - name: steps::cleanup_cargo_config - if: always() - run: | - rm -rf ./../.cargo - timeout-minutes: 600 -concurrency: - group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} - cancel-in-progress: true -defaults: - run: - shell: bash -euxo pipefail {0} diff --git a/.zed/settings.json b/.zed/settings.json index 9ac85ec50d651fb10a5ac73e2cf31fe0c575558d..2ecbd5623d26bd32d40443f8553bf4062248ec45 100644 --- a/.zed/settings.json +++ b/.zed/settings.json @@ -59,8 +59,6 @@ "file_scan_exclusions": [ "crates/agent/src/edit_agent/evals/fixtures", "crates/agent/src/tools/evals/fixtures", - "crates/eval/worktrees/", - "crates/eval/repos/", "**/.git", "**/.svn", "**/.hg", diff --git a/Cargo.lock b/Cargo.lock index cc201bd8842d54cb51e7a4eedbf0b9c7ffc444a0..2c2b9b3f58705aad05d719ea58c4164b29b89891 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5825,62 +5825,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "eval" -version = "0.1.0" -dependencies = [ - "acp_thread", - "agent", - "agent-client-protocol", - "agent_settings", - "agent_ui", - "anyhow", - "async-trait", - "buffer_diff", - "chrono", - "clap", - "client", - "collections", - "debug_adapter_extension", - "dirs 4.0.0", - "dotenvy", - "env_logger 0.11.8", - "extension", - "fs", - "futures 0.3.31", - "gpui", - "gpui_platform", - "gpui_tokio", - "handlebars 4.5.0", - "language", - "language_extension", - "language_model", - "language_models", - "languages", - "markdown", - "node_runtime", - "pathdiff", - "paths", - "pretty_assertions", - "project", - "prompt_store", - "rand 0.9.2", - "regex", - "release_channel", - "reqwest_client", - "serde", - "serde_json", - "settings", - "shellexpand 2.1.2", - "telemetry", - "terminal_view", - "toml 0.8.23", - "unindent", - "util", - "uuid", - "watch", -] - [[package]] name = "eval_cli" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 29b4494503a8e05017b2badee31416849a89c634..7c6fdb14defc7c060ee162a78f4319b2dff4deef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,7 +65,6 @@ members = [ "crates/editor", "crates/encoding_selector", "crates/etw_tracing", - "crates/eval", "crates/eval_cli", "crates/eval_utils", "crates/explorer_command_injector", diff --git a/crates/agent/Cargo.toml b/crates/agent/Cargo.toml index fe2089d94dc2e3fc812f6cbe39c16c5cadc1a1f5..a5a4c2742a444bf2e8b0a12b0bb233c6e51684f2 100644 --- a/crates/agent/Cargo.toml +++ b/crates/agent/Cargo.toml @@ -10,7 +10,6 @@ path = "src/agent.rs" [features] test-support = ["db/test-support"] -eval = [] unit-eval = [] e2e = [] diff --git a/crates/agent/src/thread.rs b/crates/agent/src/thread.rs index f7c55d4e49618a9642203513413de1c447ae28cf..627fb37b4d2559e5cda573d849fd0df306c1cc7d 100644 --- a/crates/agent/src/thread.rs +++ b/crates/agent/src/thread.rs @@ -1804,14 +1804,6 @@ impl Thread { cx.notify(); } - #[cfg(feature = "eval")] - pub fn proceed( - &mut self, - cx: &mut Context, - ) -> Result>> { - self.run_turn(cx) - } - fn run_turn( &mut self, cx: &mut Context, diff --git a/crates/eval/.gitignore b/crates/eval/.gitignore deleted file mode 100644 index 89fb02c12207ce4e077c5eccd67f9dcad2fe548a..0000000000000000000000000000000000000000 --- a/crates/eval/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -repos/ -worktrees/ -runs/ diff --git a/crates/eval/Cargo.toml b/crates/eval/Cargo.toml deleted file mode 100644 index a8917181a1253dea614a02bfaa799ace0ee6ba64..0000000000000000000000000000000000000000 --- a/crates/eval/Cargo.toml +++ /dev/null @@ -1,70 +0,0 @@ -[package] -name = "eval" -version = "0.1.0" -publish.workspace = true -edition.workspace = true -license = "GPL-3.0-or-later" -default-run = "eval" - -[lints] -workspace = true - -[[bin]] -name = "eval" -path = "src/eval.rs" - -[[bin]] -name = "explorer" -path = "src/explorer.rs" - -[dependencies] -acp_thread.workspace = true -agent = { workspace = true, features = ["eval"] } -agent-client-protocol.workspace = true -agent_settings.workspace = true -agent_ui.workspace = true -anyhow.workspace = true -async-trait.workspace = true -buffer_diff.workspace = true -chrono.workspace = true -clap.workspace = true -client.workspace = true -collections.workspace = true -debug_adapter_extension.workspace = true -dirs.workspace = true -dotenvy.workspace = true -env_logger.workspace = true -extension.workspace = true -fs.workspace = true -futures.workspace = true -gpui.workspace = true -gpui_platform.workspace = true -gpui_tokio.workspace = true -handlebars.workspace = true -language.workspace = true -language_extension.workspace = true -language_model.workspace = true -language_models.workspace = true -languages = { workspace = true, features = ["load-grammars"] } -markdown.workspace = true -node_runtime.workspace = true -pathdiff.workspace = true -paths.workspace = true -pretty_assertions.workspace = true -project.workspace = true -prompt_store.workspace = true -regex.workspace = true -rand.workspace = true -release_channel.workspace = true -reqwest_client.workspace = true -serde.workspace = true -serde_json.workspace = true -settings.workspace = true -shellexpand.workspace = true -telemetry.workspace = true -terminal_view.workspace = true -toml.workspace = true -unindent.workspace = true -util.workspace = true -uuid.workspace = true -watch.workspace = true diff --git a/crates/eval/LICENSE-GPL b/crates/eval/LICENSE-GPL deleted file mode 120000 index 89e542f750cd3860a0598eff0dc34b56d7336dc4..0000000000000000000000000000000000000000 --- a/crates/eval/LICENSE-GPL +++ /dev/null @@ -1 +0,0 @@ -../../LICENSE-GPL \ No newline at end of file diff --git a/crates/eval/README.md b/crates/eval/README.md deleted file mode 100644 index c1543734b00f334063b76d6c8fe22b5aac0f9a84..0000000000000000000000000000000000000000 --- a/crates/eval/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# Eval - -This eval assumes the working directory is the root of the repository. Run it with: - -```sh -cargo run -p eval -``` - -The eval will optionally read a `.env` file in `crates/eval` if you need it to set environment variables, such as API keys. - -## Explorer Tool - -The explorer tool generates a self-contained HTML view from one or more thread -JSON file. It provides a visual interface to explore the agent thread, including -tool calls and results. See [./docs/explorer.md](./docs/explorer.md) for more details. - -### Usage - -```sh -cargo run -p eval --bin explorer -- --input --output -``` - -Example: - -```sh -cargo run -p eval --bin explorer -- --input ./runs/2025-04-23_15-53-30/fastmcp_bugifx/*/last.messages.json --output /tmp/explorer.html -``` diff --git a/crates/eval/build.rs b/crates/eval/build.rs deleted file mode 100644 index 9ab40da0fb0ca880cecc3a87d5a9e95172dcb6ec..0000000000000000000000000000000000000000 --- a/crates/eval/build.rs +++ /dev/null @@ -1,14 +0,0 @@ -fn main() { - let cargo_toml = - std::fs::read_to_string("../zed/Cargo.toml").expect("Failed to read crates/zed/Cargo.toml"); - let version = cargo_toml - .lines() - .find(|line| line.starts_with("version = ")) - .expect("Version not found in crates/zed/Cargo.toml") - .split('=') - .nth(1) - .expect("Invalid version format") - .trim() - .trim_matches('"'); - println!("cargo:rustc-env=ZED_PKG_VERSION={}", version); -} diff --git a/crates/eval/docs/explorer.md b/crates/eval/docs/explorer.md deleted file mode 100644 index 2ca3336a23442dace8c6c73f5eec2295cae3a2d6..0000000000000000000000000000000000000000 --- a/crates/eval/docs/explorer.md +++ /dev/null @@ -1,27 +0,0 @@ -# Explorer - -Threads Explorer is a single self-contained HTML file that gives an overview of -evaluation runs, while allowing for some interactivity. - -When you open a file, it gives you a _thread overview_, which looks like this: - -| Turn | Text | Tool | Result | -| ---- | ------------------------------------ | -------------------------------------------- | --------------------------------------------- | -| 1 | [User]: | | | -| | Fix the bug: kwargs not passed... | | | -| 2 | I'll help you fix that bug. | **list_directory**(path="fastmcp") | `fastmcp/src [...]` | -| | | | | -| 3 | Let's examine the code. | **read_file**(path="fastmcp/main.py", [...]) | `def run_application(app, \*\*kwargs): [...]` | -| 4 | I found the issue. | **edit_file**(path="fastmcp/core.py", [...]) | `Made edit to fastmcp/core.py` | -| 5 | Let's check if there are any errors. | **diagnostics**() | `No errors found` | - -### Implementation details - -`src/explorer.html` contains the template. You can open this template in a -browser as is, and it will show some dummy values. But the main use is to set -the `threadsData` variable with real data, which then will be used instead of -the dummy values. - -`src/explorer.rs` takes one or more JSON files as generated by `cargo run -p -eval`, and outputs an HTML file for rendering these threads. Refer dummy data -in `explorer.html` for a sample format. diff --git a/crates/eval/runner_settings.json b/crates/eval/runner_settings.json deleted file mode 100644 index 44a9eb6fc60d5c7e44d945114ab4b71fbb0208c3..0000000000000000000000000000000000000000 --- a/crates/eval/runner_settings.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "agent": { - "tool_permissions": { - "default": "allow" - } - } -} diff --git a/crates/eval/src/assertions.rs b/crates/eval/src/assertions.rs deleted file mode 100644 index 01fac186d33a8b5b156121acf924d37c90c64679..0000000000000000000000000000000000000000 --- a/crates/eval/src/assertions.rs +++ /dev/null @@ -1,170 +0,0 @@ -use serde::{Deserialize, Serialize}; -use std::fmt::Write; -use std::fmt::{self}; - -#[derive(Default, Debug, Serialize, Deserialize, Clone)] -pub struct AssertionsReport { - pub ran: Vec, - pub max: Option, -} - -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct RanAssertion { - pub id: String, - pub result: Result, -} - -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct RanAssertionResult { - pub analysis: Option, - pub passed: bool, -} - -impl AssertionsReport { - pub fn new(max: Option) -> Self { - AssertionsReport { - ran: Vec::new(), - max, - } - } - - pub fn error(msg: String) -> Self { - let assert = RanAssertion { - id: "no-unhandled-errors".into(), - result: Err(msg), - }; - AssertionsReport { - ran: vec![assert], - max: Some(1), - } - } - - pub fn is_empty(&self) -> bool { - self.ran.is_empty() - } - - pub fn total_count(&self) -> usize { - self.run_count().max(self.max.unwrap_or(0)) - } - - pub fn run_count(&self) -> usize { - self.ran.len() - } - - pub fn passed_count(&self) -> usize { - self.ran - .iter() - .filter(|a| a.result.as_ref().is_ok_and(|result| result.passed)) - .count() - } - - pub fn passed_percentage(&self) -> f32 { - if self.total_count() == 0 { - 0.0 - } else { - (self.passed_count() as f32 / self.total_count() as f32) * 100.0 - } - } -} - -const ROUND_WIDTH: usize = "Round".len(); -const ASSERTIONS_WIDTH: usize = 42; -const RESULTS_WIDTH: usize = 8; - -pub fn print_table_header() { - println!( - "┌─{}─┬─{}─┬─{}─┐", - "─".repeat(ROUND_WIDTH), - "─".repeat(ASSERTIONS_WIDTH), - "─".repeat(RESULTS_WIDTH) - ); - - println!( - "│ {:^ROUND_WIDTH$} │ {:^ASSERTIONS_WIDTH$} │ {:^RESULTS_WIDTH$} │", - "Round", "Assertion", "Result" - ); - - println!( - "├─{}─┼─{}─┼─{}─┤", - "─".repeat(ROUND_WIDTH), - "─".repeat(ASSERTIONS_WIDTH), - "─".repeat(RESULTS_WIDTH) - ) -} - -pub fn display_error_row(f: &mut String, round: usize, error: String) -> fmt::Result { - let last_two_columns = ASSERTIONS_WIDTH + RESULTS_WIDTH; - writeln!( - f, - "│ {:^ROUND_WIDTH$} │ {: fmt::Result { - let result = match &assertion.result { - Ok(result) if result.passed => "\x1b[32m✔︎ Passed\x1b[0m", - Ok(_) => "\x1b[31m✗ Failed\x1b[0m", - Err(_) => "\x1b[31m💥 Judge Error\x1b[0m", - }; - - writeln!( - f, - "│ {:^ROUND_WIDTH$} │ {:RESULTS_WIDTH$} │", - round, - truncate(&assertion.id, ASSERTIONS_WIDTH), - result - ) -} - -pub fn print_table_round_summary<'a>( - round: &str, - reports: impl Iterator, -) { - let mut passed = 0; - let mut total = 0; - for report in reports { - passed += report.passed_count(); - total += report.total_count(); - } - - println!( - "│ {:^ROUND_WIDTH$} │ {:RESULTS_WIDTH$} │", - round, - "total", - format!("{}%", (passed as f32 / total as f32 * 100.0).floor()) - ) -} - -pub fn print_table_footer() { - println!( - "└─{}─┴─{}─┴─{}─┘", - "─".repeat(ROUND_WIDTH), - "─".repeat(ASSERTIONS_WIDTH), - "─".repeat(RESULTS_WIDTH) - ) -} - -pub fn print_table_divider() { - println!( - "├─{}─┼─{}─┼─{}─┤", - "─".repeat(ROUND_WIDTH), - "─".repeat(ASSERTIONS_WIDTH), - "─".repeat(RESULTS_WIDTH) - ) -} - -fn truncate(assertion: &str, max_width: usize) -> String { - let is_verbose = std::env::var("VERBOSE").is_ok_and(|v| !v.is_empty()); - - if assertion.len() <= max_width || is_verbose { - assertion.to_string() - } else { - let mut end_ix = max_width - 1; - while !assertion.is_char_boundary(end_ix) { - end_ix -= 1; - } - format!("{}…", &assertion[..end_ix]) - } -} diff --git a/crates/eval/src/eval.rs b/crates/eval/src/eval.rs deleted file mode 100644 index a621cb0dedb3f7cea512329829f7c99bc8803d41..0000000000000000000000000000000000000000 --- a/crates/eval/src/eval.rs +++ /dev/null @@ -1,742 +0,0 @@ -mod assertions; -mod example; -mod examples; -mod explorer; -mod ids; -mod instance; -mod tool_metrics; - -use assertions::{AssertionsReport, display_error_row}; -use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git}; -use language_extension::LspAccess; -pub(crate) use tool_metrics::*; - -use ::fs::RealFs; -use clap::Parser; -use client::{Client, ProxySettings, UserStore}; -use collections::{HashMap, HashSet}; -use extension::ExtensionHostProxy; -use futures::future; -use gpui::http_client::read_proxy_from_env; -use gpui::{App, AppContext, AsyncApp, Entity, UpdateGlobal}; -use gpui_tokio::Tokio; -use language::LanguageRegistry; -use language_model::{ConfiguredModel, LanguageModel, LanguageModelRegistry, SelectedModel}; -use node_runtime::{NodeBinaryOptions, NodeRuntime}; -use project::project_settings::ProjectSettings; -use prompt_store::PromptBuilder; -use release_channel::{AppCommitSha, AppVersion}; -use reqwest_client::ReqwestClient; -use settings::{Settings, SettingsStore}; -use std::cell::RefCell; -use std::collections::VecDeque; -use std::env; -use std::path::{Path, PathBuf}; -use std::rc::Rc; -use std::str::FromStr; -use std::sync::{Arc, LazyLock}; -use util::ResultExt as _; - -static CARGO_MANIFEST_DIR: LazyLock = - LazyLock::new(|| PathBuf::from(env!("CARGO_MANIFEST_DIR"))); - -#[derive(Parser, Debug)] -#[command(name = "eval", disable_version_flag = true)] -struct Args { - /// Runs all examples and threads that contain these substrings. If unspecified, all examples and threads are run. - #[arg(value_name = "EXAMPLE_SUBSTRING")] - filter: Vec, - /// provider/model to use for agent - #[arg(long, default_value = "anthropic/claude-3-7-sonnet-latest")] - model: String, - /// provider/model to use for judges - #[arg(long, default_value = "anthropic/claude-3-7-sonnet-latest")] - judge_model: String, - #[arg(long, value_delimiter = ',', default_value = "rs,ts,py")] - languages: Vec, - /// How many times to run each example. - #[arg(long, default_value = "8")] - repetitions: usize, - /// Maximum number of examples to run concurrently. - #[arg(long, default_value = "4")] - concurrency: usize, - /// Output current environment variables as JSON to stdout - #[arg(long, hide = true)] - printenv: bool, -} - -fn main() { - let args = Args::parse(); - - // This prevents errors showing up in the logs, because - // project::environment::load_shell_environment() calls - // std::env::current_exe().unwrap() --printenv - if args.printenv { - util::shell_env::print_env(); - return; - } - - dotenvy::from_filename(CARGO_MANIFEST_DIR.join(".env")).ok(); - - env_logger::init(); - - let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok(); - let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok(); - let session_id = uuid::Uuid::new_v4().to_string(); - let run_timestamp = chrono::Local::now().format("%Y-%m-%d_%H-%M-%S"); - let run_id = match env::var("GITHUB_RUN_ID") { - Ok(run_id) => format!("github/{}", run_id), - Err(_) => format!("local/{}", run_timestamp), - }; - - let root_dir = Path::new(std::env!("CARGO_MANIFEST_DIR")) - .parent() - .unwrap() - .parent() - .unwrap() - .canonicalize() - .unwrap(); - let eval_crate_dir = root_dir.join("crates").join("eval"); - let repos_dir = eval_crate_dir.join("repos"); - let worktrees_dir = eval_crate_dir.join("worktrees"); - let examples_dir = eval_crate_dir.join("src").join("examples"); - let run_dir = eval_crate_dir - .join("runs") - .join(format!("{}", run_timestamp)); - std::fs::create_dir_all(&run_dir).unwrap(); - std::fs::create_dir_all(&repos_dir).unwrap(); - std::fs::create_dir_all(&worktrees_dir).unwrap(); - std::fs::create_dir_all(&examples_dir).unwrap(); - std::fs::create_dir_all(&paths::config_dir()).unwrap(); - - let zed_commit_sha = commit_sha_for_path(&root_dir); - let zed_branch_name = git_branch_for_path(&root_dir); - let languages: HashSet = args.languages.into_iter().collect(); - - let http_client = Arc::new(ReqwestClient::new()); - let app = gpui_platform::headless().with_http_client(http_client); - let all_threads = examples::all(&examples_dir); - - app.run(move |cx| { - let app_state = init(cx); - - let telemetry = app_state.client.telemetry(); - telemetry.start(system_id, installation_id, session_id, cx); - - let enable_telemetry = env::var("ZED_EVAL_TELEMETRY").is_ok_and(|value| value == "1") - && telemetry.has_checksum_seed(); - if enable_telemetry { - println!("Telemetry enabled"); - telemetry::event!( - "Agent Eval Started", - zed_commit_sha = zed_commit_sha, - zed_branch_name = zed_branch_name, - run_id = run_id, - ); - } - - let mut cumulative_tool_metrics = ToolMetrics::default(); - - let tasks = LanguageModelRegistry::global(cx).update(cx, |registry, cx| { - registry.providers().iter().map(|p| p.authenticate(cx)).collect::>() - }); - - cx.spawn(async move |cx| { - future::join_all(tasks).await; - let judge_model = cx.update(|cx| { - let agent_model = load_model(&args.model, cx).unwrap(); - let judge_model = load_model(&args.judge_model, cx).unwrap(); - LanguageModelRegistry::global(cx).update(cx, |registry, cx| { - registry.set_default_model(Some(agent_model.clone()), cx); - }); - judge_model - }); - - let mut examples = Vec::new(); - - const COLORS: [&str; 12] = [ - "\x1b[31m", // Red - "\x1b[32m", // Green - "\x1b[33m", // Yellow - "\x1b[34m", // Blue - "\x1b[35m", // Magenta - "\x1b[36m", // Cyan - "\x1b[91m", // Bright Red - "\x1b[92m", // Bright Green - "\x1b[93m", // Bright Yellow - "\x1b[94m", // Bright Blue - "\x1b[95m", // Bright Magenta - "\x1b[96m", // Bright Cyan - ]; - - let mut skipped = Vec::new(); - - for thread in all_threads { - let meta = thread.meta(); - if !args.filter.is_empty() && !args.filter.iter().any(|sub| meta.name.contains(sub)) - { - skipped.push(meta.name); - continue; - } - - if let Some(language) = meta.language_server - && !languages.contains(&language.file_extension) { - panic!( - "Eval for {:?} could not be run because no language server was found for extension {:?}", - meta.name, - language.file_extension - ); - } - - // TODO: This creates a worktree per repetition. Ideally these examples should - // either be run sequentially on the same worktree, or reuse worktrees when there - // are more examples to run than the concurrency limit. - for repetition_number in 0..args.repetitions { - let example_instance = ExampleInstance::new( - thread.clone(), - &repos_dir, - &run_dir, - &worktrees_dir, - repetition_number, - ); - - examples.push(example_instance); - } - } - - if !skipped.is_empty() { - println!("Skipped threads: {}", skipped.join(", ")); - } - - if examples.is_empty() { - eprintln!("Filter matched no examples"); - cx.update(|cx| cx.quit()); - return anyhow::Ok(()); - } - - let mut repo_urls = HashSet::default(); - let mut clone_tasks = Vec::new(); - - let max_name_width = examples - .iter() - .map(|e| e.worktree_name().len()) - .max() - .unwrap_or(0); - - for (i, example_instance) in examples.iter_mut().enumerate() { - let color = COLORS[i % COLORS.len()].to_string(); - example_instance.set_log_prefix_style(&color, max_name_width); - - println!( - "{}Logging to: {}", - example_instance.log_prefix, - example_instance.run_directory.display() - ); - - let repo_url = example_instance.repo_url(); - if repo_urls.insert(repo_url.clone()) { - let repo_path = example_instance.repo_path.clone(); - - if !repo_path.join(".git").is_dir() { - println!( - "{:, - pub client: Arc, - pub user_store: Entity, - pub fs: Arc, - pub node_runtime: NodeRuntime, - - // Additional fields not present in `workspace::AppState`. - pub prompt_builder: Arc, -} - -pub fn init(cx: &mut App) -> Arc { - let app_commit_sha = option_env!("ZED_COMMIT_SHA").map(|s| AppCommitSha::new(s.to_owned())); - - let app_version = AppVersion::load( - env!("ZED_PKG_VERSION"), - option_env!("ZED_BUILD_ID"), - app_commit_sha, - ); - - release_channel::init(app_version.clone(), cx); - gpui_tokio::init(cx); - - let settings_store = SettingsStore::new(cx, &settings::default_settings()); - cx.set_global(settings_store); - - // Set User-Agent so we can download language servers from GitHub - let user_agent = format!( - "Zed Agent Eval/{} ({}; {})", - app_version, - std::env::consts::OS, - std::env::consts::ARCH - ); - let proxy_str = ProxySettings::get_global(cx).proxy.to_owned(); - let proxy_url = proxy_str - .as_ref() - .and_then(|input| input.parse().ok()) - .or_else(read_proxy_from_env); - let http = { - let _guard = Tokio::handle(cx).enter(); - - ReqwestClient::proxy_and_user_agent(proxy_url, &user_agent) - .expect("could not start HTTP client") - }; - cx.set_http_client(Arc::new(http)); - - let client = Client::production(cx); - cx.set_http_client(client.http_client()); - - let git_binary_path = None; - let fs = Arc::new(RealFs::new( - git_binary_path, - cx.background_executor().clone(), - )); - - let mut languages = LanguageRegistry::new(cx.background_executor().clone()); - languages.set_language_server_download_dir(paths::languages_dir().clone()); - let languages = Arc::new(languages); - - let user_store = cx.new(|cx| UserStore::new(client.clone(), cx)); - - extension::init(cx); - - let (mut tx, rx) = watch::channel(None); - cx.observe_global::(move |cx| { - let settings = &ProjectSettings::get_global(cx).node; - let options = NodeBinaryOptions { - allow_path_lookup: !settings.ignore_system_version, - allow_binary_download: true, - use_paths: settings.path.as_ref().map(|node_path| { - let node_path = PathBuf::from(shellexpand::tilde(node_path).as_ref()); - let npm_path = settings - .npm_path - .as_ref() - .map(|path| PathBuf::from(shellexpand::tilde(&path).as_ref())); - ( - node_path.clone(), - npm_path.unwrap_or_else(|| { - let base_path = PathBuf::new(); - node_path.parent().unwrap_or(&base_path).join("npm") - }), - ) - }), - }; - tx.send(Some(options)).log_err(); - }) - .detach(); - let node_runtime = NodeRuntime::new(client.http_client(), None, rx); - - let extension_host_proxy = ExtensionHostProxy::global(cx); - debug_adapter_extension::init(extension_host_proxy.clone(), cx); - language_extension::init(LspAccess::Noop, extension_host_proxy, languages.clone()); - language_model::init(user_store.clone(), client.clone(), cx); - language_models::init(user_store.clone(), client.clone(), cx); - languages::init(languages.clone(), fs.clone(), node_runtime.clone(), cx); - prompt_store::init(cx); - terminal_view::init(cx); - let stdout_is_a_pty = false; - let prompt_builder = PromptBuilder::load(fs.clone(), stdout_is_a_pty, cx); - agent_ui::init( - fs.clone(), - client.clone(), - prompt_builder.clone(), - languages.clone(), - true, - cx, - ); - - SettingsStore::update_global(cx, |store, cx| { - store.set_user_settings(include_str!("../runner_settings.json"), cx) - }) - .unwrap(); - - Arc::new(AgentAppState { - languages, - client, - user_store, - fs, - node_runtime, - prompt_builder, - }) -} - -pub fn find_model( - model_name: &str, - model_registry: &LanguageModelRegistry, - cx: &App, -) -> anyhow::Result> { - let selected = SelectedModel::from_str(model_name).map_err(|e| anyhow::anyhow!(e))?; - model_registry - .available_models(cx) - .find(|model| model.id() == selected.model && model.provider_id() == selected.provider) - .ok_or_else(|| { - anyhow::anyhow!( - "No language model with ID {}/{} was available. Available models: {}", - selected.provider.0, - selected.model.0, - model_registry - .available_models(cx) - .map(|model| format!("{}/{}", model.provider_id().0, model.id().0)) - .collect::>() - .join(", ") - ) - }) -} - -pub fn load_model(model_name: &str, cx: &mut App) -> anyhow::Result { - let model = { - let model_registry = LanguageModelRegistry::read_global(cx); - find_model(model_name, model_registry, cx)? - }; - - let provider = { - let model_registry = LanguageModelRegistry::read_global(cx); - model_registry - .provider(&model.provider_id()) - .ok_or_else(|| anyhow::anyhow!("Provider not found: {}", model.provider_id()))? - }; - - Ok(ConfiguredModel { - provider: provider.clone(), - model: model.clone(), - }) -} - -pub fn commit_sha_for_path(repo_path: &Path) -> String { - futures::executor::block_on(run_git(repo_path, &["rev-parse", "HEAD"])).unwrap() -} - -pub fn git_branch_for_path(repo_path: &Path) -> String { - match std::env::var("GITHUB_REF_NAME") { - Ok(branch) => branch, - Err(_) => { - futures::executor::block_on(run_git(repo_path, &["rev-parse", "--abbrev-ref", "HEAD"])) - .unwrap_or_else(|_| "unknown".to_string()) - } - } -} - -async fn judge_example( - example: ExampleInstance, - model: Arc, - zed_commit_sha: &str, - zed_branch_name: &str, - run_id: &str, - run_output: &RunOutput, - enable_telemetry: bool, - cx: &AsyncApp, -) -> JudgeOutput { - let judge_output = example.judge(model.clone(), run_output, cx).await; - - if enable_telemetry { - telemetry::event!( - "Agent Example Evaluated", - zed_commit_sha = zed_commit_sha, - zed_branch_name = zed_branch_name, - run_id = run_id, - example_name = example.name.clone(), - example_repetition = example.repetition, - diff_evaluation = judge_output.diff.clone(), - thread_evaluation = judge_output.thread, - tool_metrics = run_output.tool_metrics, - token_usage = run_output.token_usage, - model = model.telemetry_id(), - model_provider = model.provider_id().to_string(), - repository_url = example.repo_url(), - repository_revision = example.revision(), - diagnostic_summary_before = run_output.diagnostic_summary_before, - diagnostic_summary_after = run_output.diagnostic_summary_after, - diagnostics_before = run_output.diagnostics_before, - diagnostics_after = run_output.diagnostics_after, - ); - } - - judge_output -} - -const HEADER_WIDTH: usize = 65; - -fn print_h1(header: &str) { - println!("\n\n{:=^HEADER_WIDTH$}", ""); - println!("{:^HEADER_WIDTH$}", header); - println!("{:=^HEADER_WIDTH$}\n", ""); -} - -fn print_h2(header: &str) { - println!("\n{:-^HEADER_WIDTH$}", ""); - println!("{:^HEADER_WIDTH$}", header); - println!("{:-^HEADER_WIDTH$}\n", ""); -} - -fn print_report( - results_by_example_name: &mut HashMap< - String, - Vec<(ExampleInstance, anyhow::Result<(RunOutput, JudgeOutput)>)>, - >, - cumulative_tool_metrics: &mut ToolMetrics, - run_dir: &Path, -) -> anyhow::Result<()> { - print_h1("EVAL RESULTS"); - - let mut diff_scores = Vec::new(); - let mut thread_scores = Vec::new(); - let mut programmatic_scores = Vec::new(); - let mut error_count = 0; - - for (example_name, results) in results_by_example_name.iter_mut() { - print_h2(example_name); - - results.sort_unstable_by_key(|(example, _)| example.repetition); - let mut example_cumulative_tool_metrics = ToolMetrics::default(); - - let mut table_rows = String::new(); - - for (example, result) in results.iter() { - match result { - Err(err) => { - display_error_row(&mut table_rows, example.repetition, err.to_string())?; - error_count += 1; - programmatic_scores.push(0.0); - diff_scores.push(0.0); - thread_scores.push(0.0); - } - Ok((run_output, judge_output)) => { - cumulative_tool_metrics.merge(&run_output.tool_metrics); - example_cumulative_tool_metrics.merge(&run_output.tool_metrics); - - if run_output.programmatic_assertions.total_count() > 0 { - for assertion in &run_output.programmatic_assertions.ran { - assertions::display_table_row( - &mut table_rows, - example.repetition, - assertion, - )?; - } - - programmatic_scores - .push(run_output.programmatic_assertions.passed_percentage()) - } - - if !judge_output.diff.is_empty() { - diff_scores.push(judge_output.diff.passed_percentage()); - - for assertion in &judge_output.diff.ran { - assertions::display_table_row( - &mut table_rows, - example.repetition, - assertion, - )?; - } - } - - if !judge_output.thread.is_empty() { - thread_scores.push(judge_output.thread.passed_percentage()); - - for assertion in &judge_output.thread.ran { - assertions::display_table_row( - &mut table_rows, - example.repetition, - assertion, - )?; - } - } - } - } - } - - let mut all_asserts = Vec::new(); - - if !table_rows.is_empty() { - assertions::print_table_header(); - print!("{}", table_rows); - - assertions::print_table_divider(); - - for (example, result) in results.iter() { - if let Ok((run_output, judge_output)) = result { - let asserts = [ - run_output.programmatic_assertions.clone(), - judge_output.diff.clone(), - judge_output.thread.clone(), - ]; - all_asserts.extend_from_slice(&asserts); - assertions::print_table_round_summary( - &example.repetition.to_string(), - asserts.iter(), - ) - } else if let Err(err) = result { - let assert = AssertionsReport::error(err.to_string()); - all_asserts.push(assert.clone()); - assertions::print_table_round_summary( - &example.repetition.to_string(), - [assert].iter(), - ) - } - } - - assertions::print_table_divider(); - - assertions::print_table_round_summary("avg", all_asserts.iter()); - - assertions::print_table_footer(); - } - - if !example_cumulative_tool_metrics.is_empty() { - println!("{}", &example_cumulative_tool_metrics); - } - } - - if results_by_example_name.len() > 1 { - print_h1("AGGREGATE"); - - if error_count > 0 { - println!("\n{error_count} examples failed to run!"); - } - - let programmatic_score_count = programmatic_scores.len(); - if programmatic_score_count > 0 { - let average_programmatic_score = (programmatic_scores.into_iter().sum::() - / (programmatic_score_count as f32)) - .floor(); - println!("Average programmatic score: {average_programmatic_score}%"); - } - - let diff_score_count = diff_scores.len(); - if diff_score_count > 0 { - let average_diff_score = - (diff_scores.into_iter().sum::() / (diff_score_count as f32)).floor(); - println!("Average diff score: {average_diff_score}%"); - } - - let thread_score_count = thread_scores.len(); - - if thread_score_count > 0 { - let average_thread_score = - (thread_scores.into_iter().sum::() / (thread_score_count as f32)).floor(); - println!("Average thread score: {average_thread_score}%"); - } - - println!(); - - print_h2("CUMULATIVE TOOL METRICS"); - println!("{}", cumulative_tool_metrics); - } - - let explorer_output_path = run_dir.join("overview.html"); - let mut json_paths: Vec = results_by_example_name - .values() - .flat_map(|results| { - results.iter().map(|(example, _)| { - let absolute_path = run_dir.join(example.run_directory.join("last.messages.json")); - let cwd = std::env::current_dir().expect("Can't get current dir"); - pathdiff::diff_paths(&absolute_path, cwd).unwrap_or_else(|| absolute_path.clone()) - }) - }) - .collect::>(); - json_paths.sort(); - if let Err(err) = explorer::generate_explorer_html(&json_paths, &explorer_output_path) { - eprintln!("Failed to generate explorer HTML: {}", err); - } - - Ok(()) -} diff --git a/crates/eval/src/example.rs b/crates/eval/src/example.rs deleted file mode 100644 index d74df7c7f12696a94f6204fae4586c8cec36517d..0000000000000000000000000000000000000000 --- a/crates/eval/src/example.rs +++ /dev/null @@ -1,561 +0,0 @@ -use std::{ - error::Error, - fmt::{self, Debug}, - sync::{Arc, Mutex}, - time::Duration, - u32, -}; - -use crate::{ - ToolMetrics, - assertions::{AssertionsReport, RanAssertion, RanAssertionResult}, -}; -use acp_thread::UserMessageId; -use agent::{Thread, ThreadEvent, UserMessageContent}; -use agent_client_protocol as acp; -use agent_settings::AgentProfileId; -use anyhow::{Result, anyhow}; -use async_trait::async_trait; -use buffer_diff::DiffHunkStatus; -use collections::HashMap; -use futures::{FutureExt as _, StreamExt, select_biased}; -use gpui::{App, AppContext, AsyncApp, Entity}; -use language_model::Role; -use util::rel_path::RelPath; - -pub const THREAD_EVENT_TIMEOUT: Duration = Duration::from_secs(60 * 2); - -#[async_trait(?Send)] -pub trait Example { - fn meta(&self) -> ExampleMetadata; - async fn conversation(&self, cx: &mut ExampleContext) -> Result<()>; - fn diff_assertions(&self) -> Vec { - Vec::new() - } - fn thread_assertions(&self) -> Vec { - Vec::new() - } -} - -#[derive(Clone, Debug)] -pub struct JudgeAssertion { - pub id: String, - pub description: String, -} - -#[derive(Clone, Debug)] -pub struct ExampleMetadata { - pub name: String, - pub url: String, - pub revision: String, - pub language_server: Option, - pub max_assertions: Option, - pub profile_id: AgentProfileId, - pub existing_thread_json: Option, - pub max_turns: Option, -} - -#[derive(Clone, Debug)] -pub struct LanguageServer { - pub file_extension: String, - pub allow_preexisting_diagnostics: bool, -} - -impl ExampleMetadata { - pub fn repo_name(&self) -> String { - self.url - .split('/') - .next_back() - .unwrap_or("") - .trim_end_matches(".git") - .into() - } -} - -pub struct FailedAssertion(pub String); - -impl fmt::Debug for FailedAssertion { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "Assertion failure: {}", self.0) - } -} - -impl fmt::Display for FailedAssertion { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.0) - } -} - -impl Error for FailedAssertion {} - -pub struct ExampleContext { - meta: ExampleMetadata, - log_prefix: String, - agent_thread: Entity, - app: AsyncApp, - pub assertions: AssertionsReport, - pub tool_metrics: Arc>, -} - -impl ExampleContext { - pub fn new( - meta: ExampleMetadata, - log_prefix: String, - agent_thread: Entity, - app: AsyncApp, - ) -> Self { - let assertions = AssertionsReport::new(meta.max_assertions); - - Self { - meta, - log_prefix, - agent_thread, - assertions, - app, - tool_metrics: Arc::new(Mutex::new(ToolMetrics::default())), - } - } - - pub fn assert(&mut self, expected: bool, message: impl ToString) -> Result<()> { - let message = message.to_string(); - self.log_assertion( - if expected { - Ok(()) - } else { - Err(anyhow::Error::from(FailedAssertion(message.clone()))) - }, - message, - ) - } - - pub fn assert_some(&mut self, option: Option, message: impl ToString) -> Result { - let message = message.to_string(); - self.log_assertion( - match option { - Some(value) => Ok(value), - None => Err(anyhow::Error::from(FailedAssertion(message.clone()))), - }, - message, - ) - } - - #[allow(dead_code)] - pub fn assert_eq( - &mut self, - left: T, - right: T, - message: impl ToString, - ) -> Result<()> { - let message = message.to_string(); - self.log_assertion( - if left == right { - Ok(()) - } else { - println!( - "{}{}", - self.log_prefix, - pretty_assertions::Comparison::new(&left, &right) - ); - Err(anyhow::Error::from(FailedAssertion(message.clone()))) - }, - message, - ) - } - - fn log_assertion(&mut self, result: Result, message: String) -> Result { - if let Some(max) = self.meta.max_assertions { - anyhow::ensure!( - self.assertions.run_count() <= max, - "More assertions were run than the stated max_assertions of {max}" - ); - } - - self.assertions.ran.push(RanAssertion { - id: message.clone(), - result: Ok(RanAssertionResult { - analysis: None, - passed: result.is_ok(), - }), - }); - - if result.is_ok() { - println!("{}✅ {}", self.log_prefix, message); - } else { - println!("{}❌ {}", self.log_prefix, message); - } - - result - } - - pub async fn prompt(&mut self, prompt: impl Into) -> Result { - self.prompt_with_max_turns(prompt, u32::MAX).await - } - - pub async fn prompt_with_max_turns( - &mut self, - prompt: impl Into, - max_turns: u32, - ) -> Result { - let content = vec![UserMessageContent::Text(prompt.into())]; - self.run_turns(Some(content), max_turns).await - } - - pub async fn proceed_with_max_turns(&mut self, max_turns: u32) -> Result { - self.run_turns(None, max_turns).await - } - - async fn run_turns( - &mut self, - prompt: Option>, - max_turns: u32, - ) -> Result { - let tool_metrics = self.tool_metrics.clone(); - let log_prefix = self.log_prefix.clone(); - - let mut remaining_turns = max_turns; - - let mut event_stream = self.agent_thread.update(&mut self.app, |thread, cx| { - if let Some(prompt) = prompt { - let id = UserMessageId::new(); - thread.send(id, prompt, cx) - } else { - thread.proceed(cx) - } - })?; - - let task = self.app.background_spawn(async move { - let mut messages = Vec::new(); - let mut tool_uses_by_id = HashMap::default(); - while let Some(event) = event_stream.next().await { - match event? { - ThreadEvent::UserMessage(user_message) => { - messages.push(Message { - role: Role::User, - text: user_message.to_markdown(), - tool_use: Vec::new(), - }); - } - ThreadEvent::AgentThinking(text) | ThreadEvent::AgentText(text) => { - if matches!( - messages.last(), - Some(Message { - role: Role::Assistant, - .. - }) - ) { - messages.last_mut().unwrap().text.push_str(&text); - } else { - messages.push(Message { - role: Role::Assistant, - text, - tool_use: Vec::new(), - }); - } - } - ThreadEvent::ToolCall(tool_call) => { - let meta = tool_call.meta.expect("Missing meta field in tool_call"); - let tool_name = meta - .get(acp_thread::TOOL_NAME_META_KEY) - .expect("Missing tool_name field in meta") - .as_str() - .expect("Unknown tool_name content in meta"); - - tool_uses_by_id.insert( - tool_call.tool_call_id, - ToolUse { - name: tool_name.to_string(), - value: tool_call.raw_input.unwrap_or_default(), - }, - ); - if matches!( - tool_call.status, - acp::ToolCallStatus::Completed | acp::ToolCallStatus::Failed - ) { - panic!("Tool call completed without update"); - } - } - ThreadEvent::ToolCallUpdate(tool_call_update) => { - if let acp_thread::ToolCallUpdate::UpdateFields(update) = tool_call_update { - if let Some(raw_input) = update.fields.raw_input { - if let Some(tool_use) = - tool_uses_by_id.get_mut(&update.tool_call_id) - { - tool_use.value = raw_input; - } - } - - if matches!( - update.fields.status, - Some(acp::ToolCallStatus::Completed | acp::ToolCallStatus::Failed) - ) { - let succeeded = - update.fields.status == Some(acp::ToolCallStatus::Completed); - - let tool_use = tool_uses_by_id - .remove(&update.tool_call_id) - .expect("Unrecognized tool call completed"); - - let log_message = if succeeded { - format!("✔︎ {}", tool_use.name) - } else { - format!("✖︎ {}", tool_use.name) - }; - println!("{log_prefix}{log_message}"); - - tool_metrics - .lock() - .unwrap() - .insert(tool_use.name.clone().into(), succeeded); - - if let Some(message) = messages.last_mut() { - message.tool_use.push(tool_use); - } else { - messages.push(Message { - role: Role::Assistant, - text: "".to_string(), - tool_use: vec![tool_use], - }); - } - - remaining_turns -= 1; - if remaining_turns == 0 { - return Ok(messages); - } - } - } - } - ThreadEvent::ToolCallAuthorization(_) => panic!( - "{}Bug: Tool confirmation should not be required in eval", - log_prefix - ), - ThreadEvent::Plan(plan) => { - println!("{log_prefix} Got plan: {plan:?}"); - } - ThreadEvent::SubagentSpawned(session) => { - println!("{log_prefix} Got subagent spawn: {session:?}"); - } - ThreadEvent::Retry(status) => { - println!("{log_prefix} Got retry: {status:?}"); - } - ThreadEvent::Stop(stop_reason) => match stop_reason { - acp::StopReason::EndTurn => {} - acp::StopReason::MaxTokens => { - return Err(anyhow!("Exceeded maximum tokens")); - } - acp::StopReason::MaxTurnRequests => { - return Err(anyhow!("Exceeded maximum turn requests")); - } - stop_reason => return Err(anyhow!("{stop_reason:?}")), - }, - } - } - Ok(messages) - }); - - select_biased! { - result = task.fuse() => { - Ok(Response::new(result?)) - } - _ = self.app.background_executor().timer(THREAD_EVENT_TIMEOUT).fuse() => { - anyhow::bail!("Agentic loop stalled - waited {THREAD_EVENT_TIMEOUT:?} without any events"); - } - } - } - - pub fn edits(&self) -> HashMap, FileEdits> { - self.agent_thread.read_with(&self.app, |thread, cx| { - let action_log = thread.action_log().read(cx); - HashMap::from_iter( - action_log - .changed_buffers(cx) - .into_iter() - .map(|(buffer, diff)| { - let snapshot = buffer.read(cx).snapshot(); - - let file = snapshot.file().unwrap(); - let base_text = diff.read(cx).base_text(cx).text(); - - let hunks = diff - .read(cx) - .snapshot(cx) - .hunks(&snapshot) - .map(|hunk| FileEditHunk { - base_text: base_text[hunk.diff_base_byte_range.clone()].to_string(), - text: snapshot - .text_for_range(hunk.range.clone()) - .collect::(), - status: hunk.status(), - }) - .collect(); - - (file.path().clone(), FileEdits { hunks }) - }), - ) - }) - } - - pub fn agent_thread(&self) -> Entity { - self.agent_thread.clone() - } -} - -impl AppContext for ExampleContext { - fn new( - &mut self, - build_entity: impl FnOnce(&mut gpui::Context) -> T, - ) -> Entity { - self.app.new(build_entity) - } - - fn reserve_entity(&mut self) -> gpui::Reservation { - self.app.reserve_entity() - } - - fn insert_entity( - &mut self, - reservation: gpui::Reservation, - build_entity: impl FnOnce(&mut gpui::Context) -> T, - ) -> Entity { - self.app.insert_entity(reservation, build_entity) - } - - fn update_entity( - &mut self, - handle: &Entity, - update: impl FnOnce(&mut T, &mut gpui::Context) -> R, - ) -> R - where - T: 'static, - { - self.app.update_entity(handle, update) - } - - fn as_mut<'a, T>(&'a mut self, handle: &Entity) -> gpui::GpuiBorrow<'a, T> - where - T: 'static, - { - self.app.as_mut(handle) - } - - fn read_entity(&self, handle: &Entity, read: impl FnOnce(&T, &App) -> R) -> R - where - T: 'static, - { - self.app.read_entity(handle, read) - } - - fn update_window(&mut self, window: gpui::AnyWindowHandle, f: F) -> Result - where - F: FnOnce(gpui::AnyView, &mut gpui::Window, &mut App) -> T, - { - self.app.update_window(window, f) - } - - fn read_window( - &self, - window: &gpui::WindowHandle, - read: impl FnOnce(Entity, &App) -> R, - ) -> Result - where - T: 'static, - { - self.app.read_window(window, read) - } - - fn background_spawn( - &self, - future: impl std::future::Future + Send + 'static, - ) -> gpui::Task - where - R: Send + 'static, - { - self.app.background_spawn(future) - } - - fn read_global(&self, callback: impl FnOnce(&G, &App) -> R) -> R - where - G: gpui::Global, - { - self.app.read_global(callback) - } -} - -#[derive(Debug)] -pub struct Response { - messages: Vec, -} - -impl Response { - pub fn new(messages: Vec) -> Self { - Self { messages } - } - - pub fn expect_tool_call( - &self, - tool_name: &'static str, - cx: &mut ExampleContext, - ) -> Result<&ToolUse> { - let result = self.find_tool_call(tool_name); - cx.assert_some(result, format!("called `{}`", tool_name)) - } - - pub fn find_tool_call(&self, tool_name: &str) -> Option<&ToolUse> { - self.messages.iter().rev().find_map(|msg| { - msg.tool_use - .iter() - .find(|tool_use| tool_use.name == tool_name) - }) - } - - pub fn tool_calls(&self) -> impl Iterator { - self.messages.iter().flat_map(|msg| &msg.tool_use) - } - - pub fn texts(&self) -> impl Iterator { - self.messages.iter().map(|message| message.text.clone()) - } -} - -#[derive(Debug)] -pub struct Message { - role: Role, - text: String, - tool_use: Vec, -} - -#[derive(Debug)] -pub struct ToolUse { - pub name: String, - value: serde_json::Value, -} - -impl ToolUse { - pub fn parse_input(&self) -> Result - where - Input: for<'de> serde::Deserialize<'de>, - { - serde_json::from_value::(self.value.clone()).map_err(|err| anyhow!(err)) - } -} - -#[derive(Debug, Eq, PartialEq)] -pub struct FileEdits { - pub hunks: Vec, -} - -#[derive(Debug, Eq, PartialEq)] -pub struct FileEditHunk { - pub base_text: String, - pub text: String, - pub status: DiffHunkStatus, -} - -impl FileEdits { - pub fn has_added_line(&self, line: &str) -> bool { - self.hunks.iter().any(|hunk| { - hunk.status == DiffHunkStatus::added_none() - && hunk.base_text.is_empty() - && hunk.text.contains(line) - }) - } -} diff --git a/crates/eval/src/examples/add_arg_to_trait_method.rs b/crates/eval/src/examples/add_arg_to_trait_method.rs deleted file mode 100644 index 2d06e384b362c2bcbb8101cf00f6908a87f9f71b..0000000000000000000000000000000000000000 --- a/crates/eval/src/examples/add_arg_to_trait_method.rs +++ /dev/null @@ -1,115 +0,0 @@ -use agent_settings::AgentProfileId; -use anyhow::Result; -use async_trait::async_trait; -use util::rel_path::RelPath; - -use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion, LanguageServer}; - -pub struct AddArgToTraitMethod; - -#[async_trait(?Send)] -impl Example for AddArgToTraitMethod { - fn meta(&self) -> ExampleMetadata { - ExampleMetadata { - name: "add_arg_to_trait_method".to_string(), - url: "https://github.com/zed-industries/zed.git".to_string(), - revision: "f69aeb6311dde3c0b8979c293d019d66498d54f2".to_string(), - language_server: Some(LanguageServer { - file_extension: "rs".to_string(), - allow_preexisting_diagnostics: false, - }), - max_assertions: None, - profile_id: AgentProfileId::default(), - existing_thread_json: None, - max_turns: None, - } - } - - async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> { - const FILENAME: &str = "assistant_tool.rs"; - let _ = cx.prompt(format!( - r#" - Add a `window: Option` argument to the `Tool::run` trait method in {FILENAME}, - and update all the implementations of the trait and call sites accordingly. - "# - )).await?; - - // Adds ignored argument to all but `batch_tool` - - let add_ignored_window_paths = &[ - "code_action_tool", - "code_symbols_tool", - "contents_tool", - "copy_path_tool", - "create_directory_tool", - "create_file_tool", - "delete_path_tool", - "diagnostics_tool", - "edit_file_tool", - "fetch_tool", - "grep_tool", - "list_directory_tool", - "move_path_tool", - "now_tool", - "open_tool", - "path_search_tool", - "read_file_tool", - "rename_tool", - "symbol_info_tool", - "terminal_tool", - "web_search_tool", - ]; - - let edits = cx.edits(); - - for tool_name in add_ignored_window_paths { - let path_str = format!("crates/assistant_tools/src/{}.rs", tool_name); - let edits = edits.get(RelPath::unix(&path_str).unwrap()); - - let ignored = edits.is_some_and(|edits| { - edits.has_added_line(" _window: Option,\n") - }); - let uningored = edits.is_some_and(|edits| { - edits.has_added_line(" window: Option,\n") - }); - - cx.assert(ignored || uningored, format!("Argument: {}", tool_name)) - .ok(); - - cx.assert(ignored, format!("`_` prefix: {}", tool_name)) - .ok(); - } - - // Adds unignored argument to `batch_tool` - - let batch_tool_edits = - edits.get(RelPath::unix("crates/assistant_tools/src/batch_tool.rs").unwrap()); - - cx.assert( - batch_tool_edits.is_some_and(|edits| { - edits.has_added_line(" window: Option,\n") - }), - "Argument: batch_tool", - ) - .ok(); - - Ok(()) - } - - fn diff_assertions(&self) -> Vec { - vec![ - JudgeAssertion { - id: "batch tool passes window to each".to_string(), - description: - "batch_tool is modified to pass a clone of the window to each tool it calls." - .to_string(), - }, - JudgeAssertion { - id: "tool tests updated".to_string(), - description: - "tool tests are updated to pass the new `window` argument (`None` is ok)." - .to_string(), - }, - ] - } -} diff --git a/crates/eval/src/examples/code_block_citations.rs b/crates/eval/src/examples/code_block_citations.rs deleted file mode 100644 index 4fe7aa81124ca3fa8f84cd5145e83bd710fdf461..0000000000000000000000000000000000000000 --- a/crates/eval/src/examples/code_block_citations.rs +++ /dev/null @@ -1,218 +0,0 @@ -use agent_settings::AgentProfileId; -use anyhow::Result; -use async_trait::async_trait; -use markdown::PathWithRange; - -use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion, LanguageServer}; - -pub struct CodeBlockCitations; - -const FENCE: &str = "```"; - -#[async_trait(?Send)] -impl Example for CodeBlockCitations { - fn meta(&self) -> ExampleMetadata { - ExampleMetadata { - name: "code_block_citations".to_string(), - url: "https://github.com/zed-industries/zed.git".to_string(), - revision: "f69aeb6311dde3c0b8979c293d019d66498d54f2".to_string(), - language_server: Some(LanguageServer { - file_extension: "rs".to_string(), - allow_preexisting_diagnostics: false, - }), - max_assertions: None, - profile_id: AgentProfileId::default(), - existing_thread_json: None, - max_turns: None, - } - } - - async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> { - const FILENAME: &str = "assistant_tool.rs"; - - // Verify that the messages all have the correct formatting. - let texts: Vec = cx - .prompt(format!( - r#" - Show me the method bodies of all the methods of the `Tool` trait in {FILENAME}. - - Please show each method in a separate code snippet. - "# - )) - .await? - .texts() - .collect(); - let closing_fence = format!("\n{FENCE}"); - - for text in texts.iter() { - let mut text = text.as_str(); - - while let Some(index) = text.find(FENCE) { - // Advance text past the opening backticks. - text = &text[index + FENCE.len()..]; - - // Find the closing backticks. - let content_len = text.find(&closing_fence); - - // Verify the citation format - e.g. ```path/to/foo.txt#L123-456 - if let Some(citation_len) = text.find('\n') { - let citation = &text[..citation_len]; - - if let Ok(()) = - cx.assert(citation.contains("/"), format!("Slash in {citation:?}",)) - { - let path_range = PathWithRange::new(citation); - let path = cx.agent_thread().update(cx, |thread, cx| { - thread - .project() - .read(cx) - .find_project_path(path_range.path.as_ref(), cx) - }); - - if let Ok(path) = cx.assert_some(path, format!("Valid path: {citation:?}")) - { - let buffer_text = { - let buffer = cx - .agent_thread() - .update(cx, |thread, cx| { - thread - .project() - .update(cx, |project, cx| project.open_buffer(path, cx)) - }) - .await - .ok(); - - let Ok(buffer_text) = cx.assert_some( - buffer.map(|buffer| { - buffer.read_with(cx, |buffer, _| buffer.text()) - }), - "Reading buffer text succeeded", - ) else { - continue; - }; - buffer_text - }; - - if let Some(content_len) = content_len { - // + 1 because there's a newline character after the citation. - let start_index = citation.len() + 1; - let end_index = content_len.saturating_sub(start_index); - - if cx - .assert( - start_index <= end_index, - "Code block had a valid citation", - ) - .is_ok() - { - let content = &text[start_index..end_index]; - - // deindent (trim the start of each line) because sometimes the model - // chooses to deindent its code snippets for the sake of readability, - // which in markdown is not only reasonable but usually desirable. - cx.assert( - deindent(&buffer_text) - .trim() - .contains(deindent(&content).trim()), - "Code block content was found in file", - ) - .ok(); - - if let Some(range) = path_range.range { - let start_line_index = range.start.line.saturating_sub(1); - let line_count = - range.end.line.saturating_sub(start_line_index); - let mut snippet = buffer_text - .lines() - .skip(start_line_index as usize) - .take(line_count as usize) - .collect::>() - .join("\n"); - - if let Some(start_col) = range.start.col { - snippet = snippet[start_col as usize..].to_string(); - } - - if let Some(end_col) = range.end.col { - let last_line = snippet.lines().last().unwrap(); - snippet = snippet[..snippet.len() - last_line.len() - + end_col as usize] - .to_string(); - } - - // deindent (trim the start of each line) because sometimes the model - // chooses to deindent its code snippets for the sake of readability, - // which in markdown is not only reasonable but usually desirable. - cx.assert_eq( - deindent(snippet.as_str()).trim(), - deindent(content).trim(), - format!( - "Code block was at {:?}-{:?}", - range.start, range.end - ), - ) - .ok(); - } - } - } - } - } - } else { - cx.assert( - false, - format!("Opening {FENCE} did not have a newline anywhere after it."), - ) - .ok(); - } - - if let Some(content_len) = content_len { - // Advance past the closing backticks - text = &text[content_len + FENCE.len()..]; - } else { - // There were no closing backticks associated with these opening backticks. - cx.assert( - false, - "Code block opening had matching closing backticks.".to_string(), - ) - .ok(); - - // There are no more code blocks to parse, so we're done. - break; - } - } - } - - Ok(()) - } - - fn thread_assertions(&self) -> Vec { - vec![ - JudgeAssertion { - id: "trait method bodies are shown".to_string(), - description: - "All method bodies of the Tool trait are shown." - .to_string(), - }, - JudgeAssertion { - id: "code blocks used".to_string(), - description: - "All code snippets are rendered inside markdown code blocks (as opposed to any other formatting besides code blocks)." - .to_string(), - }, - JudgeAssertion { - id: "code blocks use backticks".to_string(), - description: - format!("All markdown code blocks use backtick fences ({FENCE}) rather than indentation.") - } - ] - } -} - -fn deindent(as_str: impl AsRef) -> String { - as_str - .as_ref() - .lines() - .map(|line| line.trim_start()) - .collect::>() - .join("\n") -} diff --git a/crates/eval/src/examples/comment_translation.rs b/crates/eval/src/examples/comment_translation.rs deleted file mode 100644 index 421999893a5a39b3d6f61c22d405bf90528758e7..0000000000000000000000000000000000000000 --- a/crates/eval/src/examples/comment_translation.rs +++ /dev/null @@ -1,60 +0,0 @@ -use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion}; -use agent::{EditFileMode, EditFileToolInput}; -use agent_settings::AgentProfileId; -use anyhow::Result; -use async_trait::async_trait; - -pub struct CommentTranslation; - -#[async_trait(?Send)] -impl Example for CommentTranslation { - fn meta(&self) -> ExampleMetadata { - ExampleMetadata { - name: "comment_translation".to_string(), - url: "https://github.com/servo/font-kit.git".to_string(), - revision: "504d084e29bce4f60614bc702e91af7f7d9e60ad".to_string(), - language_server: None, - max_assertions: Some(1), - profile_id: AgentProfileId::default(), - existing_thread_json: None, - max_turns: None, - } - } - - async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> { - let response = cx.prompt( - r#" - Edit the following files and translate all their comments to italian, in this exact order: - - - font-kit/src/family.rs - - font-kit/src/canvas.rs - - font-kit/src/error.rs - "# - ).await?; - - let mut create_or_overwrite_count = 0; - for tool_call in response.tool_calls() { - if tool_call.name == "edit_file" { - let input = tool_call.parse_input::()?; - if !matches!(input.mode, EditFileMode::Edit) { - create_or_overwrite_count += 1; - } - } - } - - cx.assert_eq(create_or_overwrite_count, 0, "no_creation_or_overwrite")?; - - Ok(()) - } - - fn diff_assertions(&self) -> Vec { - vec![JudgeAssertion { - id: "comments_translated".to_string(), - description: concat!( - "- Only `family.rs`, `canvas.rs` and `error.rs` should have changed.\n", - "- Their doc comments should have been all translated to Italian." - ) - .into(), - }] - } -} diff --git a/crates/eval/src/examples/file_change_notification.rs b/crates/eval/src/examples/file_change_notification.rs deleted file mode 100644 index 10683ec6509cece9d8d26039ff36ff458bdf418a..0000000000000000000000000000000000000000 --- a/crates/eval/src/examples/file_change_notification.rs +++ /dev/null @@ -1,74 +0,0 @@ -use agent_settings::AgentProfileId; -use anyhow::Result; -use async_trait::async_trait; - -use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion}; - -pub struct FileChangeNotificationExample; - -#[async_trait(?Send)] -impl Example for FileChangeNotificationExample { - fn meta(&self) -> ExampleMetadata { - ExampleMetadata { - name: "file_change_notification".to_string(), - url: "https://github.com/octocat/hello-world".to_string(), - revision: "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d".to_string(), - language_server: None, - max_assertions: None, - profile_id: AgentProfileId::default(), - existing_thread_json: None, - max_turns: Some(3), - } - } - - async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> { - // Track README so that the model gets notified of its changes - let project_path = cx.agent_thread().read_with(cx, |thread, cx| { - thread - .project() - .read(cx) - .find_project_path("README", cx) - .expect("README file should exist in this repo") - }); - - let buffer = { - cx.agent_thread() - .update(cx, |thread, cx| { - thread - .project() - .update(cx, |project, cx| project.open_buffer(project_path, cx)) - }) - .await? - }; - - cx.agent_thread().update(cx, |thread, cx| { - thread.action_log().update(cx, |action_log, cx| { - action_log.buffer_read(buffer.clone(), cx); - }); - }); - - // Start conversation (specific message is not important) - cx.prompt_with_max_turns("Find all files in this repo", 1) - .await?; - - // Edit the README buffer - the model should get a notification on next turn - buffer.update(cx, |buffer, cx| { - buffer.edit([(0..buffer.len(), "Surprise!")], None, cx); - }); - - // Run for some more turns. - // The model shouldn't thank us for letting it know about the file change. - cx.proceed_with_max_turns(3).await?; - - Ok(()) - } - - fn thread_assertions(&self) -> Vec { - vec![JudgeAssertion { - id: "change-file-notification".into(), - description: - "Agent should not acknowledge or mention anything about files that have been changed" - .into(), - }] - } -} diff --git a/crates/eval/src/examples/file_search.rs b/crates/eval/src/examples/file_search.rs deleted file mode 100644 index 7de7a07d19184b473fd2cb5ba29b270431b71a4c..0000000000000000000000000000000000000000 --- a/crates/eval/src/examples/file_search.rs +++ /dev/null @@ -1,55 +0,0 @@ -use agent::FindPathToolInput; -use agent_settings::AgentProfileId; -use anyhow::Result; -use async_trait::async_trait; -use regex::Regex; - -use crate::example::{Example, ExampleContext, ExampleMetadata}; - -pub struct FileSearchExample; - -#[async_trait(?Send)] -impl Example for FileSearchExample { - fn meta(&self) -> ExampleMetadata { - ExampleMetadata { - name: "file_search".to_string(), - url: "https://github.com/zed-industries/zed.git".to_string(), - revision: "03ecb88fe30794873f191ddb728f597935b3101c".to_string(), - language_server: None, - max_assertions: Some(3), - profile_id: AgentProfileId::default(), - existing_thread_json: None, - max_turns: None, - } - } - - async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> { - const FILENAME: &str = "find_replace_file_tool.rs"; - - let prompt = format!( - r#" - Look at the `{FILENAME}`. I want to implement a card for it. The card should implement the `Render` trait. - - The card should show a diff. It should be a beautifully presented diff. The card "box" should look like what we show for - markdown codeblocks (look at `MarkdownElement`). I want to see a red background for lines that were deleted and a green - background for lines that were added. We should have a div per diff line. - "# - ); - - let response = cx.prompt_with_max_turns(prompt, 1).await?; - let tool_use = response.expect_tool_call("find_path", cx)?; - let input = tool_use.parse_input::()?; - - let glob = input.glob; - cx.assert(glob.ends_with(FILENAME), "glob ends with file name")?; - - let without_filename = glob.replace(FILENAME, ""); - let matches = Regex::new("(\\*\\*|zed)/(\\*\\*?/)?") - .unwrap() - .is_match(&without_filename); - - cx.assert(matches, "glob starts with `**` or project")?; - - Ok(()) - } -} diff --git a/crates/eval/src/examples/find_and_replace_diff_card.toml b/crates/eval/src/examples/find_and_replace_diff_card.toml deleted file mode 100644 index 0e1b9c3972d8cd54bbbb6f066befb273cc6e0abc..0000000000000000000000000000000000000000 --- a/crates/eval/src/examples/find_and_replace_diff_card.toml +++ /dev/null @@ -1,43 +0,0 @@ -url = "https://github.com/zed-industries/zed.git" -revision = "38fcadf9481d018543c65f36ac3bafeba190179b" -language_extension = "rs" - -prompt = """ -Look at the `find_replace_file_tool.rs`. I want to implement a card for it. -The card should implement the `Render` trait. - -The card should show a diff. It should be a beautifully presented diff. -The card "box" should look like what we show for markdown codeblocks (look at `MarkdownElement`). -I want to see a red background for lines that were deleted and a green background for lines -that were added. We should have a div per diff line. -""" - -[diff_assertions] - -modify_find_and_replace_tool = """ -The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct. -The struct should contain an `output` field that is the same as the task we were returning before, -and a new `card` field that contains a view for the card. -""" - -card_implementation = """ -The card should be a view that displays a diff. -Each line in the diff should be colored according to whether it was added, removed or unchanged. -""" - -[thread_assertions] - -path_search = """ -The first tool call should be to path search including "find_replace_file_tool.rs" in the string. -(*Not* grep, for example, or reading the file based on a guess at the path.) -This is because we gave the model a filename and it needs to turn that into a real path. -""" - -read_file_from_path_search = """ -After obtaining the correct path of "zed/crates/assistant_tools/src/find_replace_file_tool.rs", it should read the contents of that path. -""" - -symbol_search = """ -When trying to find information about the Render trait, it should *not* begin with a path search, because it doesn't yet have any information -on what path the Render trait might be in. -""" diff --git a/crates/eval/src/examples/grep_params_escapement.rs b/crates/eval/src/examples/grep_params_escapement.rs deleted file mode 100644 index d4ba25cfcba60c66aa4a3b7fd1d93d778df1d9e8..0000000000000000000000000000000000000000 --- a/crates/eval/src/examples/grep_params_escapement.rs +++ /dev/null @@ -1,59 +0,0 @@ -use agent::GrepToolInput; -use agent_settings::AgentProfileId; -use anyhow::Result; -use async_trait::async_trait; - -use crate::example::{Example, ExampleContext, ExampleMetadata}; - -pub struct GrepParamsEscapementExample; - -/* - -This eval checks that the model doesn't use HTML escapement for characters like `<` and -`>` in tool parameters. - - original +system_prompt change +tool description - claude-opus-4 89% 92% 97%+ - claude-sonnet-4 100% - gpt-5-mini 100% - gemini-2.5-pro 98% - -*/ - -#[async_trait(?Send)] -impl Example for GrepParamsEscapementExample { - fn meta(&self) -> ExampleMetadata { - ExampleMetadata { - name: "grep_params_escapement".to_string(), - url: "https://github.com/octocat/hello-world".to_string(), - revision: "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d".to_string(), - language_server: None, - max_assertions: Some(1), - profile_id: AgentProfileId::default(), - existing_thread_json: None, - max_turns: Some(2), - } - } - - async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> { - let response = cx - .prompt_with_max_turns("Search for files containing the characters `>` or `<`", 2) - .await?; - let grep_input = response - .find_tool_call("grep") - .and_then(|tool_use| tool_use.parse_input::().ok()); - - cx.assert_some(grep_input.as_ref(), "`grep` tool should be called")?; - - cx.assert( - !contains_html_entities(&grep_input.unwrap().regex), - "Tool parameters should not be escaped", - ) - } -} - -fn contains_html_entities(pattern: &str) -> bool { - regex::Regex::new(r"&[a-zA-Z]+;|&#[0-9]+;|&#x[0-9a-fA-F]+;") - .unwrap() - .is_match(pattern) -} diff --git a/crates/eval/src/examples/hallucinated_tool_calls.toml b/crates/eval/src/examples/hallucinated_tool_calls.toml deleted file mode 100644 index f12f01affef576bc8ada0b34efe57709303c1e81..0000000000000000000000000000000000000000 --- a/crates/eval/src/examples/hallucinated_tool_calls.toml +++ /dev/null @@ -1,13 +0,0 @@ -url = "https://github.com/jlowin/fastmcp" -revision = "a2c1e14e5d83af1c32b76280ab368df199c4e860" -language_extension = "py" - -prompt = "Write a LICENSE file just saying 'Apache 2.0' and nothing else" - -profile_name = "ask" - -[thread_assertions] - -no_edit_attempts = """The agent should not claim that it edited or created the file. It should not pretend making any changes.""" - -mention_insufficient_tools = """Agent should mention that it doesn't have relevant tools needed to make the change.""" diff --git a/crates/eval/src/examples/mod.rs b/crates/eval/src/examples/mod.rs deleted file mode 100644 index aec1bce07957fb81c17666b3e64b00a1fa47240f..0000000000000000000000000000000000000000 --- a/crates/eval/src/examples/mod.rs +++ /dev/null @@ -1,173 +0,0 @@ -use agent_settings::AgentProfileId; -use anyhow::Result; -use async_trait::async_trait; -use serde::Deserialize; -use std::collections::BTreeMap; -use std::fs; -use std::{ - path::{Path, PathBuf}, - rc::Rc, -}; -use util::serde::default_true; - -use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion}; - -mod add_arg_to_trait_method; -mod code_block_citations; -mod comment_translation; -mod file_change_notification; -mod file_search; -mod grep_params_escapement; -mod overwrite_file; -mod planets; - -pub fn all(examples_dir: &Path) -> Vec> { - let mut threads: Vec> = vec![ - Rc::new(file_search::FileSearchExample), - Rc::new(add_arg_to_trait_method::AddArgToTraitMethod), - Rc::new(code_block_citations::CodeBlockCitations), - Rc::new(planets::Planets), - Rc::new(comment_translation::CommentTranslation), - Rc::new(overwrite_file::FileOverwriteExample), - Rc::new(file_change_notification::FileChangeNotificationExample), - Rc::new(grep_params_escapement::GrepParamsEscapementExample), - ]; - - for example_path in list_declarative_examples(examples_dir).unwrap() { - threads.push(Rc::new(DeclarativeExample::load(&example_path).unwrap())); - } - - threads -} - -struct DeclarativeExample { - metadata: ExampleMetadata, - prompt: String, - diff_assertions: Vec, - thread_assertions: Vec, -} - -impl DeclarativeExample { - pub fn load(example_path: &Path) -> Result { - let name = Self::name_from_path(example_path); - let base: ExampleToml = toml::from_str(&fs::read_to_string(&example_path)?)?; - let example_dir = example_path.parent().unwrap(); - - let language_server = if base.require_lsp { - Some(crate::example::LanguageServer { - file_extension: base - .language_extension - .expect("Language extension is required when require_lsp = true"), - allow_preexisting_diagnostics: base.allow_preexisting_diagnostics, - }) - } else { - None - }; - - let profile_id = if let Some(profile_name) = base.profile_name { - AgentProfileId(profile_name.into()) - } else { - AgentProfileId::default() - }; - - let existing_thread_json = if let Some(path) = base.existing_thread_path { - let content = fs::read_to_string(example_dir.join(&path)) - .unwrap_or_else(|_| panic!("Failed to read existing thread file: {}", path)); - Some(content) - } else { - None - }; - - let metadata = ExampleMetadata { - name, - url: base.url, - revision: base.revision, - language_server, - max_assertions: None, - profile_id, - existing_thread_json, - max_turns: base.max_turns, - }; - - Ok(DeclarativeExample { - metadata, - prompt: base.prompt, - thread_assertions: base - .thread_assertions - .into_iter() - .map(|(id, description)| JudgeAssertion { id, description }) - .collect(), - diff_assertions: base - .diff_assertions - .into_iter() - .map(|(id, description)| JudgeAssertion { id, description }) - .collect(), - }) - } - - pub fn name_from_path(path: &Path) -> String { - path.file_stem().unwrap().to_string_lossy().into_owned() - } -} - -#[derive(Clone, Debug, Deserialize)] -pub struct ExampleToml { - pub url: String, - pub revision: String, - pub language_extension: Option, - #[expect( - unused, - reason = "This field was found to be unused with serde library bump; it's left as is due to insufficient context on PO's side, but it *may* be fine to remove" - )] - pub insert_id: Option, - #[serde(default = "default_true")] - pub require_lsp: bool, - #[serde(default)] - pub allow_preexisting_diagnostics: bool, - pub prompt: String, - #[serde(default)] - pub profile_name: Option, - #[serde(default)] - pub diff_assertions: BTreeMap, - #[serde(default)] - pub thread_assertions: BTreeMap, - #[serde(default)] - pub existing_thread_path: Option, - #[serde(default)] - pub max_turns: Option, -} - -#[async_trait(?Send)] -impl Example for DeclarativeExample { - fn meta(&self) -> ExampleMetadata { - self.metadata.clone() - } - - async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> { - let max_turns = self.metadata.max_turns.unwrap_or(1000); - let _ = cx.prompt_with_max_turns(&self.prompt, max_turns).await; - Ok(()) - } - - fn diff_assertions(&self) -> Vec { - self.diff_assertions.clone() - } - - fn thread_assertions(&self) -> Vec { - self.thread_assertions.clone() - } -} - -fn list_declarative_examples(examples_dir: &Path) -> Result> { - let path = std::fs::canonicalize(examples_dir).unwrap(); - let entries = std::fs::read_dir(path).unwrap(); - let mut result_paths = Vec::new(); - for entry in entries { - let entry = entry?; - let path = entry.path(); - if path.extension() == Some("toml".as_ref()) { - result_paths.push(path); - } - } - Ok(result_paths) -} diff --git a/crates/eval/src/examples/no_tools_enabled.toml b/crates/eval/src/examples/no_tools_enabled.toml deleted file mode 100644 index 8f8f66244ae74220ba02d04d85e25e0b55271f6c..0000000000000000000000000000000000000000 --- a/crates/eval/src/examples/no_tools_enabled.toml +++ /dev/null @@ -1,19 +0,0 @@ -url = "https://github.com/zed-industries/zed" -revision = "main" -require_lsp = false -prompt = """ -I need to explore the codebase to understand what files are available in the project. What can you tell me about the structure of the codebase? - -Please find all uses of the 'find_path' function in the src directory. - -Also, can you tell me what the capital of France is? And how does garbage collection work in programming languages? -""" - -profile_name = "minimal" - -[thread_assertions] -no_hallucinated_tool_calls = """The agent should not hallucinate tool calls - for example, by writing markdown code blocks that simulate commands like `find`, `grep`, `ls`, etc. - since no tools are available. However, it is totally fine if the agent describes to the user what should be done, e.g. telling the user \"You can run `find` to...\" etc.""" - -doesnt_hallucinate_file_paths = """The agent should not make up file paths or pretend to know the structure of the project when tools are not available.""" - -correctly_answers_general_questions = """The agent should correctly answer general knowledge questions about the capital of France and garbage collection without asking for more context, demonstrating it can still be helpful with areas it knows about.""" diff --git a/crates/eval/src/examples/overwrite_file.rs b/crates/eval/src/examples/overwrite_file.rs deleted file mode 100644 index a4df1e97a3f4d9c66262f8679d93324e53df9d53..0000000000000000000000000000000000000000 --- a/crates/eval/src/examples/overwrite_file.rs +++ /dev/null @@ -1,51 +0,0 @@ -use agent::{EditFileMode, EditFileToolInput}; -use agent_settings::AgentProfileId; -use anyhow::Result; -use async_trait::async_trait; - -use crate::example::{Example, ExampleContext, ExampleMetadata}; - -pub struct FileOverwriteExample; - -/* -This eval tests a fix for a destructive behavior of the `edit_file` tool. -Previously, it would rewrite existing files too aggressively, which often -resulted in content loss. - -Model | Pass rate -----------------|---------- -Sonnet 3.7 | 100% -Gemini 2.5 Pro | 80% -*/ - -#[async_trait(?Send)] -impl Example for FileOverwriteExample { - fn meta(&self) -> ExampleMetadata { - let thread_json = include_str!("threads/overwrite-file.json"); - - ExampleMetadata { - name: "file_overwrite".to_string(), - url: "https://github.com/zed-industries/zed.git".to_string(), - revision: "023a60806a8cc82e73bd8d88e63b4b07fc7a0040".to_string(), - language_server: None, - max_assertions: Some(1), - profile_id: AgentProfileId::default(), - existing_thread_json: Some(thread_json.to_string()), - max_turns: None, - } - } - - async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> { - let response = cx.proceed_with_max_turns(1).await?; - let tool_use = response.expect_tool_call("edit_file", cx)?; - let input = tool_use.parse_input::()?; - let file_overwritten = match input.mode { - EditFileMode::Edit => false, - EditFileMode::Create | EditFileMode::Overwrite => { - input.path.ends_with("src/language_model_selector.rs") - } - }; - - cx.assert(!file_overwritten, "File should be edited, not overwritten") - } -} diff --git a/crates/eval/src/examples/planets.rs b/crates/eval/src/examples/planets.rs deleted file mode 100644 index 1ef257a55db82abe3dab9ef006176df4b12cec5f..0000000000000000000000000000000000000000 --- a/crates/eval/src/examples/planets.rs +++ /dev/null @@ -1,75 +0,0 @@ -use agent::{AgentTool, OpenTool, TerminalTool}; -use agent_settings::AgentProfileId; -use anyhow::Result; -use async_trait::async_trait; - -use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion}; - -pub struct Planets; - -#[async_trait(?Send)] -impl Example for Planets { - fn meta(&self) -> ExampleMetadata { - ExampleMetadata { - name: "planets".to_string(), - url: "https://github.com/roc-lang/roc".to_string(), // This commit in this repo is just the Apache2 license, - revision: "59e49c75214f60b4dc4a45092292061c8c26ce27".to_string(), // so effectively a blank project. - language_server: None, - max_assertions: None, - profile_id: AgentProfileId::default(), - existing_thread_json: None, - max_turns: None, - } - } - - async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> { - let response = cx - .prompt( - r#" - Make a plain JavaScript web page which renders an animated 3D solar system. - Let me drag to rotate the camera around. - Do not use npm. - "#, - ) - .await?; - let mut open_tool_uses = 0; - let mut terminal_tool_uses = 0; - - for tool_use in response.tool_calls() { - if tool_use.name == OpenTool::NAME { - open_tool_uses += 1; - } else if tool_use.name == TerminalTool::NAME { - terminal_tool_uses += 1; - } - } - - // The open tool should only be used when requested, which it was not. - cx.assert_eq(open_tool_uses, 0, "`open` tool was not used") - .ok(); - // No reason to use the terminal if not using npm. - cx.assert_eq(terminal_tool_uses, 0, "`terminal` tool was not used") - .ok(); - - Ok(()) - } - - fn diff_assertions(&self) -> Vec { - vec![ - JudgeAssertion { - id: "animated solar system".to_string(), - description: "This page should render a solar system, and it should be animated." - .to_string(), - }, - JudgeAssertion { - id: "drag to rotate camera".to_string(), - description: "The user can drag to rotate the camera around.".to_string(), - }, - JudgeAssertion { - id: "plain JavaScript".to_string(), - description: - "The code base uses plain JavaScript and no npm, along with HTML and CSS." - .to_string(), - }, - ] - } -} diff --git a/crates/eval/src/examples/threads/overwrite-file.json b/crates/eval/src/examples/threads/overwrite-file.json deleted file mode 100644 index 392ccde5b8e064bdb9d4a124f38e7a99ca6561f3..0000000000000000000000000000000000000000 --- a/crates/eval/src/examples/threads/overwrite-file.json +++ /dev/null @@ -1,262 +0,0 @@ -{ - "completion_mode": "normal", - "cumulative_token_usage": { - "cache_creation_input_tokens": 18383, - "cache_read_input_tokens": 97250, - "input_tokens": 45, - "output_tokens": 776 - }, - "detailed_summary_state": "NotGenerated", - "exceeded_window_error": null, - "initial_project_snapshot": { - "timestamp": "2025-05-08T14:31:16.701157512Z", - "unsaved_buffer_paths": [], - "worktree_snapshots": [ - { - "git_state": { - "current_branch": null, - "diff": "diff --git a/crates/language_model_selector/src/language_model_selector.rs b/crates/language_model_selector/src/language_model_selector.rs\nindex 6775bee98a..e25c9e1415 100644\n--- a/crates/language_model_selector/src/language_model_selector.rs\n+++ b/crates/language_model_selector/src/language_model_selector.rs\n@@ -410,7 +410,8 @@ impl ModelMatcher {\n }\n \n pub fn is_match(self: &Self, info: &ModelInfo) -> bool {\n- self.matched_ids.contains(&info.model.id().0)\n+ let q = (info.model.provider_id(), info.model.id());\n+ self.matched_models.contains(&q)\n }\n }\n \n", - "head_sha": "9245656485e58a5d6d717d82209bc8c57cb9c539", - "remote_url": "git@github.com:zed-industries/zed.git" - }, - "worktree_path": "/home/silver/develop/zed" - } - ] - }, - "messages": [ - { - "context": "\n\nThe following items were attached by the user. They are up-to-date and don't need to be re-read.\n\n\n```rs zed/crates/language_model_selector/src/language_model_selector.rs\nconst TRY_ZED_PRO_URL [L28]\ntype OnModelChanged [L30]\ntype GetActiveModel [L31]\npub struct LanguageModelSelector [L33-37]\n picker [L34]\n _authenticate_all_providers_task [L35]\n _subscriptions [L36]\nimpl LanguageModelSelector [L39-231]\n pub fn new [L40-81]\n fn handle_language_model_registry_event [L83-104]\n fn authenticate_all_providers [L110-154]\n fn all_models [L156-204]\n pub fn active_model [L206-208]\n fn get_active_model_index [L210-230]\nimpl EventEmitter for LanguageModelSelector [L233]\nimpl Focusable for LanguageModelSelector [L235-239]\n fn focus_handle [L236-238]\nimpl Render for LanguageModelSelector [L241-245]\n fn render [L242-244]\npub struct LanguageModelSelectorPopoverMenu [L248-258]\n language_model_selector [L253]\n trigger [L254]\n tooltip [L255]\n handle [L256]\n anchor [L257]\nimpl LanguageModelSelectorPopoverMenu [L260-284]\n pub fn new [L265-278]\n pub fn with_handle [L280-283]\nimpl RenderOnce for LanguageModelSelectorPopoverMenu [L286-304]\n fn render [L291-303]\nstruct ModelInfo [L307-310]\n model [L308]\n icon [L309]\npub struct LanguageModelPickerDelegate [L312-319]\n language_model_selector [L313]\n on_model_changed [L314]\n get_active_model [L315]\n all_models [L316]\n filtered_entries [L317]\n selected_index [L318]\nstruct GroupedModels [L321-324]\n recommended [L322]\n other [L323]\nimpl GroupedModels [L326-385]\n pub fn new [L327-342]\n fn entries [L344-370]\n fn model_infos [L372-384]\nenum LanguageModelPickerEntry [L387-390]\n Model [L388]\n Separator [L389]\nstruct ModelMatcher [L392-396]\n models [L393]\n bg_executor [L394]\n candidates [L395]\nimpl ModelMatcher [L398-452]\n fn new [L399-406]\n pub fn fuzzy_search [L408-424]\n pub fn exact_search [L426-438]\n fn make_match_candidates [L440-451]\nimpl PickerDelegate for LanguageModelPickerDelegate [L454-693]\n type ListItem [L455]\n fn match_count [L457-459]\n fn selected_index [L461-463]\n fn set_selected_index [L465-468]\n fn can_select [L470-480]\n fn placeholder_text [L482-484]\n fn update_matches [L486-546]\n fn confirm [L548-560]\n fn dismissed [L562-566]\n fn render_match [L568-636]\n fn render_footer [L638-692]\n\nShowing symbols 1-71 (total symbols: 71)\n\n```\n\n\n", - "creases": [ - { - "end": 140, - "icon_path": "icons/file_icons/rust.svg", - "label": "language_model_selector.rs", - "start": 38 - } - ], - "id": 0, - "role": "user", - "segments": [ - { - "text": "I recently introduced fuzzy search in [@language_model_selector.rs](@file:zed/crates/language_model_selector/src/language_model_selector.rs). Can you write unit-test that check it? You can find a test list of models on images. ", - "type": "text" - } - ], - "tool_results": [], - "tool_uses": [] - }, - { - "context": "", - "creases": [], - "id": 1, - "role": "assistant", - "segments": [ - { - "text": "I'll help you write unit tests for the fuzzy search functionality in the language model selector. I see that you want to test the fuzzy search with the models shown in the images - both Claude models (3, 3.5, 3.7) and GPT models (GPT-3.5, GPT-4, GPT-4o, etc.).\n\nLet's start by examining the code structure to understand how the fuzzy search works.", - "type": "text" - } - ], - "tool_results": [ - { - "content": "zed/crates/language_model_selector/Cargo.toml\nzed/crates/language_model_selector/LICENSE-GPL\nzed/crates/language_model_selector/src\n", - "is_error": false, - "output": null, - "tool_use_id": "toolu_01FvYMtHs763GL1wA5mpTpqn" - } - ], - "tool_uses": [ - { - "id": "toolu_01FvYMtHs763GL1wA5mpTpqn", - "input": { - "path": "zed/crates/language_model_selector" - }, - "name": "list_directory" - } - ] - }, - { - "context": "", - "creases": [], - "id": 2, - "role": "assistant", - "segments": [ - { - "text": "Let's look at the structure of the tests directory or see if there are existing tests:", - "type": "text" - } - ], - "tool_results": [ - { - "content": "zed/crates/language_model_selector/src/language_model_selector.rs\n", - "is_error": false, - "output": null, - "tool_use_id": "toolu_01KSLDsLLrf7tgixB1sge3cm" - } - ], - "tool_uses": [ - { - "id": "toolu_01KSLDsLLrf7tgixB1sge3cm", - "input": { - "path": "zed/crates/language_model_selector/src" - }, - "name": "list_directory" - } - ] - }, - { - "context": "", - "creases": [], - "id": 3, - "role": "assistant", - "segments": [ - { - "text": "Let's check the Cargo.toml to understand the dependencies:", - "type": "text" - } - ], - "tool_results": [ - { - "content": "[package]\nname = \"language_model_selector\"\nversion = \"0.1.0\"\nedition.workspace = true\npublish.workspace = true\nlicense = \"GPL-3.0-or-later\"\n\n[lints]\nworkspace = true\n\n[lib]\npath = \"src/language_model_selector.rs\"\n\n[dependencies]\ncollections.workspace = true\nfeature_flags.workspace = true\nfuzzy.workspace = true\ngpui.workspace = true\nlanguage_model.workspace = true\nlog.workspace = true\npicker.workspace = true\nproto.workspace = true\nui.workspace = true\n\nzed_actions.workspace = true\n", - "is_error": false, - "output": null, - "tool_use_id": "toolu_019Je2MLfJhpJr93g5igoRAH" - } - ], - "tool_uses": [ - { - "id": "toolu_019Je2MLfJhpJr93g5igoRAH", - "input": { - "path": "zed/crates/language_model_selector/Cargo.toml" - }, - "name": "read_file" - } - ] - }, - { - "context": "", - "creases": [], - "id": 4, - "role": "assistant", - "segments": [ - { - "text": "Now let's look at the fuzzy search implementation in detail:", - "type": "text" - } - ], - "tool_results": [ - { - "content": "struct ModelMatcher {\n models: Vec,\n bg_executor: BackgroundExecutor,\n candidates: Vec,\n}\n\nimpl ModelMatcher {\n fn new(models: Vec, bg_executor: BackgroundExecutor) -> ModelMatcher {\n let candidates = Self::make_match_candidates(&models);\n Self {\n models,\n bg_executor,\n candidates,\n }\n }\n\n pub fn fuzzy_search(&self, query: &str) -> Vec {\n let matches = self.bg_executor.block(match_strings(\n &self.candidates,\n &query,\n false,\n 100,\n &Default::default(),\n self.bg_executor.clone(),\n ));\n\n let matched_models: Vec<_> = matches\n .into_iter()\n .map(|m| self.models[m.candidate_id].clone())\n .collect();\n\n matched_models\n }\n\n pub fn exact_search(&self, query: &str) -> Vec {\n self.models\n .iter()\n .filter(|m| {\n m.model\n .name()\n .0\n .to_lowercase()\n .contains(&query.to_lowercase())\n })\n .cloned()\n .collect::>()\n }\n\n fn make_match_candidates(model_infos: &Vec) -> Vec {\n model_infos\n .iter()\n .enumerate()\n .map(|(index, model)| {\n StringMatchCandidate::new(\n index,\n &format!(\"{}/{}\", &model.model.provider_id().0, &model.model.name().0),\n )\n })\n .collect::>()\n }\n}", - "is_error": false, - "output": null, - "tool_use_id": "toolu_01GpKFEn6hYrXGiS5yBF3urk" - } - ], - "tool_uses": [ - { - "id": "toolu_01GpKFEn6hYrXGiS5yBF3urk", - "input": { - "end_line": 452, - "path": "zed/crates/language_model_selector/src/language_model_selector.rs", - "start_line": 392 - }, - "name": "read_file" - } - ] - }, - { - "context": "", - "creases": [], - "id": 5, - "role": "assistant", - "segments": [ - { - "text": "Let's now check if there's an existing test module:", - "type": "text" - } - ], - "tool_results": [ - { - "content": "No matches found", - "is_error": false, - "output": null, - "tool_use_id": "toolu_01RZAvFvpMq16Dwh6d6cnVzy" - } - ], - "tool_uses": [ - { - "id": "toolu_01RZAvFvpMq16Dwh6d6cnVzy", - "input": { - "include_pattern": "zed/crates/language_model_selector/**/*.rs", - "regex": "mod tests" - }, - "name": "grep" - } - ] - }, - { - "context": "", - "creases": [], - "id": 6, - "role": "assistant", - "segments": [ - { - "text": "Now I'll create a test module for the ModelMatcher's fuzzy search functionality. Let's create this test module:", - "type": "text" - } - ], - "tool_results": [], - "tool_uses": [] - } - ], - "model": { - "model": "claude-3-7-sonnet-latest", - "provider": "zed.dev" - }, - "profile": "Write", - "request_token_usage": [ - { - "cache_creation_input_tokens": 9581, - "input_tokens": 4, - "output_tokens": 5 - }, - { - "cache_creation_input_tokens": 204, - "cache_read_input_tokens": 9581, - "input_tokens": 6, - "output_tokens": 5 - }, - { - "cache_creation_input_tokens": 104, - "cache_read_input_tokens": 9785, - "input_tokens": 6, - "output_tokens": 5 - }, - { - "cache_creation_input_tokens": 239, - "cache_read_input_tokens": 9889, - "input_tokens": 6, - "output_tokens": 5 - }, - { - "cache_creation_input_tokens": 569, - "cache_read_input_tokens": 10128, - "input_tokens": 5, - "output_tokens": 5 - }, - { - "cache_creation_input_tokens": 87, - "cache_read_input_tokens": 10697, - "input_tokens": 5, - "output_tokens": 2 - }, - { - "cache_creation_input_tokens": 7355, - "cache_read_input_tokens": 10784, - "input_tokens": 5, - "output_tokens": 3 - } - ], - "summary": "Fuzzy Search Testing Language Model Selector", - "updated_at": "2025-05-08T18:20:34.205405751Z", - "version": "0.2.0" -} diff --git a/crates/eval/src/examples/tree_sitter_drop_emscripten_dep.toml b/crates/eval/src/examples/tree_sitter_drop_emscripten_dep.toml deleted file mode 100644 index a2846e9d15854a86d5309da04c8ae0f1d01ff6c1..0000000000000000000000000000000000000000 --- a/crates/eval/src/examples/tree_sitter_drop_emscripten_dep.toml +++ /dev/null @@ -1,53 +0,0 @@ -url = "https://github.com/tree-sitter/tree-sitter.git" -revision = "635c49909ce4aa7f58a9375374f91b1b434f6f9c" -language_extension = "rs" - -prompt = """ -Change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten. -Use `ureq` to download the SDK for the current platform and architecture. -Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir. -Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows) -that's inside of the archive. -Don't re-download the SDK if that executable already exists. - -Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{language_name} - -Here are the available wasi-sdk assets: -- wasi-sdk-25.0-x86_64-macos.tar.gz -- wasi-sdk-25.0-arm64-macos.tar.gz -- wasi-sdk-25.0-x86_64-linux.tar.gz -- wasi-sdk-25.0-arm64-linux.tar.gz -- wasi-sdk-25.0-x86_64-linux.tar.gz -- wasi-sdk-25.0-arm64-linux.tar.gz -- wasi-sdk-25.0-x86_64-windows.tar.gz -""" - -[diff_assertions] - -modify_function = """ -The patch modifies the `compile_parser_to_wasm` function, removing logic for running `emscripten`, -and adding logic to download `wasi-sdk`. -""" - -use_listed_assets = """ -The patch implements logic for selecting from the assets listed in the prompt by detecting the -current platform and architecture. -""" - -add_deps = """ -The patch adds a dependency for `ureq` to the Cargo.toml, and adds an import to the top of `loader/lib.rs` -If the patch uses any other dependencies (such as `tar` or `flate2`), it also correctly adds them -to the Cargo.toml and imports them. -""" - -[thread_assertions] - -find_specified_function = """ -The agent finds the specified function `compile_parser_to_wasm` in a logical way. -It does not begin by guessing any paths to files in the codebase, but rather searches for the function by name. -""" - -no_syntax_errors = """ -As it edits the file, the agent never introduces syntax errors. It's ok if there are other compile errors, -but it should not introduce glaring issues like mismatched curly braces. -""" diff --git a/crates/eval/src/explorer.html b/crates/eval/src/explorer.html deleted file mode 100644 index 04c41090d37ef975ce1f4805cde3eaaf433d100a..0000000000000000000000000000000000000000 --- a/crates/eval/src/explorer.html +++ /dev/null @@ -1,949 +0,0 @@ - - - - - - Eval Explorer - - - -

Thread Explorer

-
- - - -
- -
-
-
- -
- Thread 1 of 1: - Default Thread -
- -
- - - - - - - - - - - - -
TurnTextToolResult
- - - - diff --git a/crates/eval/src/explorer.rs b/crates/eval/src/explorer.rs deleted file mode 100644 index 3326070cea4e860210f8ba7e0038fec2f3404c30..0000000000000000000000000000000000000000 --- a/crates/eval/src/explorer.rs +++ /dev/null @@ -1,182 +0,0 @@ -use anyhow::{Context as _, Result}; -use clap::Parser; -use serde_json::{Value, json}; -use std::fs; -use std::path::{Path, PathBuf}; - -#[derive(Parser, Debug)] -#[clap(about = "Generate HTML explorer from JSON thread files")] -struct Args { - /// Paths to JSON files or directories. If a directory is provided, - /// it will be searched for 'last.messages.json' files up to 2 levels deep. - #[clap(long, required = true, num_args = 1..)] - input: Vec, - - /// Path where the output HTML file will be written - #[clap(long)] - output: PathBuf, -} - -/// Recursively finds files with `target_filename` in `dir_path` up to `max_depth`. -#[allow(dead_code)] -fn find_target_files_recursive( - dir_path: &Path, - target_filename: &str, - current_depth: u8, - max_depth: u8, - found_files: &mut Vec, -) -> Result<()> { - if current_depth > max_depth { - return Ok(()); - } - - for entry_result in fs::read_dir(dir_path) - .with_context(|| format!("Failed to read directory: {}", dir_path.display()))? - { - let entry = entry_result.with_context(|| { - format!("Failed to read directory entry in: {}", dir_path.display()) - })?; - let path = entry.path(); - - if path.is_dir() { - find_target_files_recursive( - &path, - target_filename, - current_depth + 1, - max_depth, - found_files, - )?; - } else if path.is_file() - && let Some(filename_osstr) = path.file_name() - && let Some(filename_str) = filename_osstr.to_str() - && filename_str == target_filename - { - found_files.push(path); - } - } - Ok(()) -} - -pub fn generate_explorer_html(input_paths: &[PathBuf], output_path: &PathBuf) -> Result { - if let Some(parent) = output_path.parent() - && !parent.exists() - { - fs::create_dir_all(parent).context(format!( - "Failed to create output directory: {}", - parent.display() - ))?; - } - - let template_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/explorer.html"); - let template_content = fs::read_to_string(&template_path).context(format!( - "Template file not found or couldn't be read: {}", - template_path.display() - ))?; - - if input_paths.is_empty() { - println!( - "No input JSON files found to process. Explorer will be generated with template defaults or empty data." - ); - } - - let threads = input_paths - .iter() - .map(|input_path| { - let file_content = fs::read_to_string(input_path) - .context(format!("Failed to read file: {}", input_path.display()))?; - let mut thread_data: Value = file_content - .parse::() - .context(format!("Failed to parse JSON from file: {}", input_path.display()))?; - - if let Some(obj) = thread_data.as_object_mut() { - obj.insert("filename".to_string(), json!(input_path.display().to_string())); - } else { - eprintln!("Warning: JSON data in {} is not a root object. Wrapping it to include filename.", input_path.display()); - thread_data = json!({ - "original_data": thread_data, - "filename": input_path.display().to_string() - }); - } - Ok(thread_data) - }) - .collect::>>()?; - - let all_threads_data = json!({ "threads": threads }); - let html_content = inject_thread_data(template_content, all_threads_data)?; - fs::write(&output_path, &html_content) - .context(format!("Failed to write output: {}", output_path.display()))?; - - println!( - "Saved data from {} resolved file(s) ({} threads) to {}", - input_paths.len(), - threads.len(), - output_path.display() - ); - Ok(html_content) -} - -fn inject_thread_data(template: String, threads_data: Value) -> Result { - let injection_marker = "let threadsData = window.threadsData || { threads: [dummyThread] };"; - if !template.contains(injection_marker) { - anyhow::bail!( - "Could not find the thread injection point in the template. Expected: '{}'", - injection_marker - ); - } - - let threads_json_string = serde_json::to_string_pretty(&threads_data) - .context("Failed to serialize threads data to JSON")? - .replace("", r"<\/script>"); - - let script_injection = format!("let threadsData = {};", threads_json_string); - let final_html = template.replacen(injection_marker, &script_injection, 1); - - Ok(final_html) -} - -#[cfg(not(any(test, doctest)))] -#[allow(dead_code)] -fn main() -> Result<()> { - let args = Args::parse(); - - const DEFAULT_FILENAME: &str = "last.messages.json"; - const MAX_SEARCH_DEPTH: u8 = 2; - - let mut resolved_input_files: Vec = Vec::new(); - - for input_path_arg in &args.input { - if !input_path_arg.exists() { - eprintln!( - "Warning: Input path {} does not exist. Skipping.", - input_path_arg.display() - ); - continue; - } - - if input_path_arg.is_dir() { - find_target_files_recursive( - input_path_arg, - DEFAULT_FILENAME, - 0, // starting depth - MAX_SEARCH_DEPTH, - &mut resolved_input_files, - ) - .with_context(|| { - format!( - "Error searching for '{}' files in directory: {}", - DEFAULT_FILENAME, - input_path_arg.display() - ) - })?; - } else if input_path_arg.is_file() { - resolved_input_files.push(input_path_arg.clone()); - } - } - - resolved_input_files.sort_unstable(); - resolved_input_files.dedup(); - - println!("No input paths provided/found."); - - generate_explorer_html(&resolved_input_files, &args.output).map(|_| ()) -} diff --git a/crates/eval/src/ids.rs b/crates/eval/src/ids.rs deleted file mode 100644 index 7057344206ba1530db5034fc2ed5d73e52b41382..0000000000000000000000000000000000000000 --- a/crates/eval/src/ids.rs +++ /dev/null @@ -1,29 +0,0 @@ -use anyhow::{Context as _, Result}; -use std::fs; -use std::path::{Path, PathBuf}; -use uuid::Uuid; - -pub fn get_or_create_id(path: &Path) -> Result { - if let Ok(id) = fs::read_to_string(path) { - let trimmed = id.trim(); - if !trimmed.is_empty() { - return Ok(trimmed.to_string()); - } - } - let new_id = Uuid::new_v4().to_string(); - fs::create_dir_all(path.parent().context("invalid id path")?)?; - fs::write(path, &new_id)?; - Ok(new_id) -} - -pub fn eval_system_id_path() -> PathBuf { - dirs::data_local_dir() - .unwrap_or_else(|| PathBuf::from(".")) - .join("zed-eval-system-id") -} - -pub fn eval_installation_id_path() -> PathBuf { - dirs::data_local_dir() - .unwrap_or_else(|| PathBuf::from(".")) - .join("zed-eval-installation-id") -} diff --git a/crates/eval/src/instance.rs b/crates/eval/src/instance.rs deleted file mode 100644 index 54e6ab0b925191c16885b8b8ed89369039c467f6..0000000000000000000000000000000000000000 --- a/crates/eval/src/instance.rs +++ /dev/null @@ -1,1446 +0,0 @@ -use agent::ContextServerRegistry; -use agent_client_protocol as acp; -use anyhow::{Context as _, Result, anyhow, bail}; -use client::proto::LspWorkProgress; -use futures::channel::mpsc; -use futures::future::Shared; -use futures::{FutureExt as _, StreamExt as _, future}; -use gpui::{App, AppContext as _, AsyncApp, Entity, Task}; -use handlebars::Handlebars; -use language::{Buffer, DiagnosticSeverity, OffsetRangeExt as _}; -use language_model::{ - LanguageModel, LanguageModelCompletionEvent, LanguageModelRegistry, LanguageModelRequest, - LanguageModelRequestMessage, LanguageModelToolResultContent, MessageContent, Role, TokenUsage, -}; -use project::{DiagnosticSummary, Project, ProjectPath, lsp_store::OpenLspBufferHandle}; -use prompt_store::{ProjectContext, WorktreeContext}; -use rand::{distr, prelude::*}; -use serde::{Deserialize, Serialize}; -use std::{ - fmt::Write as _, - fs::{self, File}, - io::Write as _, - path::{Path, PathBuf}, - rc::Rc, - sync::{Arc, Mutex}, - time::Duration, -}; -use unindent::Unindent as _; -use util::{ResultExt as _, command::new_command, markdown::MarkdownCodeBlock}; - -use crate::{ - AgentAppState, ToolMetrics, - assertions::{AssertionsReport, RanAssertion, RanAssertionResult}, - example::{Example, ExampleContext, FailedAssertion, JudgeAssertion}, -}; - -pub const ZED_REPO_URL: &str = "https://github.com/zed-industries/zed.git"; - -#[derive(Clone)] -pub struct ExampleInstance { - pub thread: Rc, - pub name: String, - pub run_directory: PathBuf, - pub log_prefix: String, - /// The repetition number for this example (0-based) - /// When running multiple repetitions of the same example, each instance is assigned a unique repetition number. - /// This affects the worktree path and log prefix to avoid clobbering results between runs. - pub repetition: usize, - pub repo_path: PathBuf, - /// Path to the directory containing the requests and responses for the agentic loop - worktrees_dir: PathBuf, -} - -#[derive(Debug, Serialize, Clone)] -pub struct RunOutput { - pub repository_diff: String, - pub diagnostic_summary_before: DiagnosticSummary, - pub diagnostic_summary_after: DiagnosticSummary, - pub diagnostics_before: Option, - pub diagnostics_after: Option, - pub token_usage: TokenUsage, - pub tool_metrics: ToolMetrics, - pub thread_markdown: String, - pub programmatic_assertions: AssertionsReport, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct JudgeDiffInput { - pub repository_diff: String, - pub assertion: String, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct JudgeThreadInput { - pub messages: String, - pub assertion: String, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct JudgeOutput { - pub thread: AssertionsReport, - pub diff: AssertionsReport, -} - -impl ExampleInstance { - pub fn new( - thread: Rc, - repos_dir: &Path, - run_dir: &Path, - worktrees_dir: &Path, - repetition: usize, - ) -> Self { - let name = thread.meta().name; - let run_directory = run_dir.join(&name).join(repetition.to_string()); - - let repo_path = repo_path_for_url(repos_dir, &thread.meta().url); - - Self { - name, - thread, - log_prefix: String::new(), - run_directory, - repetition, - repo_path, - worktrees_dir: worktrees_dir.to_path_buf(), - } - } - - pub fn repo_url(&self) -> String { - self.thread.meta().url - } - - pub fn revision(&self) -> String { - self.thread.meta().revision - } - - pub fn worktree_name(&self) -> String { - format!("{}-{}", self.name, self.repetition) - } - - pub fn set_log_prefix_style(&mut self, color: &str, name_width: usize) { - self.log_prefix = format!( - "{}{: Result<()> { - let meta = self.thread.meta(); - - let revision_exists = run_git( - &self.repo_path, - &["rev-parse", &format!("{}^{{commit}}", &meta.revision)], - ) - .await - .is_ok(); - - if !revision_exists { - println!("{}Fetching revision {}", self.log_prefix, &meta.revision); - run_git( - &self.repo_path, - &["fetch", "--depth", "1", "origin", &meta.revision], - ) - .await?; - } - Ok(()) - } - - /// Set up the example by checking out the specified Git revision - pub async fn setup(&mut self) -> Result<()> { - let worktree_path = self.worktree_path(); - let meta = self.thread.meta(); - if worktree_path.is_dir() { - println!("{}Resetting existing worktree", self.log_prefix); - - // TODO: consider including "-x" to remove ignored files. The downside of this is that - // it will also remove build artifacts, and so prevent incremental reuse there. - run_git(&worktree_path, &["clean", "--force", "-d"]).await?; - run_git(&worktree_path, &["reset", "--hard", "HEAD"]).await?; - run_git(&worktree_path, &["checkout", &meta.revision]).await?; - } else { - println!("{}Creating worktree", self.log_prefix); - - let worktree_path_string = worktree_path.to_string_lossy().into_owned(); - - run_git( - &self.repo_path, - &[ - "worktree", - "add", - "-f", - &worktree_path_string, - &meta.revision, - ], - ) - .await?; - } - - if meta.url == ZED_REPO_URL { - std::fs::write(worktree_path.join(".rules"), std::fs::read(".rules")?)?; - } - - std::fs::create_dir_all(&self.run_directory)?; - - Ok(()) - } - - pub fn worktree_path(&self) -> PathBuf { - self.worktrees_dir - .join(self.worktree_name()) - .join(self.thread.meta().repo_name()) - } - - pub fn run(&self, app_state: Arc, cx: &mut App) -> Task> { - let project = Project::local( - app_state.client.clone(), - app_state.node_runtime.clone(), - app_state.user_store.clone(), - app_state.languages.clone(), - app_state.fs.clone(), - None, - project::LocalProjectFlags { - init_worktree_trust: false, - ..Default::default() - }, - cx, - ); - - let worktree = project.update(cx, |project, cx| { - project.create_worktree(self.worktree_path(), true, cx) - }); - - let meta = self.thread.meta(); - let this = self.clone(); - - cx.spawn(async move |cx| { - let worktree = worktree.await?; - - // Wait for worktree scan to finish before choosing a file to open. - worktree - .update(cx, |worktree, _cx| { - worktree.as_local().unwrap().scan_complete() - }) - .await; - - struct LanguageServerState { - _lsp_open_handle: OpenLspBufferHandle, - language_file_buffer: Entity, - } - - let mut diagnostics_before = None; - let mut diagnostic_summary_before = DiagnosticSummary::default(); - - let lsp = if let Some(language_server) = &meta.language_server { - // Open a file that matches the language to cause LSP to start. - let language_file = worktree - .read_with(cx, |worktree, _cx| { - worktree - .files(false, 0) - .find_map(|e| { - if e.path.clone().extension() - == Some(&language_server.file_extension) - { - Some(ProjectPath { - worktree_id: worktree.id(), - path: e.path.clone(), - }) - } else { - None - } - }) - .context("Failed to find a file for example language") - })?; - - let open_language_file_buffer_task = project.update(cx, |project, cx| { - project.open_buffer(language_file.clone(), cx) - }); - - let language_file_buffer = open_language_file_buffer_task.await?; - - let lsp_open_handle = project.update(cx, |project, cx| { - project.register_buffer_with_language_servers(&language_file_buffer, cx) - }); - - wait_for_lang_server(&project, &language_file_buffer, this.log_prefix.clone(), cx).await?; - - diagnostic_summary_before = project.read_with(cx, |project, cx| { - project.diagnostic_summary(false, cx) - }); - - diagnostics_before = query_lsp_diagnostics(project.clone(), cx).await?; - if diagnostics_before.is_some() && language_server.allow_preexisting_diagnostics { - anyhow::bail!("Example has pre-existing diagnostics. If you want to run this example regardless, set `allow_preexisting_diagnostics` to `true` in `base.toml`"); - } - - Some(LanguageServerState { - _lsp_open_handle: lsp_open_handle, - language_file_buffer, - }) - } else { - None - }; - - anyhow::ensure!(std::env::var("ZED_EVAL_SETUP_ONLY").is_err(), "Setup only mode"); - - let last_diff_file_path = this.run_directory.join("last.diff"); - - // Write an empty "last.diff" so that it can be opened in Zed for convenient view of the - // history using undo/redo. - std::fs::write(&last_diff_file_path, "")?; - - let thread = cx.update(|cx| { - //todo: Do we want to load rules files here? - let worktrees = project.read(cx).visible_worktrees(cx).map(|worktree| { - let root_name = worktree.read(cx).root_name_str().into(); - let abs_path = worktree.read(cx).abs_path(); - - WorktreeContext { - root_name, - abs_path, - rules_file: None, - } - }).collect::>(); - let project_context = cx.new(|_cx| ProjectContext::new(worktrees, vec![])); - let context_server_registry = cx.new(|cx| ContextServerRegistry::new(project.read(cx).context_server_store(), cx)); - - let thread = if let Some(json) = &meta.existing_thread_json { - let session_id = acp::SessionId::new( - rand::rng() - .sample_iter(&distr::Alphanumeric) - .take(7) - .map(char::from) - .collect::(), - ); - - let db_thread = agent::DbThread::from_json(json.as_bytes()).expect("Can't read serialized thread"); - cx.new(|cx| agent::Thread::from_db(session_id, db_thread, project.clone(), project_context, context_server_registry, agent::Templates::new(), cx)) - } else { - cx.new(|cx| agent::Thread::new(project.clone(), project_context, context_server_registry, agent::Templates::new(), None, cx)) - }; - - thread.update(cx, |thread, cx| { - thread.add_default_tools(Rc::new(EvalThreadEnvironment { - project: project.clone(), - }), cx); - thread.set_profile(meta.profile_id.clone(), cx); - thread.set_model( - LanguageModelInterceptor::new( - LanguageModelRegistry::read_global(cx).default_model().expect("Missing model").model.clone(), - this.run_directory.clone(), - last_diff_file_path.clone(), - this.run_directory.join("last.messages.json"), - this.worktree_path(), - this.repo_url(), - ), - cx, - ); - }); - - thread - }); - - let mut example_cx = ExampleContext::new( - meta.clone(), - this.log_prefix.clone(), - thread.clone(), - cx.clone(), - ); - let result = this.thread.conversation(&mut example_cx).await; - - if let Err(err) = result - && !err.is::() { - return Err(err); - } - - println!("{}Stopped", this.log_prefix); - - println!("{}Getting repository diff", this.log_prefix); - let repository_diff = Self::repository_diff(this.worktree_path(), &this.repo_url()).await?; - - std::fs::write(last_diff_file_path, &repository_diff)?; - - - let mut diagnostics_after = None; - let mut diagnostic_summary_after = Default::default(); - - if let Some(language_server_state) = lsp { - wait_for_lang_server(&project, &language_server_state.language_file_buffer, this.log_prefix.clone(), cx).await?; - - println!("{}Getting diagnostics", this.log_prefix); - diagnostics_after = cx - .update(|cx| { - let project = project.clone(); - cx.spawn(async move |cx| query_lsp_diagnostics(project, cx).await) - }) - .await?; - println!("{}Got diagnostics", this.log_prefix); - - diagnostic_summary_after = project.read_with(cx, |project, cx| { - project.diagnostic_summary(false, cx) - }); - - } - - if let Some(diagnostics_before) = &diagnostics_before { - fs::write(this.run_directory.join("diagnostics_before.txt"), diagnostics_before)?; - } - - if let Some(diagnostics_after) = &diagnostics_after { - fs::write(this.run_directory.join("diagnostics_after.txt"), diagnostics_after)?; - } - - Ok(thread.update(cx, |thread, _cx| { - RunOutput { - repository_diff, - diagnostic_summary_before, - diagnostic_summary_after, - diagnostics_before, - diagnostics_after, - token_usage: thread.latest_request_token_usage().unwrap(), - tool_metrics: example_cx.tool_metrics.lock().unwrap().clone(), - thread_markdown: thread.to_markdown(), - programmatic_assertions: example_cx.assertions, - } - })) - }) - } - - async fn repository_diff(repository_path: PathBuf, repository_url: &str) -> Result { - run_git(&repository_path, &["add", "."]).await?; - let mut diff_args = vec!["diff", "--staged"]; - if repository_url == ZED_REPO_URL { - diff_args.push(":(exclude).rules"); - } - run_git(&repository_path, &diff_args).await - } - - pub async fn judge( - &self, - model: Arc, - run_output: &RunOutput, - cx: &AsyncApp, - ) -> JudgeOutput { - let mut output_file = - File::create(self.run_directory.join("judge.md")).expect("failed to create judge.md"); - - let diff_task = self.judge_diff(model.clone(), run_output, cx); - let thread_task = self.judge_thread(model.clone(), run_output, cx); - - let (diff_result, thread_result) = futures::join!(diff_task, thread_task); - - let (diff_response, diff_output) = diff_result; - let (thread_response, thread_output) = thread_result; - - writeln!( - &mut output_file, - "# Judgment\n\n## Thread\n\n{thread_response}\n\n## Diff\n\n{diff_response}", - ) - .log_err(); - - JudgeOutput { - thread: thread_output, - diff: diff_output, - } - } - - async fn judge_diff( - &self, - model: Arc, - run_output: &RunOutput, - cx: &AsyncApp, - ) -> (String, AssertionsReport) { - let diff_assertions = self.thread.diff_assertions(); - - if diff_assertions.is_empty() { - return ( - "No diff assertions".to_string(), - AssertionsReport::default(), - ); - } - - println!("{}Running diff judge", self.log_prefix); - - let judge_diff_prompt = include_str!("judge_diff_prompt.hbs"); - let judge_diff_prompt_name = "judge_diff_prompt"; - let mut hbs = Handlebars::new(); - hbs.register_template_string(judge_diff_prompt_name, judge_diff_prompt) - .unwrap(); - - let to_prompt = |assertion: String| { - hbs.render( - judge_diff_prompt_name, - &JudgeDiffInput { - repository_diff: run_output.repository_diff.clone(), - assertion, - }, - ) - .unwrap() - }; - - let (responses, report) = self - .judge_assertions(model, diff_assertions, to_prompt, cx) - .await; - - println!( - "{}Judge - Diff score: {}%", - self.log_prefix, - report.passed_percentage() - ); - - (responses, report) - } - - async fn judge_thread( - &self, - model: Arc, - run_output: &RunOutput, - cx: &AsyncApp, - ) -> (String, AssertionsReport) { - let thread_assertions = self.thread.thread_assertions(); - - if thread_assertions.is_empty() { - return ( - "No thread assertions".to_string(), - AssertionsReport::default(), - ); - } - - let judge_thread_prompt = include_str!("judge_thread_prompt.hbs"); - let judge_thread_prompt_name = "judge_thread_prompt"; - let mut hbs = Handlebars::new(); - hbs.register_template_string(judge_thread_prompt_name, judge_thread_prompt) - .unwrap(); - - let complete_messages = &run_output.thread_markdown; - let to_prompt = |assertion: String| { - hbs.render( - judge_thread_prompt_name, - &JudgeThreadInput { - messages: complete_messages.clone(), - assertion, - }, - ) - .unwrap() - }; - - let (responses, report) = self - .judge_assertions(model, thread_assertions, to_prompt, cx) - .await; - - println!( - "{}Judge - Thread score: {}%", - self.log_prefix, - report.passed_percentage() - ); - - (responses, report) - } - - async fn judge_assertions( - &self, - model: Arc, - assertions: Vec, - to_prompt: impl Fn(String) -> String, - cx: &AsyncApp, - ) -> (String, AssertionsReport) { - let assertions = assertions.into_iter().map(|assertion| { - let request = LanguageModelRequest { - thread_id: None, - prompt_id: None, - intent: None, - messages: vec![LanguageModelRequestMessage { - role: Role::User, - content: vec![MessageContent::Text(to_prompt(assertion.description))], - cache: false, - reasoning_details: None, - }], - temperature: None, - tools: Vec::new(), - tool_choice: None, - stop: Vec::new(), - thinking_allowed: true, - thinking_effort: None, - speed: None, - }; - - let model = model.clone(); - let log_prefix = self.log_prefix.clone(); - async move { - let response = send_language_model_request(model, request, cx).await; - - let (response, result) = match response { - Ok(response) => ( - response.clone(), - parse_assertion_result(&response).map_err(|err| err.to_string()), - ), - Err(err) => (err.to_string(), Err(err.to_string())), - }; - - if result.is_ok() { - println!("{}✅ {}", log_prefix, assertion.id); - } else { - println!("{}❌ {}", log_prefix, assertion.id); - } - - ( - response, - RanAssertion { - id: assertion.id, - result, - }, - ) - } - }); - - let mut responses = String::new(); - let mut report = AssertionsReport::default(); - - for (response, assertion) in future::join_all(assertions).await { - writeln!(&mut responses, "# {}", assertion.id).unwrap(); - writeln!(&mut responses, "{}\n\n", response).unwrap(); - report.ran.push(assertion); - } - - (responses, report) - } -} - -struct EvalThreadEnvironment { - project: Entity, -} - -struct EvalTerminalHandle { - terminal: Entity, -} - -impl agent::TerminalHandle for EvalTerminalHandle { - fn id(&self, cx: &AsyncApp) -> Result { - Ok(self.terminal.read_with(cx, |term, _cx| term.id().clone())) - } - - fn wait_for_exit(&self, cx: &AsyncApp) -> Result>> { - Ok(self - .terminal - .read_with(cx, |term, _cx| term.wait_for_exit())) - } - - fn current_output(&self, cx: &AsyncApp) -> Result { - Ok(self - .terminal - .read_with(cx, |term, cx| term.current_output(cx))) - } - - fn kill(&self, cx: &AsyncApp) -> Result<()> { - cx.update(|cx| { - self.terminal.update(cx, |terminal, cx| { - terminal.kill(cx); - }); - }); - Ok(()) - } - - fn was_stopped_by_user(&self, cx: &AsyncApp) -> Result { - Ok(self - .terminal - .read_with(cx, |term, _cx| term.was_stopped_by_user())) - } -} - -impl agent::ThreadEnvironment for EvalThreadEnvironment { - fn create_terminal( - &self, - command: String, - cwd: Option, - output_byte_limit: Option, - cx: &mut AsyncApp, - ) -> Task>> { - let project = self.project.clone(); - cx.spawn(async move |cx| { - let language_registry = - project.read_with(cx, |project, _cx| project.languages().clone()); - let id = acp::TerminalId::new(uuid::Uuid::new_v4().to_string()); - let terminal = - acp_thread::create_terminal_entity(command, &[], vec![], cwd.clone(), &project, cx) - .await?; - let terminal = cx.new(|cx| { - acp_thread::Terminal::new( - id, - "", - cwd, - output_byte_limit.map(|limit| limit as usize), - terminal, - language_registry, - cx, - ) - }); - Ok(Rc::new(EvalTerminalHandle { terminal }) as Rc) - }) - } - - fn create_subagent( - &self, - _label: String, - _cx: &mut App, - ) -> Result> { - unimplemented!() - } -} - -struct LanguageModelInterceptor { - model: Arc, - request_count: Arc>, - previous_diff: Arc>, - example_output_dir: PathBuf, - last_diff_file_path: PathBuf, - messages_json_file_path: PathBuf, - repository_path: PathBuf, - repository_url: String, -} - -impl LanguageModelInterceptor { - fn new( - model: Arc, - example_output_dir: PathBuf, - last_diff_file_path: PathBuf, - messages_json_file_path: PathBuf, - repository_path: PathBuf, - repository_url: String, - ) -> Arc { - Arc::new(Self { - model, - request_count: Arc::new(Mutex::new(0)), - previous_diff: Arc::new(Mutex::new("".to_string())), - example_output_dir, - last_diff_file_path, - messages_json_file_path, - repository_path, - repository_url, - }) - } -} - -impl language_model::LanguageModel for LanguageModelInterceptor { - fn id(&self) -> language_model::LanguageModelId { - self.model.id() - } - - fn name(&self) -> language_model::LanguageModelName { - self.model.name() - } - - fn provider_id(&self) -> language_model::LanguageModelProviderId { - self.model.provider_id() - } - - fn provider_name(&self) -> language_model::LanguageModelProviderName { - self.model.provider_name() - } - - fn telemetry_id(&self) -> String { - self.model.telemetry_id() - } - - fn supports_images(&self) -> bool { - self.model.supports_images() - } - - fn supports_tools(&self) -> bool { - self.model.supports_tools() - } - - fn supports_tool_choice(&self, choice: language_model::LanguageModelToolChoice) -> bool { - self.model.supports_tool_choice(choice) - } - - fn max_token_count(&self) -> u64 { - self.model.max_token_count() - } - - fn count_tokens( - &self, - request: LanguageModelRequest, - cx: &App, - ) -> future::BoxFuture<'static, Result> { - self.model.count_tokens(request, cx) - } - - fn stream_completion( - &self, - request: LanguageModelRequest, - cx: &AsyncApp, - ) -> future::BoxFuture< - 'static, - Result< - futures::stream::BoxStream< - 'static, - Result, - >, - language_model::LanguageModelCompletionError, - >, - > { - let stream = self.model.stream_completion(request.clone(), cx); - let request_count = self.request_count.clone(); - let previous_diff = self.previous_diff.clone(); - let example_output_dir = self.example_output_dir.clone(); - let last_diff_file_path = self.last_diff_file_path.clone(); - let messages_json_file_path = self.messages_json_file_path.clone(); - let repository_path = self.repository_path.clone(); - let repository_url = self.repository_url.clone(); - - Box::pin(async move { - let stream = stream.await?; - - let response_events = Arc::new(Mutex::new(Vec::new())); - let request_clone = request.clone(); - - let wrapped_stream = stream.then(move |event| { - let response_events = response_events.clone(); - let request = request_clone.clone(); - let request_count = request_count.clone(); - let previous_diff = previous_diff.clone(); - let example_output_dir = example_output_dir.clone(); - let last_diff_file_path = last_diff_file_path.clone(); - let messages_json_file_path = messages_json_file_path.clone(); - let repository_path = repository_path.clone(); - let repository_url = repository_url.clone(); - - async move { - let event_result = match &event { - Ok(ev) => Ok(ev.clone()), - Err(err) => Err(err.to_string()), - }; - response_events.lock().unwrap().push(event_result); - - let should_execute = matches!( - &event, - Ok(LanguageModelCompletionEvent::Stop { .. }) | Err(_) - ); - - if should_execute { - let current_request_count = { - let mut count = request_count.lock().unwrap(); - *count += 1; - *count - }; - - let messages_file_path = - example_output_dir.join(format!("{current_request_count}.messages.md")); - let diff_file_path = - example_output_dir.join(format!("{current_request_count}.diff")); - let last_messages_file_path = example_output_dir.join("last.messages.md"); - - let collected_events = response_events.lock().unwrap().clone(); - let request_markdown = RequestMarkdown::new(&request); - let response_events_markdown = - response_events_to_markdown(&collected_events); - let dialog = ThreadDialog::new(&request, &collected_events); - let dialog_json = - serde_json::to_string_pretty(&dialog.to_combined_request()) - .unwrap_or_default(); - - let messages = format!( - "{}\n\n{}", - request_markdown.messages, response_events_markdown - ); - fs::write(&messages_file_path, messages.clone()) - .expect("failed to write messages file"); - fs::write(&last_messages_file_path, messages) - .expect("failed to write last messages file"); - fs::write(&messages_json_file_path, dialog_json) - .expect("failed to write last.messages.json"); - - // Get repository diff - let diff_result = - ExampleInstance::repository_diff(repository_path, &repository_url) - .await; - - match diff_result { - Ok(diff) => { - let prev_diff = previous_diff.lock().unwrap().clone(); - if diff != prev_diff { - fs::write(&diff_file_path, &diff) - .expect("failed to write diff file"); - fs::write(&last_diff_file_path, &diff) - .expect("failed to write last diff file"); - *previous_diff.lock().unwrap() = diff; - } - } - Err(err) => { - let error_message = format!("{err:?}"); - fs::write(&diff_file_path, &error_message) - .expect("failed to write diff error to file"); - fs::write(&last_diff_file_path, &error_message) - .expect("failed to write last diff file"); - } - } - - if current_request_count == 1 { - let tools_file_path = example_output_dir.join("tools.md"); - fs::write(tools_file_path, request_markdown.tools) - .expect("failed to write tools file"); - } - } - - event - } - }); - - Ok(Box::pin(wrapped_stream) - as futures::stream::BoxStream< - 'static, - Result< - LanguageModelCompletionEvent, - language_model::LanguageModelCompletionError, - >, - >) - }) - } -} - -pub fn wait_for_lang_server( - project: &Entity, - buffer: &Entity, - log_prefix: String, - cx: &mut AsyncApp, -) -> Task> { - if std::env::var("ZED_EVAL_SKIP_LS").is_ok() { - return Task::ready(Ok(())); - } - - println!("{}⏵ Waiting for language server", log_prefix); - - let (mut tx, mut rx) = mpsc::channel(1); - - let lsp_store = project.read_with(cx, |project, _| project.lsp_store()); - - let has_lang_server = buffer.update(cx, |buffer, cx| { - lsp_store.update(cx, |lsp_store, cx| { - lsp_store - .running_language_servers_for_local_buffer(buffer, cx) - .next() - .is_some() - }) - }); - - if has_lang_server { - project - .update(cx, |project, cx| project.save_buffer(buffer.clone(), cx)) - .detach(); - } - - let subscriptions = - [ - cx.subscribe(&lsp_store, { - let log_prefix = log_prefix.clone(); - move |_, event, _| { - if let project::LspStoreEvent::LanguageServerUpdate { - message: - client::proto::update_language_server::Variant::WorkProgress( - LspWorkProgress { - message: Some(message), - .. - }, - ), - .. - } = event - { - println!("{}⟲ {message}", log_prefix) - } - } - }), - cx.subscribe(project, { - let buffer = buffer.clone(); - move |project, event, cx| match event { - project::Event::LanguageServerAdded(_, _, _) => { - let buffer = buffer.clone(); - project - .update(cx, |project, cx| project.save_buffer(buffer, cx)) - .detach(); - } - project::Event::DiskBasedDiagnosticsFinished { .. } => { - tx.try_send(()).ok(); - } - _ => {} - } - }), - ]; - - cx.spawn(async move |cx| { - let timeout = cx.background_executor().timer(Duration::new(60 * 5, 0)); - let result = futures::select! { - _ = rx.next() => { - println!("{}⚑ Language server idle", log_prefix); - anyhow::Ok(()) - }, - _ = timeout.fuse() => { - anyhow::bail!("LSP wait timed out after 5 minutes"); - } - }; - drop(subscriptions); - result - }) -} - -pub async fn query_lsp_diagnostics( - project: Entity, - cx: &mut AsyncApp, -) -> Result> { - let paths_with_diagnostics = project.update(cx, |project, cx| { - project - .diagnostic_summaries(true, cx) - .filter(|(_, _, summary)| summary.error_count > 0 || summary.warning_count > 0) - .map(|(project_path, _, _)| project_path) - .collect::>() - }); - - if paths_with_diagnostics.is_empty() { - return Ok(None); - } - - let mut output = String::new(); - for project_path in paths_with_diagnostics { - let buffer = project - .update(cx, |project, cx| project.open_buffer(project_path, cx)) - .await?; - let snapshot = buffer.read_with(cx, |buffer, _cx| buffer.snapshot()); - - for (_, group) in snapshot.diagnostic_groups(None) { - let entry = &group.entries[group.primary_ix]; - let range = entry.range.to_point(&snapshot); - let severity = match entry.diagnostic.severity { - DiagnosticSeverity::ERROR => "error", - DiagnosticSeverity::WARNING => "warning", - _ => continue, - }; - - writeln!( - output, - "{} at line {}: {}", - severity, - range.start.row + 1, - entry.diagnostic.message - )?; - } - } - anyhow::Ok(Some(output)) -} - -fn parse_assertion_result(response: &str) -> Result { - let analysis = get_tag("analysis", response)?; - let passed = match get_tag("passed", response)?.to_lowercase().as_str() { - "true" => true, - "false" => false, - value @ _ => bail!("invalid judge `passed` tag: {value}"), - }; - Ok(RanAssertionResult { - analysis: Some(analysis), - passed, - }) -} - -fn get_tag(name: &'static str, response: &str) -> Result { - let start_tag = format!("<{}>", name); - let end_tag = format!("", name); - - let start_ix = response - .find(&start_tag) - .context(format!("{} start tag not found", name))?; - let content_start_ix = start_ix + start_tag.len(); - - let end_ix = content_start_ix - + response[content_start_ix..] - .find(&end_tag) - .context(format!("{} end tag not found", name))?; - - let content = response[content_start_ix..end_ix].trim().unindent(); - - anyhow::Ok(content) -} - -pub fn repo_path_for_url(repos_dir: &Path, repo_url: &str) -> PathBuf { - let repo_name = repo_url - .trim_start_matches("https://") - .replace(|c: char| !c.is_alphanumeric(), "-"); - Path::new(repos_dir).join(repo_name) -} - -pub async fn run_git(repo_path: &Path, args: &[&str]) -> Result { - let output = new_command("git") - .current_dir(repo_path) - .args(args) - .output() - .await?; - - anyhow::ensure!( - output.status.success(), - "`git {}` within `{}` failed with status: {}\nstderr:\n{}\nstdout:\n{}", - args.join(" "), - repo_path.display(), - output.status, - String::from_utf8_lossy(&output.stderr), - String::from_utf8_lossy(&output.stdout), - ); - Ok(String::from_utf8(output.stdout)?.trim().to_string()) -} - -fn push_role(role: &Role, buf: &mut String, assistant_message_number: &mut u32) { - match role { - Role::System => buf.push_str("# ⚙️ SYSTEM\n\n"), - Role::User => buf.push_str("# 👤 USER\n\n"), - Role::Assistant => { - buf.push_str(&format!("# 🤖 ASSISTANT {assistant_message_number}\n\n")); - *assistant_message_number = *assistant_message_number + 1; - } - } -} - -pub async fn send_language_model_request( - model: Arc, - request: LanguageModelRequest, - cx: &AsyncApp, -) -> anyhow::Result { - match model.stream_completion_text(request, cx).await { - Ok(mut stream) => { - let mut full_response = String::new(); - while let Some(chunk_result) = stream.stream.next().await { - match chunk_result { - Ok(chunk_str) => { - full_response.push_str(&chunk_str); - } - Err(err) => { - anyhow::bail!("Error receiving response from language model: {err}"); - } - } - } - Ok(full_response) - } - Err(err) => Err(anyhow!( - "Failed to get response from language model. Error was: {err}" - )), - } -} - -pub struct RequestMarkdown { - pub tools: String, - pub messages: String, -} - -impl RequestMarkdown { - pub fn new(request: &LanguageModelRequest) -> Self { - let mut tools = String::new(); - let mut messages = String::new(); - let mut assistant_message_number: u32 = 1; - - // Print the tools - if !request.tools.is_empty() { - for tool in &request.tools { - write!(&mut tools, "# {}\n\n", tool.name).unwrap(); - write!(&mut tools, "{}\n\n", tool.description).unwrap(); - writeln!( - &mut tools, - "{}", - MarkdownCodeBlock { - tag: "json", - text: &format!("{:#}", tool.input_schema) - } - ) - .unwrap(); - } - } - - // Print the messages - for message in &request.messages { - push_role(&message.role, &mut messages, &mut assistant_message_number); - - for content in &message.content { - match content { - MessageContent::Text(text) => { - messages.push_str(text); - messages.push_str("\n\n"); - } - MessageContent::Image(_) => { - messages.push_str("[IMAGE DATA]\n\n"); - } - MessageContent::Thinking { text, signature } => { - messages.push_str("**Thinking**:\n\n"); - if let Some(sig) = signature { - messages.push_str(&format!("Signature: {}\n\n", sig)); - } - messages.push_str(text); - messages.push_str("\n"); - } - MessageContent::RedactedThinking(items) => { - messages.push_str(&format!( - "**Redacted Thinking**: {} item(s)\n\n", - items.len() - )); - } - MessageContent::ToolUse(tool_use) => { - messages.push_str(&format!( - "**Tool Use**: {} (ID: {})\n", - tool_use.name, tool_use.id - )); - messages.push_str(&format!( - "{}\n", - MarkdownCodeBlock { - tag: "json", - text: &format!("{:#}", tool_use.input) - } - )); - } - MessageContent::ToolResult(tool_result) => { - messages.push_str(&format!( - "**Tool Result**: {} (ID: {})\n\n", - tool_result.tool_name, tool_result.tool_use_id - )); - if tool_result.is_error { - messages.push_str("**ERROR:**\n"); - } - - match &tool_result.content { - LanguageModelToolResultContent::Text(text) => { - writeln!(messages, "{text}\n").ok(); - } - LanguageModelToolResultContent::Image(image) => { - writeln!(messages, "![Image](data:base64,{})\n", image.source).ok(); - } - } - - if let Some(output) = tool_result.output.as_ref() { - writeln!( - messages, - "**Debug Output**:\n\n```json\n{}\n```\n", - serde_json::to_string_pretty(output).unwrap() - ) - .unwrap(); - } - } - } - } - } - - Self { tools, messages } - } -} - -pub fn response_events_to_markdown( - response_events: &[std::result::Result], -) -> String { - let mut response = String::new(); - // Print the response events if any - response.push_str("# Response\n\n"); - let mut text_buffer = String::new(); - let mut thinking_buffer = String::new(); - - let flush_buffers = - |output: &mut String, text_buffer: &mut String, thinking_buffer: &mut String| { - if !text_buffer.is_empty() { - output.push_str(&format!("**Text**:\n{}\n\n", text_buffer)); - text_buffer.clear(); - } - if !thinking_buffer.is_empty() { - output.push_str(&format!("**Thinking**:\n{}\n\n", thinking_buffer)); - thinking_buffer.clear(); - } - }; - - for event in response_events { - match event { - Ok(LanguageModelCompletionEvent::Text(text)) => { - text_buffer.push_str(text); - } - Ok(LanguageModelCompletionEvent::Thinking { text, .. }) => { - thinking_buffer.push_str(text); - } - Ok(LanguageModelCompletionEvent::RedactedThinking { .. }) => {} - Ok(LanguageModelCompletionEvent::Stop(reason)) => { - flush_buffers(&mut response, &mut text_buffer, &mut thinking_buffer); - response.push_str(&format!("**Stop**: {:?}\n\n", reason)); - } - Ok(LanguageModelCompletionEvent::ToolUse(tool_use)) => { - flush_buffers(&mut response, &mut text_buffer, &mut thinking_buffer); - response.push_str(&format!( - "**Tool Use**: {} (ID: {})\n", - tool_use.name, tool_use.id - )); - response.push_str(&format!( - "{}\n", - MarkdownCodeBlock { - tag: "json", - text: &format!("{:#}", tool_use.input) - } - )); - } - Ok( - LanguageModelCompletionEvent::UsageUpdate(_) - | LanguageModelCompletionEvent::StartMessage { .. } - | LanguageModelCompletionEvent::Queued { .. } - | LanguageModelCompletionEvent::Started - | LanguageModelCompletionEvent::ReasoningDetails(_), - ) => {} - Ok(LanguageModelCompletionEvent::ToolUseJsonParseError { - json_parse_error, .. - }) => { - flush_buffers(&mut response, &mut text_buffer, &mut thinking_buffer); - response.push_str(&format!( - "**Error**: parse error in tool use JSON: {}\n\n", - json_parse_error - )); - } - Err(error) => { - flush_buffers(&mut response, &mut text_buffer, &mut thinking_buffer); - response.push_str(&format!("**Error**: {}\n\n", error)); - } - } - } - - flush_buffers(&mut response, &mut text_buffer, &mut thinking_buffer); - - response -} - -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] -pub struct ThreadDialog { - pub request: LanguageModelRequest, - pub response_events: Vec>, -} - -impl ThreadDialog { - pub fn new( - request: &LanguageModelRequest, - response_events: &[std::result::Result], - ) -> Self { - Self { - request: request.clone(), - response_events: response_events.to_vec(), - } - } - - /// Represents all request and response messages in a unified format. - /// - /// Specifically, it appends the assistant's response (derived from response events) - /// as a new message to existing messages in the request. - pub fn to_combined_request(&self) -> LanguageModelRequest { - let mut request = self.request.clone(); - if let Some(assistant_message) = self.response_events_to_message() { - request.messages.push(assistant_message); - } - request - } - fn response_events_to_message(&self) -> Option { - let response_events = &self.response_events; - let mut content: Vec = Vec::new(); - let mut current_text = String::new(); - - let flush_text = |text: &mut String, content: &mut Vec| { - if !text.is_empty() { - content.push(MessageContent::Text(std::mem::take(text))); - } - }; - - for event in response_events { - match event { - Ok(LanguageModelCompletionEvent::Text(text)) => { - current_text.push_str(text); - } - - Ok(LanguageModelCompletionEvent::ToolUse(tool_use)) => { - flush_text(&mut current_text, &mut content); - if tool_use.is_input_complete { - content.push(MessageContent::ToolUse(tool_use.clone())); - } - } - Ok(LanguageModelCompletionEvent::Thinking { text, signature }) => { - flush_text(&mut current_text, &mut content); - content.push(MessageContent::Thinking { - text: text.clone(), - signature: signature.clone(), - }); - } - - // Skip these - Ok(LanguageModelCompletionEvent::UsageUpdate(_)) - | Ok(LanguageModelCompletionEvent::RedactedThinking { .. }) - | Ok(LanguageModelCompletionEvent::StartMessage { .. }) - | Ok(LanguageModelCompletionEvent::ReasoningDetails(_)) - | Ok(LanguageModelCompletionEvent::Stop(_)) - | Ok(LanguageModelCompletionEvent::Queued { .. }) - | Ok(LanguageModelCompletionEvent::Started) => {} - - Ok(LanguageModelCompletionEvent::ToolUseJsonParseError { - json_parse_error, - .. - }) => { - flush_text(&mut current_text, &mut content); - content.push(MessageContent::Text(format!( - "ERROR: parse error in tool use JSON: {}", - json_parse_error - ))); - } - - Err(error) => { - flush_text(&mut current_text, &mut content); - content.push(MessageContent::Text(format!("ERROR: {}", error))); - } - } - } - - flush_text(&mut current_text, &mut content); - - if !content.is_empty() { - Some(LanguageModelRequestMessage { - role: Role::Assistant, - content, - cache: false, - reasoning_details: None, - }) - } else { - None - } - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_parse_judge_output() { - let response = r#" - The model did a good job but there were still compilations errors. - true - "# - .unindent(); - - let output = parse_assertion_result(&response).unwrap(); - assert_eq!( - output.analysis, - Some("The model did a good job but there were still compilations errors.".into()) - ); - assert!(output.passed); - - let response = r#" - Text around ignored - - - Failed to compile: - - Error 1 - - Error 2 - - - false - "# - .unindent(); - - let output = parse_assertion_result(&response).unwrap(); - assert_eq!( - output.analysis, - Some("Failed to compile:\n- Error 1\n- Error 2".into()) - ); - assert!(!output.passed); - } -} diff --git a/crates/eval/src/judge_diff_prompt.hbs b/crates/eval/src/judge_diff_prompt.hbs deleted file mode 100644 index 24ef9ac97e389ab5a3059eead27727343786cb1b..0000000000000000000000000000000000000000 --- a/crates/eval/src/judge_diff_prompt.hbs +++ /dev/null @@ -1,25 +0,0 @@ -You are an expert software developer. Your task is to evaluate a diff produced by an AI agent -in response to a prompt. Here is the prompt and the diff: - - -{{{prompt}}} - - - -{{{repository_diff}}} - - -Evaluate whether or not the diff passes the following assertion: - - -{{assertion}} - - -Analyze the diff hunk by hunk, and structure your answer in the following XML format: - -``` -{YOUR ANALYSIS HERE} -{PASSED_ASSERTION} -``` - -Where `PASSED_ASSERTION` is either `true` or `false`. diff --git a/crates/eval/src/judge_thread_prompt.hbs b/crates/eval/src/judge_thread_prompt.hbs deleted file mode 100644 index e80bafcce1f46ddddb236e572b27f51960a5a223..0000000000000000000000000000000000000000 --- a/crates/eval/src/judge_thread_prompt.hbs +++ /dev/null @@ -1,21 +0,0 @@ -You are an expert software developer. -Your task is to evaluate an AI agent's messages and tool calls in this conversation: - - -{{{messages}}} - - -Evaluate whether or not the sequence of messages passes the following assertion: - - -{{{assertion}}} - - -Analyze the messages one by one, and structure your answer in the following XML format: - -``` -{YOUR ANALYSIS HERE} -{PASSED_ASSERTION} -``` - -Where `PASSED_ASSERTION` is either `true` or `false`. diff --git a/crates/eval/src/tool_metrics.rs b/crates/eval/src/tool_metrics.rs deleted file mode 100644 index 63d8a4f2bc4d1be477a81e92aa2a68683f9d6434..0000000000000000000000000000000000000000 --- a/crates/eval/src/tool_metrics.rs +++ /dev/null @@ -1,106 +0,0 @@ -use collections::HashMap; -use serde::{Deserialize, Serialize}; -use std::{fmt::Display, sync::Arc}; - -#[derive(Debug, Default, Clone, Serialize, Deserialize)] -pub struct ToolMetrics { - pub use_counts: HashMap, u32>, - pub failure_counts: HashMap, u32>, -} - -impl ToolMetrics { - pub fn insert(&mut self, tool_name: Arc, succeeded: bool) { - *self.use_counts.entry(tool_name.clone()).or_insert(0) += 1; - if !succeeded { - *self.failure_counts.entry(tool_name).or_insert(0) += 1; - } - } - - pub fn merge(&mut self, other: &ToolMetrics) { - for (tool_name, use_count) in &other.use_counts { - *self.use_counts.entry(tool_name.clone()).or_insert(0) += use_count; - } - for (tool_name, failure_count) in &other.failure_counts { - *self.failure_counts.entry(tool_name.clone()).or_insert(0) += failure_count; - } - } - - pub fn is_empty(&self) -> bool { - self.use_counts.is_empty() && self.failure_counts.is_empty() - } -} - -impl Display for ToolMetrics { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let mut failure_rates: Vec<(Arc, f64)> = Vec::new(); - - for (tool_name, use_count) in &self.use_counts { - let failure_count = self.failure_counts.get(tool_name).cloned().unwrap_or(0); - if *use_count > 0 { - let failure_rate = failure_count as f64 / *use_count as f64; - failure_rates.push((tool_name.clone(), failure_rate)); - } - } - - // Sort by failure rate descending - failure_rates.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); - - // Table dimensions - let tool_width = 30; - let count_width = 10; - let rate_width = 10; - - // Write table top border - writeln!( - f, - "┌{}┬{}┬{}┬{}┐", - "─".repeat(tool_width), - "─".repeat(count_width), - "─".repeat(count_width), - "─".repeat(rate_width) - )?; - - // Write header row - writeln!( - f, - "│{:^30}│{:^10}│{:^10}│{:^10}│", - "Tool", "Uses", "Failures", "Rate" - )?; - - // Write header-data separator - writeln!( - f, - "├{}┼{}┼{}┼{}┤", - "─".repeat(tool_width), - "─".repeat(count_width), - "─".repeat(count_width), - "─".repeat(rate_width) - )?; - - // Write data rows - for (tool_name, failure_rate) in failure_rates { - let use_count = self.use_counts.get(&tool_name).cloned().unwrap_or(0); - let failure_count = self.failure_counts.get(&tool_name).cloned().unwrap_or(0); - writeln!( - f, - "│{:<30}│{:^10}│{:^10}│{:^10}│", - tool_name, - use_count, - failure_count, - format!("{}%", (failure_rate * 100.0).round()) - )?; - } - - // Write table bottom border - writeln!( - f, - "└{}┴{}┴{}┴{}┘", - "─".repeat(tool_width), - "─".repeat(count_width), - "─".repeat(count_width), - "─".repeat(rate_width) - )?; - - Ok(()) - } -} diff --git a/tooling/xtask/src/tasks/workflows.rs b/tooling/xtask/src/tasks/workflows.rs index 35f053f46666a4d5e81bffe27bc80490c20c166d..414c0b7fd8dc2a99027d8687bcf1d4dbe9c4bb85 100644 --- a/tooling/xtask/src/tasks/workflows.rs +++ b/tooling/xtask/src/tasks/workflows.rs @@ -206,7 +206,6 @@ pub fn run_workflows(args: GenerateWorkflowArgs) -> Result<()> { WorkflowFile::zed(publish_extension_cli::publish_extension_cli), WorkflowFile::zed(release::release), WorkflowFile::zed(release_nightly::release_nightly), - WorkflowFile::zed(run_agent_evals::run_agent_evals), WorkflowFile::zed(run_agent_evals::run_cron_unit_evals), WorkflowFile::zed(run_agent_evals::run_unit_evals), WorkflowFile::zed(run_bundling::run_bundling), diff --git a/tooling/xtask/src/tasks/workflows/run_agent_evals.rs b/tooling/xtask/src/tasks/workflows/run_agent_evals.rs index 521f419d9b317c42a1106ebe8500ccf0a3f494ec..8146552e6567fc336be91e3ad6c0687c441b6604 100644 --- a/tooling/xtask/src/tasks/workflows/run_agent_evals.rs +++ b/tooling/xtask/src/tasks/workflows/run_agent_evals.rs @@ -3,32 +3,10 @@ use serde_json::json; use crate::tasks::workflows::{ runners::{self, Platform}, - steps::{self, FluentBuilder as _, NamedJob, named, setup_cargo_config}, + steps::{self, FluentBuilder as _, NamedJob, named}, vars::{self, WorkflowInput}, }; -pub(crate) fn run_agent_evals() -> Workflow { - let agent_evals = agent_evals(); - let model_name = WorkflowInput::string("model_name", None); - - named::workflow() - .on(Event::default().workflow_dispatch( - WorkflowDispatch::default().add_input(model_name.name, model_name.input()), - )) - .concurrency(vars::one_workflow_per_non_main_branch()) - .add_env(("CARGO_TERM_COLOR", "always")) - .add_env(("CARGO_INCREMENTAL", 0)) - .add_env(("RUST_BACKTRACE", 1)) - .add_env(("ANTHROPIC_API_KEY", vars::ANTHROPIC_API_KEY)) - .add_env(("OPENAI_API_KEY", vars::OPENAI_API_KEY)) - .add_env(("GOOGLE_AI_API_KEY", vars::GOOGLE_AI_API_KEY)) - .add_env(("GOOGLE_CLOUD_PROJECT", vars::GOOGLE_CLOUD_PROJECT)) - .add_env(("ZED_CLIENT_CHECKSUM_SEED", vars::ZED_CLIENT_CHECKSUM_SEED)) - .add_env(("ZED_EVAL_TELEMETRY", 1)) - .add_env(("MODEL_NAME", model_name.to_string())) - .add_job(agent_evals.name, agent_evals.job) -} - pub(crate) fn run_unit_evals() -> Workflow { let model_name = WorkflowInput::string("model_name", None); let commit_sha = WorkflowInput::string("commit_sha", None); @@ -59,29 +37,6 @@ fn add_api_keys(step: Step) -> Step { .add_env(("GOOGLE_CLOUD_PROJECT", vars::GOOGLE_CLOUD_PROJECT)) } -fn agent_evals() -> NamedJob { - fn run_eval() -> Step { - named::bash( - "cargo run --package=eval -- --repetitions=8 --concurrency=1 --model \"${MODEL_NAME}\"", - ) - } - - named::job( - Job::default() - .runs_on(runners::LINUX_DEFAULT) - .timeout_minutes(60_u32 * 10) - .add_step(steps::checkout_repo()) - .add_step(steps::cache_rust_dependencies_namespace()) - .map(steps::install_linux_dependencies) - .add_step(setup_cargo_config(Platform::Linux)) - .add_step(steps::setup_sccache(Platform::Linux)) - .add_step(steps::script("cargo build --package=eval")) - .add_step(add_api_keys(run_eval())) - .add_step(steps::show_sccache_stats(Platform::Linux)) - .add_step(steps::cleanup_cargo_config(Platform::Linux)), - ) -} - pub(crate) fn run_cron_unit_evals() -> Workflow { let unit_evals = cron_unit_evals(); diff --git a/typos.toml b/typos.toml index 6788f14700086e39a394c1bff9418af7666d9b0f..959b5fc6f73477369572cdca3ff95d12b43f5ee1 100644 --- a/typos.toml +++ b/typos.toml @@ -49,8 +49,6 @@ extend-exclude = [ "docs/theme/c15t@*.js", # Spellcheck triggers on `|Fixe[sd]|` regex part. "script/danger/dangerfile.ts", - # Eval examples for prompts and criteria - "crates/eval/src/examples/", # File type extensions are not typos "crates/zed/resources/windows/zed.iss", # typos-cli doesn't understand our `vˇariable` markup