From 2457e27437b355f030793f2085cda51bbc39b642 Mon Sep 17 00:00:00 2001 From: Ben Brandt Date: Fri, 6 Mar 2026 12:12:38 +0100 Subject: [PATCH] eval: Add eval_cli crate (#50922) Very much wip Release Notes: - N/A --- Cargo.lock | 41 ++ Cargo.toml | 1 + crates/eval_cli/.gitignore | 3 + crates/eval_cli/Cargo.toml | 50 +++ crates/eval_cli/Dockerfile | 62 +++ crates/eval_cli/Dockerfile.dockerignore | 21 + crates/eval_cli/LICENSE-GPL | 1 + crates/eval_cli/README.md | 108 +++++ crates/eval_cli/build.rs | 15 + crates/eval_cli/script/build-linux | 57 +++ crates/eval_cli/src/headless.rs | 131 ++++++ crates/eval_cli/src/main.rs | 550 ++++++++++++++++++++++++ crates/eval_cli/zed_eval/__init__.py | 3 + crates/eval_cli/zed_eval/agent.py | 161 +++++++ crates/eval_cli/zed_eval/install.sh.j2 | 49 +++ crates/eval_cli/zed_eval/pyproject.toml | 10 + 16 files changed, 1263 insertions(+) create mode 100644 crates/eval_cli/.gitignore create mode 100644 crates/eval_cli/Cargo.toml create mode 100644 crates/eval_cli/Dockerfile create mode 100644 crates/eval_cli/Dockerfile.dockerignore create mode 120000 crates/eval_cli/LICENSE-GPL create mode 100644 crates/eval_cli/README.md create mode 100644 crates/eval_cli/build.rs create mode 100755 crates/eval_cli/script/build-linux create mode 100644 crates/eval_cli/src/headless.rs create mode 100644 crates/eval_cli/src/main.rs create mode 100644 crates/eval_cli/zed_eval/__init__.py create mode 100644 crates/eval_cli/zed_eval/agent.py create mode 100644 crates/eval_cli/zed_eval/install.sh.j2 create mode 100644 crates/eval_cli/zed_eval/pyproject.toml diff --git a/Cargo.lock b/Cargo.lock index ec376710159b3117bb883ddaa0ba2a4a539293bc..b147a39663d567bee029ed8b6c6694f0c6b41e85 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5892,6 +5892,47 @@ dependencies = [ "watch", ] +[[package]] +name = "eval_cli" +version = "0.1.0" +dependencies = [ + "acp_thread", + "agent", + "agent-client-protocol", + "agent_ui", + "anyhow", + "clap", + "client", + "ctrlc", + 
"debug_adapter_extension", + "env_logger 0.11.8", + "extension", + "feature_flags", + "fs", + "futures 0.3.31", + "gpui", + "gpui_platform", + "gpui_tokio", + "language", + "language_extension", + "language_model", + "language_models", + "languages", + "node_runtime", + "paths", + "project", + "prompt_store", + "release_channel", + "reqwest_client", + "serde", + "serde_json", + "settings", + "shellexpand 2.1.2", + "terminal_view", + "util", + "watch", +] + [[package]] name = "eval_utils" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 497bdd203d958f3ad7d33cd98f5ff1e9b2e34655..597a5f2a207c27154dcf1a55c85d97271604f83f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,6 +66,7 @@ members = [ "crates/encoding_selector", "crates/etw_tracing", "crates/eval", + "crates/eval_cli", "crates/eval_utils", "crates/explorer_command_injector", "crates/extension", diff --git a/crates/eval_cli/.gitignore b/crates/eval_cli/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..083ef6e3d354cb335e59916071199149d11965be --- /dev/null +++ b/crates/eval_cli/.gitignore @@ -0,0 +1,3 @@ +**/jobs +**/*.egg-info +**/__pycache__ diff --git a/crates/eval_cli/Cargo.toml b/crates/eval_cli/Cargo.toml new file mode 100644 index 0000000000000000000000000000000000000000..d8f52992e2ae9512e694bb11c491fd8b60c0c947 --- /dev/null +++ b/crates/eval_cli/Cargo.toml @@ -0,0 +1,50 @@ +[package] +name = "eval_cli" +version = "0.1.0" +publish.workspace = true +edition.workspace = true +license = "GPL-3.0-or-later" + +[lints] +workspace = true + +[[bin]] +name = "eval-cli" +path = "src/main.rs" + +[dependencies] +acp_thread.workspace = true +agent.workspace = true +agent-client-protocol.workspace = true +agent_ui.workspace = true +anyhow.workspace = true +clap.workspace = true +client.workspace = true +ctrlc = { version = "3.5", features = ["termination"] } +debug_adapter_extension.workspace = true +env_logger.workspace = true +extension.workspace = true +feature_flags.workspace 
= true +fs.workspace = true +futures.workspace = true +gpui.workspace = true +gpui_platform.workspace = true +gpui_tokio.workspace = true +language.workspace = true +language_extension.workspace = true +language_model.workspace = true +language_models.workspace = true +languages = { workspace = true, features = ["load-grammars"] } +node_runtime.workspace = true +paths.workspace = true +project.workspace = true +prompt_store.workspace = true +release_channel.workspace = true +reqwest_client.workspace = true +serde.workspace = true +serde_json.workspace = true +settings.workspace = true +shellexpand.workspace = true +terminal_view.workspace = true +util.workspace = true +watch.workspace = true diff --git a/crates/eval_cli/Dockerfile b/crates/eval_cli/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..7b91a7adf991428670fac43ad745a6e9998c9c38 --- /dev/null +++ b/crates/eval_cli/Dockerfile @@ -0,0 +1,62 @@ +# Build eval-cli for Linux. +# +# Usage (from the zed repo root): +# docker build --platform linux/amd64 -f crates/eval_cli/Dockerfile -t eval-cli-builder . +# docker cp "$(docker create eval-cli-builder)":/eval-cli ./target/eval-cli +# +# Or use the helper script: +# crates/eval_cli/script/build-linux + +FROM rust:1.93.1-bookworm AS builder + +WORKDIR /app + +# Install build dependencies (subset of script/linux needed for headless GPUI). +RUN apt-get update && apt-get install -y --no-install-recommends \ + cmake \ + clang \ + g++ \ + libasound2-dev \ + libfontconfig-dev \ + libgit2-dev \ + libglib2.0-dev \ + libssl-dev \ + libwayland-dev \ + libx11-xcb-dev \ + libxkbcommon-x11-dev \ + libzstd-dev \ + libsqlite3-dev \ + build-essential \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Install wild linker for faster linking (built from source to match bookworm's glibc). +RUN cargo install --locked wild-linker --version 0.8.0 --root /usr/local + +# Download WASI SDK (needed by some dependencies). 
+ARG TARGETARCH +RUN mkdir -p /app/target && \ + WASI_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "arm64" || echo "x86_64") && \ + curl -L "https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-25/wasi-sdk-25.0-${WASI_ARCH}-linux.tar.gz" \ + | tar -xz -C /app/target && \ + mv /app/target/wasi-sdk-25.0-${WASI_ARCH}-linux /app/target/wasi-sdk + +# Pre-install the toolchain specified in rust-toolchain.toml so it is cached. +RUN rustup toolchain install 1.93 --profile minimal \ + --component rustfmt --component clippy --component rust-analyzer --component rust-src \ + --target wasm32-wasip2 --target wasm32-unknown-unknown --target x86_64-unknown-linux-musl + +COPY . . + +ENV CC=clang CXX=clang++ +ENV RUSTFLAGS="-C linker=clang -C link-arg=--ld-path=wild" + +RUN --mount=type=cache,target=/usr/local/cargo/registry \ + --mount=type=cache,target=/usr/local/cargo/git \ + --mount=type=cache,target=/app/target \ + cargo build --release --package eval_cli && \ + cp /app/target/release/eval-cli /eval-cli && \ + strip /eval-cli + +FROM scratch +COPY --from=builder /eval-cli /eval-cli diff --git a/crates/eval_cli/Dockerfile.dockerignore b/crates/eval_cli/Dockerfile.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..419f92f9c9b6dad52f04c9ad39e031a7405f2a4b --- /dev/null +++ b/crates/eval_cli/Dockerfile.dockerignore @@ -0,0 +1,21 @@ +.git +.github +**/.gitignore +**/.gitkeep +.gitattributes +.mailmap +**/target +zed.xcworkspace +.DS_Store +compose.yml +plugins/bin +script/node_modules +styles/node_modules +crates/collab/static/styles.css +vendor/bin +assets/themes/ +**/jobs + +**/*.egg-info +**/__pycache__ +**/.venv diff --git a/crates/eval_cli/LICENSE-GPL b/crates/eval_cli/LICENSE-GPL new file mode 120000 index 0000000000000000000000000000000000000000..89e542f750cd3860a0598eff0dc34b56d7336dc4 --- /dev/null +++ b/crates/eval_cli/LICENSE-GPL @@ -0,0 +1 @@ +../../LICENSE-GPL \ No newline at end of file diff --git a/crates/eval_cli/README.md 
b/crates/eval_cli/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a9952bbf4fe1066a78acaad15bfab10d0cee098d --- /dev/null +++ b/crates/eval_cli/README.md @@ -0,0 +1,108 @@ +# eval-cli + +Headless CLI binary for running Zed's agent in evaluation/benchmark +environments. Designed to work inside containerized environments like +[Harbor](https://harborframework.com/) where the repository is already +checked out and API keys are provided via environment variables. + +Uses the same `NativeAgent` + `AcpThread` pipeline as the production Zed +editor — full agentic loop with tool calls, subagents, and retries, just +without a GUI. + +## Building + +### Native (for local testing on the same OS) + +``` +cargo build --release -p eval_cli +``` + +### Cross-compile for Linux x86_64 (from macOS or other hosts) + +Harbor containers run Linux x86_64. Use the Docker-based build script: + +``` +crates/eval_cli/script/build-linux +``` + +This produces `target/eval-cli` (an x86_64 Linux ELF binary). You can +also specify a custom output path: + +``` +crates/eval_cli/script/build-linux --output ~/bin/eval-cli-linux +``` + +## Standalone usage + +``` +eval-cli \ + --workdir /testbed \ + --model anthropic/claude-sonnet-4-6-latest \ + --instruction "Fix the bug described in..." \ + --timeout 600 \ + --output-dir /logs/agent +``` + +Reads API keys from environment variables (`ANTHROPIC_API_KEY`, +`OPENAI_API_KEY`, etc.). Writes `result.json`, `thread.md`, and +`thread.json` to the output directory. + +### Exit codes + +| Code | Meaning | +| ---- | ---------------------------------- | +| 0 | Agent finished | +| 1 | Error (model/auth/runtime failure) | +| 2 | Timeout | +| 3 | Interrupted (SIGTERM/SIGINT) | + +## Harbor integration + +The `zed_eval/` directory contains a Python package that +implements Harbor's `BaseInstalledAgent` interface, allowing eval-cli to +be used with `--agent-import-path` without modifying Harbor's source code. 
+ + ### Setup + + ``` + pip install -e crates/eval_cli/zed_eval/ + ``` + + ### Running with a local binary + + Build for Linux first, then pass the binary path: + + ``` + crates/eval_cli/script/build-linux + + harbor run -d "swebench_verified@latest" \ + --agent-import-path zed_eval.agent:ZedAgent \ + --ak binary_path=target/eval-cli \ + -m anthropic/claude-sonnet-4-6-latest + ``` + + The agent uploads the binary into the container during setup — no + download URL needed during local iteration. + + ### Running with a download URL + + For CI or when the binary is hosted somewhere: + + ``` + harbor run -d "swebench_verified@latest" \ + --agent-import-path zed_eval.agent:ZedAgent \ + --ak download_url=https://example.com/eval-cli \ + -m anthropic/claude-sonnet-4-6-latest + ``` + + ### Setting a timeout + + Pass `EVAL_CLI_TIMEOUT` via `--ae`: + + ``` + harbor run -d "swebench_verified@latest" \ + --agent-import-path zed_eval.agent:ZedAgent \ + --ak binary_path=target/eval-cli \ + --ae EVAL_CLI_TIMEOUT=600 \ + -m anthropic/claude-sonnet-4-6-latest + ``` diff --git a/crates/eval_cli/build.rs b/crates/eval_cli/build.rs new file mode 100644 index 0000000000000000000000000000000000000000..0180e9036fbd049ba5a9e5b455ec1c017cd700e3 --- /dev/null +++ b/crates/eval_cli/build.rs @@ -0,0 +1,15 @@ +fn main() { + let cargo_toml = + std::fs::read_to_string("../zed/Cargo.toml").expect("Failed to read crates/zed/Cargo.toml"); + let version = cargo_toml + .lines() + .find(|line| line.starts_with("version = ")) + .expect("Version not found in crates/zed/Cargo.toml") + .split('=') + .nth(1) + .expect("Invalid version format") + .trim() + .trim_matches('"'); + println!("cargo:rerun-if-changed=../zed/Cargo.toml"); + println!("cargo:rustc-env=ZED_PKG_VERSION={}", version); +} diff --git a/crates/eval_cli/script/build-linux b/crates/eval_cli/script/build-linux new file mode 100755 index 0000000000000000000000000000000000000000..9c710668de2aa5e956efff727e6ef8eb2c5ed627 --- /dev/null +++ 
b/crates/eval_cli/script/build-linux @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# +# Build eval-cli for x86_64 Linux from any host (macOS, Linux, etc.) +# using Docker. The resulting binary is placed at the path printed on +# completion (default: target/eval-cli). +# +# Usage: +# crates/eval_cli/script/build-linux [--output PATH] +# +# Examples: +# crates/eval_cli/script/build-linux +# crates/eval_cli/script/build-linux --output ~/bin/eval-cli +# +# Prerequisites: Docker must be installed and running. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" +OUTPUT="${REPO_ROOT}/target/eval-cli" + +while [[ $# -gt 0 ]]; do + case $1 in + --output) + OUTPUT="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" >&2 + exit 1 + ;; + esac +done + +cd "$REPO_ROOT" + +IMAGE_TAG="eval-cli-builder" + +echo "Building eval-cli for x86_64-unknown-linux-gnu..." +echo " Repo root: $REPO_ROOT" +echo " Output: $OUTPUT" +echo "" + +docker build \ + --platform linux/amd64 \ + -f crates/eval_cli/Dockerfile \ + -t "$IMAGE_TAG" \ + . 
+ +CONTAINER_ID=$(docker create "$IMAGE_TAG" /eval-cli) +mkdir -p "$(dirname "$OUTPUT")" +docker cp "$CONTAINER_ID":/eval-cli "$OUTPUT" +docker rm "$CONTAINER_ID" > /dev/null + +echo "" +echo "Built successfully: $OUTPUT" +echo " $(file "$OUTPUT")" diff --git a/crates/eval_cli/src/headless.rs b/crates/eval_cli/src/headless.rs new file mode 100644 index 0000000000000000000000000000000000000000..1448cbeb7a724b2b4dfdb1cbba430dcc3cdfd5b5 --- /dev/null +++ b/crates/eval_cli/src/headless.rs @@ -0,0 +1,131 @@ +use std::path::PathBuf; +use std::sync::Arc; + +use client::{Client, ProxySettings, UserStore}; +use extension::ExtensionHostProxy; +use fs::RealFs; +use gpui::http_client::read_proxy_from_env; +use gpui::{App, AppContext as _, Entity}; +use gpui_tokio::Tokio; +use language::LanguageRegistry; +use language_extension::LspAccess; +use node_runtime::{NodeBinaryOptions, NodeRuntime}; +use project::project_settings::ProjectSettings; +use prompt_store::PromptBuilder; +use release_channel::{AppCommitSha, AppVersion}; +use reqwest_client::ReqwestClient; +use settings::{Settings, SettingsStore}; +use util::ResultExt as _; + +pub struct AgentCliAppState { + pub languages: Arc, + pub client: Arc, + pub user_store: Entity, + pub fs: Arc, + pub node_runtime: NodeRuntime, +} + +pub fn init(cx: &mut App) -> Arc { + let app_commit_sha = option_env!("ZED_COMMIT_SHA").map(|s| AppCommitSha::new(s.to_owned())); + + let app_version = AppVersion::load( + env!("ZED_PKG_VERSION"), + option_env!("ZED_BUILD_ID"), + app_commit_sha, + ); + + release_channel::init(app_version.clone(), cx); + gpui_tokio::init(cx); + + let settings_store = SettingsStore::new(cx, &settings::default_settings()); + cx.set_global(settings_store); + + let user_agent = format!( + "Zed Agent CLI/{} ({}; {})", + app_version, + std::env::consts::OS, + std::env::consts::ARCH + ); + let proxy_str = ProxySettings::get_global(cx).proxy.to_owned(); + let proxy_url = proxy_str + .as_ref() + .and_then(|input| input.parse().ok()) 
+ .or_else(read_proxy_from_env); + let http = { + let _guard = Tokio::handle(cx).enter(); + ReqwestClient::proxy_and_user_agent(proxy_url, &user_agent) + .expect("could not start HTTP client") + }; + cx.set_http_client(Arc::new(http)); + + let client = Client::production(cx); + cx.set_http_client(client.http_client()); + + let git_binary_path = None; + let fs = Arc::new(RealFs::new( + git_binary_path, + cx.background_executor().clone(), + )); + + let mut languages = LanguageRegistry::new(cx.background_executor().clone()); + languages.set_language_server_download_dir(paths::languages_dir().clone()); + let languages = Arc::new(languages); + + let user_store = cx.new(|cx| UserStore::new(client.clone(), cx)); + + extension::init(cx); + + let (mut node_options_tx, node_options_rx) = watch::channel(None); + cx.observe_global::(move |cx| { + let settings = &ProjectSettings::get_global(cx).node; + let options = NodeBinaryOptions { + allow_path_lookup: !settings.ignore_system_version, + allow_binary_download: true, + use_paths: settings.path.as_ref().map(|node_path| { + let node_path = PathBuf::from(shellexpand::tilde(node_path).as_ref()); + let npm_path = settings + .npm_path + .as_ref() + .map(|path| PathBuf::from(shellexpand::tilde(&path).as_ref())); + ( + node_path.clone(), + npm_path.unwrap_or_else(|| { + let base_path = PathBuf::new(); + node_path.parent().unwrap_or(&base_path).join("npm") + }), + ) + }), + }; + node_options_tx.send(Some(options)).log_err(); + }) + .detach(); + let node_runtime = NodeRuntime::new(client.http_client(), None, node_options_rx); + + let extension_host_proxy = ExtensionHostProxy::global(cx); + debug_adapter_extension::init(extension_host_proxy.clone(), cx); + language_extension::init(LspAccess::Noop, extension_host_proxy, languages.clone()); + language_model::init(client.clone(), cx); + language_models::init(user_store.clone(), client.clone(), cx); + languages::init(languages.clone(), fs.clone(), node_runtime.clone(), cx); + 
prompt_store::init(cx); + terminal_view::init(cx); + + let stdout_is_a_pty = false; + let prompt_builder = PromptBuilder::load(fs.clone(), stdout_is_a_pty, cx); + agent_ui::init( + fs.clone(), + client.clone(), + prompt_builder, + languages.clone(), + true, + cx, + ); + + Arc::new(AgentCliAppState { + languages, + client, + user_store, + fs, + node_runtime, + }) +} diff --git a/crates/eval_cli/src/main.rs b/crates/eval_cli/src/main.rs new file mode 100644 index 0000000000000000000000000000000000000000..0f8dbed7ba12cee934e7631dc7068c83db1dc293 --- /dev/null +++ b/crates/eval_cli/src/main.rs @@ -0,0 +1,550 @@ +//! Headless CLI binary for running Zed's agent in evaluation/benchmark environments. +//! +//! Designed to work inside containerized environments (like Harbor/termbench) where: +//! - The repository is already checked out at the working directory +//! - The model API key is provided via environment variables +//! - Results are written to an output directory (default: `/logs/agent/`) +//! +//! ## Usage +//! +//! ```text +//! eval-cli --workdir /testbed --model anthropic/claude-sonnet-4-6-latest \ +//! --instruction "Fix the bug described in..." --timeout 600 +//! ``` +//! +//! ## Output +//! +//! Writes to `--output-dir` (default `/logs/agent/`): +//! - `result.json` — structured result with status, timing, and token usage +//! - `thread.md` — full conversation as markdown +//! - `thread.json` — raw thread state as JSON +//! +//! ## Exit codes +//! +//! | Code | Meaning | +//! |------|---------| +//! | 0 | Agent finished | +//! | 1 | Error (model/auth/runtime failure) | +//! | 2 | Timeout | +//! 
| 3 | Interrupted (SIGTERM/SIGINT) | + +mod headless; + +use std::path::PathBuf; +use std::process; +use std::rc::Rc; +use std::str::FromStr; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::{Duration, Instant}; + +use acp_thread::AgentConnection as _; +use agent::{NativeAgent, NativeAgentConnection, Templates, ThreadStore}; +use agent_client_protocol as acp; +use anyhow::{Context, Result}; +use clap::Parser; +use feature_flags::FeatureFlagAppExt as _; + +use futures::{FutureExt, select_biased}; +use gpui::{AppContext as _, AsyncApp, Entity, UpdateGlobal}; +use language_model::{LanguageModelRegistry, SelectedModel}; +use project::Project; +use settings::SettingsStore; + +use crate::headless::AgentCliAppState; + +#[derive(Parser, Debug)] +#[command( + name = "eval-cli", + about = "Run Zed's agent headlessly in evaluation/benchmark environments" +)] +struct Args { + /// Output current environment variables as JSON to stdout. + /// Used internally by Zed's shell environment capture. + #[arg(long, hide = true)] + printenv: bool, + + /// Path to the repository working directory. Defaults to the current directory. + #[arg(long, default_value = ".")] + workdir: PathBuf, + + /// Instruction/prompt text. If omitted, read from --instruction-file or stdin. + #[arg(long)] + instruction: Option, + + /// Language model to use, in `provider/model` format. + #[arg(long, default_value = "anthropic/claude-sonnet-4-6-latest")] + model: String, + + /// Maximum wall-clock time in seconds for the agent run. + #[arg(long)] + timeout: Option, + + /// Directory for output artifacts (result.json, thread.md, thread.json). 
+ #[arg(long, default_value = "/logs/agent")] + output_dir: PathBuf, +} + +enum AgentOutcome { + Completed, + Timeout { seconds: u64 }, + Interrupted, +} + +#[derive(serde::Serialize)] +struct EvalResult { + status: String, + #[serde(skip_serializing_if = "Option::is_none")] + error: Option, + duration_secs: f64, + #[serde(skip_serializing_if = "Option::is_none")] + timeout_secs: Option, + model: String, + #[serde(skip_serializing_if = "Option::is_none")] + input_tokens: Option, + #[serde(skip_serializing_if = "Option::is_none")] + output_tokens: Option, + #[serde(skip_serializing_if = "Option::is_none")] + cache_creation_input_tokens: Option, + #[serde(skip_serializing_if = "Option::is_none")] + cache_read_input_tokens: Option, +} + +const EXIT_OK: i32 = 0; +const EXIT_ERROR: i32 = 1; +const EXIT_TIMEOUT: i32 = 2; +const EXIT_INTERRUPTED: i32 = 3; + +static TERMINATED: AtomicBool = AtomicBool::new(false); + +fn main() { + let args = Args::parse(); + + if args.printenv { + util::shell_env::print_env(); + return; + } + + env_logger::init(); + + ctrlc::set_handler(|| { + TERMINATED.store(true, Ordering::SeqCst); + }) + .expect("failed to set signal handler"); + + let instruction = read_instruction(&args).unwrap_or_else(|e| { + eprintln!("Error reading instruction: {e}"); + process::exit(EXIT_ERROR); + }); + + let workdir = args.workdir.canonicalize().unwrap_or_else(|e| { + eprintln!("Invalid --workdir {:?}: {e}", args.workdir); + process::exit(EXIT_ERROR); + }); + + let output_dir = args.output_dir.clone(); + if let Err(e) = std::fs::create_dir_all(&output_dir) { + eprintln!("Error creating output dir {}: {e}", output_dir.display()); + process::exit(EXIT_ERROR); + } + + let http_client = Arc::new(reqwest_client::ReqwestClient::new()); + let app = gpui_platform::headless().with_http_client(http_client); + + app.run(move |cx| { + let app_state = headless::init(cx); + cx.set_staff(true); + + let auth_tasks = LanguageModelRegistry::global(cx).update(cx, |registry, cx| { 
+ registry + .providers() + .iter() + .map(|p| p.authenticate(cx)) + .collect::>() + }); + + let model_name = args.model.clone(); + let timeout = args.timeout; + + cx.spawn(async move |cx| { + futures::future::join_all(auth_tasks).await; + + let start = Instant::now(); + + let (outcome, token_usage) = run_agent( + &app_state, + &workdir, + &instruction, + &model_name, + timeout, + Some(&output_dir), + cx, + ) + .await; + + let duration = start.elapsed(); + + let (status, error, exit_code) = match &outcome { + Ok(AgentOutcome::Completed) => ("completed".to_string(), None, EXIT_OK), + Ok(AgentOutcome::Timeout { seconds }) => { + eprintln!("Timeout: agent exceeded {seconds}s time limit"); + ("timeout".to_string(), None, EXIT_TIMEOUT) + } + Ok(AgentOutcome::Interrupted) => { + eprintln!("Interrupted: received SIGTERM, saved partial output"); + ("interrupted".to_string(), None, EXIT_INTERRUPTED) + } + Err(e) => { + eprintln!("Error: {e:#}"); + ("error".to_string(), Some(format!("{e:#}")), EXIT_ERROR) + } + }; + + let result = EvalResult { + status, + error, + duration_secs: duration.as_secs_f64(), + timeout_secs: timeout, + model: model_name.clone(), + input_tokens: token_usage.as_ref().map(|u| u.input_tokens), + output_tokens: token_usage.as_ref().map(|u| u.output_tokens), + cache_creation_input_tokens: token_usage + .as_ref() + .filter(|u| u.cache_creation_input_tokens > 0) + .map(|u| u.cache_creation_input_tokens), + cache_read_input_tokens: token_usage + .as_ref() + .filter(|u| u.cache_read_input_tokens > 0) + .map(|u| u.cache_read_input_tokens), + }; + + match serde_json::to_string_pretty(&result) { + Ok(json) => { + if let Err(e) = std::fs::write(output_dir.join("result.json"), &json) { + eprintln!("Error writing result.json: {e:#}"); + } + eprintln!("[eval-cli] result: {json}"); + } + Err(e) => eprintln!("Error serializing result: {e:#}"), + } + + cx.update(|cx| cx.quit()); + process::exit(exit_code); + }) + .detach(); + }); +} + +fn read_instruction(args: &Args) 
-> Result { + let text = if let Some(text) = &args.instruction { + text.clone() + } else { + use std::io::Read; + let mut buf = String::new(); + std::io::stdin() + .read_to_string(&mut buf) + .context("reading instruction from stdin")?; + buf + }; + anyhow::ensure!(!text.trim().is_empty(), "instruction is empty"); + Ok(text) +} + +async fn run_agent( + app_state: &Arc, + workdir: &std::path::Path, + instruction: &str, + model_name: &str, + timeout: Option, + output_dir: Option<&std::path::Path>, + cx: &mut AsyncApp, +) -> (Result, Option) { + let setup_result: Result<()> = cx.update(|cx| { + let selected = SelectedModel::from_str(model_name).map_err(|e| anyhow::anyhow!("{e}"))?; + let registry = LanguageModelRegistry::global(cx); + let model = registry + .read(cx) + .available_models(cx) + .find(|m| m.id() == selected.model && m.provider_id() == selected.provider) + .ok_or_else(|| { + let available = registry + .read(cx) + .available_models(cx) + .map(|m| format!("{}/{}", m.provider_id().0, m.id().0)) + .collect::>() + .join(", "); + anyhow::anyhow!("Model {model_name} not found. 
Available: {available}") + })?; + + let supports_thinking = model.supports_thinking(); + + registry.update(cx, |registry, cx| { + registry.set_default_model( + Some(language_model::ConfiguredModel { + provider: registry + .provider(&model.provider_id()) + .context("Provider not found")?, + model, + }), + cx, + ); + anyhow::Ok(()) + })?; + + let (enable_thinking, effort) = if supports_thinking { + (true, "\"high\"") + } else { + (false, "null") + }; + let provider_id = selected.provider.0.to_string(); + let model_id = selected.model.0.to_string(); + SettingsStore::update_global(cx, |store, cx| { + let settings = format!( + r#"{{ + "agent": {{ + "tool_permissions": {{"default": "allow"}}, + "default_model": {{ + "provider": "{provider_id}", + "model": "{model_id}", + "enable_thinking": {enable_thinking}, + "effort": {effort} + }} + }}, + "autosave": "off", + "format_on_save": "off" + }} + "# + ); + store.set_user_settings(&settings, cx).ok(); + }); + + anyhow::Ok(()) + }); + + if let Err(e) = setup_result { + return (Err(e), None); + } + + let project = cx.update(|cx| { + Project::local( + app_state.client.clone(), + app_state.node_runtime.clone(), + app_state.user_store.clone(), + app_state.languages.clone(), + app_state.fs.clone(), + None, + project::LocalProjectFlags { + init_worktree_trust: false, + ..Default::default() + }, + cx, + ) + }); + + let worktree = project.update(cx, |project, cx| project.create_worktree(workdir, true, cx)); + let worktree = match worktree.await { + Ok(w) => w, + Err(e) => return (Err(e).context("creating worktree"), None), + }; + + let scan_result = worktree.update(cx, |tree, _cx| { + tree.as_local() + .context("expected local worktree") + .map(|local| local.scan_complete()) + }); + match scan_result { + Ok(future) => future.await, + Err(e) => return (Err(e), None), + }; + + let thread_store = cx.new(|cx| ThreadStore::new(cx)); + let agent = match NativeAgent::new( + project.clone(), + thread_store, + Templates::new(), + None, + 
app_state.fs.clone(), + cx, + ) + .await + { + Ok(a) => a, + Err(e) => return (Err(e).context("creating agent"), None), + }; + + let connection = Rc::new(NativeAgentConnection(agent.clone())); + let acp_thread = match cx + .update(|cx| connection.clone().new_session(project, workdir, cx)) + .await + { + Ok(t) => t, + Err(e) => return (Err(e).context("creating ACP session"), None), + }; + + let _subscription = cx.subscribe(&acp_thread, |acp_thread, event, cx| { + log_acp_thread_event(&acp_thread, event, cx); + }); + + let message = vec![acp::ContentBlock::Text(acp::TextContent::new( + instruction.to_string(), + ))]; + + let send_future = acp_thread.update(cx, |acp_thread: &mut acp_thread::AcpThread, cx| { + acp_thread.send(message, cx) + }); + + let timeout_future = if let Some(timeout_secs) = timeout { + futures::future::Either::Left( + cx.background_executor() + .timer(Duration::from_secs(timeout_secs)), + ) + } else { + futures::future::Either::Right(futures::future::pending::<()>()) + }; + + let sigterm_future = { + let executor = cx.background_executor().clone(); + async move { + while !TERMINATED.load(Ordering::Relaxed) { + executor.timer(Duration::from_millis(100)).await; + } + } + }; + + let outcome = select_biased! 
{ + result = send_future.fuse() => match result { + Ok(Some(response)) => { + eprintln!("[eval-cli] stopped: {:?}", response.stop_reason); + if response.stop_reason == acp::StopReason::MaxTokens { + Err(anyhow::anyhow!("Model hit maximum token limit")) + } else { + Ok(AgentOutcome::Completed) + } + } + Ok(None) => { + eprintln!("[eval-cli] completed (no response)"); + Ok(AgentOutcome::Completed) + } + Err(e) => Err(e).context("agent run failed"), + }, + _ = sigterm_future.fuse() => { + eprintln!("[eval-cli] received SIGTERM, cancelling..."); + acp_thread.update(cx, |t: &mut acp_thread::AcpThread, cx| t.cancel(cx)).await; + Ok(AgentOutcome::Interrupted) + }, + _ = timeout_future.fuse() => { + acp_thread.update(cx, |t: &mut acp_thread::AcpThread, cx| t.cancel(cx)).await; + Ok(AgentOutcome::Timeout { seconds: timeout.unwrap_or(0) }) + } + }; + + let thread = cx.update(|cx| { + let session_id = acp_thread.read(cx).session_id().clone(); + connection.thread(&session_id, cx) + }); + + let cumulative_usage = if let Some(thread) = &thread { + let db_thread = thread.read_with(cx, |thread, cx| thread.to_db(cx)); + let db_thread = db_thread.await; + let usage = db_thread.cumulative_token_usage; + if usage.input_tokens > 0 || usage.output_tokens > 0 { + Some(usage) + } else { + None + } + } else { + None + }; + + let acp_usage = cx.update(|cx| { + acp_thread + .read(cx) + .token_usage() + .map(|usage| language_model::TokenUsage { + input_tokens: usage.input_tokens, + output_tokens: usage.output_tokens, + ..Default::default() + }) + }); + + let final_usage = cumulative_usage.or(acp_usage); + + if let (Some(thread), Some(dir)) = (&thread, output_dir) { + let markdown = thread.read_with(cx, |thread, _cx| thread.to_markdown()); + if let Err(e) = std::fs::write(dir.join("thread.md"), markdown) { + eprintln!("Error writing thread.md: {e:#}"); + } + + let db_thread = thread.read_with(cx, |thread, cx| thread.to_db(cx)); + let db_thread = db_thread.await; + match 
serde_json::to_string_pretty(&db_thread) { + Ok(json) => { + if let Err(e) = std::fs::write(dir.join("thread.json"), json) { + eprintln!("Error writing thread.json: {e:#}"); + } + } + Err(e) => eprintln!("Error serializing thread.json: {e:#}"), + } + } + + (outcome, final_usage) +} + +fn log_acp_thread_event( + acp_thread: &Entity, + event: &acp_thread::AcpThreadEvent, + cx: &mut gpui::App, +) { + match event { + acp_thread::AcpThreadEvent::NewEntry => { + let entries = acp_thread.read(cx).entries(); + if let Some(acp_thread::AgentThreadEntry::AssistantMessage(message)) = entries.last() { + for chunk in &message.chunks { + if let acp_thread::AssistantMessageChunk::Message { block } = chunk { + if let acp_thread::ContentBlock::Markdown { markdown } = block { + let text = markdown.read(cx).source().to_string(); + if !text.is_empty() { + eprint!("{text}"); + } + } + } + } + } + } + acp_thread::AcpThreadEvent::EntryUpdated(index) => { + let entries = acp_thread.read(cx).entries(); + if let Some(acp_thread::AgentThreadEntry::ToolCall(tool_call)) = entries.get(*index) { + if let Some(name) = &tool_call.tool_name { + match &tool_call.status { + acp_thread::ToolCallStatus::Completed => { + eprintln!("[tool] {name} ✓"); + } + acp_thread::ToolCallStatus::Failed => { + eprintln!("[tool] {name} ✗"); + } + acp_thread::ToolCallStatus::Rejected => { + eprintln!("[tool] {name} rejected"); + } + acp_thread::ToolCallStatus::Canceled => { + eprintln!("[tool] {name} canceled"); + } + _ => {} + } + } + } + } + acp_thread::AcpThreadEvent::Stopped(reason) => { + eprintln!("\n[eval-cli] stopped: {reason:?}"); + } + acp_thread::AcpThreadEvent::Error => { + eprintln!("[eval-cli] error event"); + } + acp_thread::AcpThreadEvent::Retry(status) => { + eprintln!("[eval-cli] retry: {status:?}"); + } + acp_thread::AcpThreadEvent::SubagentSpawned(session_id) => { + eprintln!("[eval-cli] subagent spawned: {session_id}"); + } + _ => {} + } +} diff --git a/crates/eval_cli/zed_eval/__init__.py 
b/crates/eval_cli/zed_eval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8cf07a06883a70660eb4bb3ca5a20ae304e6871b --- /dev/null +++ b/crates/eval_cli/zed_eval/__init__.py @@ -0,0 +1,3 @@ +from zed_eval.agent import ZedAgent + +__all__ = ["ZedAgent"] diff --git a/crates/eval_cli/zed_eval/agent.py b/crates/eval_cli/zed_eval/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..6214ff18d784dd9620f404a00ba1b48ce96b5707 --- /dev/null +++ b/crates/eval_cli/zed_eval/agent.py @@ -0,0 +1,161 @@ +"""Harbor agent wrapper for Zed's eval-cli binary. + +Usage: + # Build eval-cli locally first: + cargo build --release -p eval_cli + + # Run via Harbor with a local binary: + harbor run -d "dataset@version" \ + --agent-import-path zed_eval.agent:ZedAgent \ + --ae binary_path=/path/to/target/release/eval-cli \ + --agent-model anthropic/claude-sonnet-4-6-latest + + # Or with a download URL (for CI): + harbor run -d "dataset@version" \ + --agent-import-path zed_eval.agent:ZedAgent \ + --ae download_url=https://example.com/eval-cli \ + --agent-model anthropic/claude-sonnet-4-6-latest +""" + +import json +import os +import shlex +from pathlib import Path + +from harbor.agents.installed.base import BaseInstalledAgent, ExecInput +from harbor.environments.base import BaseEnvironment +from harbor.models.agent.context import AgentContext + + +class ZedAgent(BaseInstalledAgent): + """Runs Zed's headless AI agent (eval-cli) to solve tasks. + + The eval-cli binary boots a headless GPUI application and uses the same + NativeAgent + AcpThread pipeline as the production Zed editor, driving + the full agentic loop (tool calls, subagents, retries) without a GUI. 
+ """ + + def __init__( + self, + logs_dir: Path, + binary_path: str | None = None, + download_url: str | None = None, + *args, + **kwargs, + ): + super().__init__(logs_dir, *args, **kwargs) + self._binary_path = binary_path + self._download_url = download_url or os.environ.get("EVAL_CLI_DOWNLOAD_URL") + + @staticmethod + def name() -> str: + return "zed" + + @property + def _install_agent_template_path(self) -> Path: + return Path(__file__).parent / "install.sh.j2" + + async def setup(self, environment: BaseEnvironment) -> None: + await environment.exec(command="mkdir -p /installed-agent") + + if self._binary_path: + binary = Path(self._binary_path) + if not binary.exists(): + raise FileNotFoundError( + f"eval-cli binary not found at {binary}. " + "Build it with: cargo build --release -p eval_cli" + ) + await environment.upload_file( + source_path=binary, + target_path="/usr/local/bin/eval-cli", + ) + await environment.exec(command="chmod +x /usr/local/bin/eval-cli") + + await super().setup(environment) + + @property + def _template_variables(self) -> dict[str, str]: + variables = super()._template_variables + if self._binary_path: + variables["binary_uploaded"] = "true" + if self._download_url: + variables["download_url"] = self._download_url + return variables + + def populate_context_post_run(self, context: AgentContext) -> None: + result_data = None + for json_file in self.logs_dir.rglob("result.json"): + try: + result_data = json.loads(json_file.read_text()) + break + except (json.JSONDecodeError, OSError): + continue + + if result_data is None: + self.logger.warning("Could not find or parse result.json from eval-cli") + return + + if result_data.get("input_tokens") is not None: + context.n_input_tokens = result_data["input_tokens"] + if result_data.get("output_tokens") is not None: + context.n_output_tokens = result_data["output_tokens"] + if result_data.get("cache_read_input_tokens") is not None: + context.n_cache_tokens = 
result_data["cache_read_input_tokens"] + + context.metadata = { + "status": result_data.get("status"), + "duration_secs": result_data.get("duration_secs"), + "model": result_data.get("model"), + } + + def _get_api_env(self) -> dict[str, str]: + env: dict[str, str] = {} + if not self.model_name or "/" not in self.model_name: + return env + + provider = self.model_name.split("/", 1)[0] + provider_env_map = { + "anthropic": "ANTHROPIC_API_KEY", + "openai": "OPENAI_API_KEY", + "google": "GEMINI_API_KEY", + "gemini": "GEMINI_API_KEY", + "deepseek": "DEEPSEEK_API_KEY", + "mistral": "MISTRAL_API_KEY", + } + + env_var = provider_env_map.get(provider) + if env_var: + api_key = os.environ.get(env_var, "") + if api_key: + env[env_var] = api_key + + return env + + def create_run_agent_commands(self, instruction: str) -> list[ExecInput]: + escaped_instruction = shlex.quote(instruction) + env = self._get_api_env() + + parts = ["eval-cli", "--workdir /testbed", "--output-dir /logs/agent"] + + if self.model_name: + parts.append(f"--model {self.model_name}") + + timeout = self._extra_env.get("EVAL_CLI_TIMEOUT") + if timeout: + parts.append(f"--timeout {timeout}") + + parts.append(f"--instruction {escaped_instruction}") + + eval_cli_command = " ".join(parts) + " 2>&1 | stdbuf -oL tee /logs/agent/eval-cli.txt" + + patch_command = ( + "cd /testbed && " + "git add -A && " + "git diff --cached HEAD > /logs/agent/patch.diff && " + "echo \"Patch size: $(wc -c < /logs/agent/patch.diff) bytes\"" + ) + + return [ + ExecInput(command=eval_cli_command, env=env), + ExecInput(command=patch_command), + ] diff --git a/crates/eval_cli/zed_eval/install.sh.j2 b/crates/eval_cli/zed_eval/install.sh.j2 new file mode 100644 index 0000000000000000000000000000000000000000..f7ebbe028216a1a7a0fd606e50a2f707db34c5ce --- /dev/null +++ b/crates/eval_cli/zed_eval/install.sh.j2 @@ -0,0 +1,49 @@ +#!/bin/bash +set -euo pipefail + +# Install runtime dependencies needed by the eval-cli binary (dynamically linked +# 
against glibc + these shared libraries from its GPUI/terminal/language stacks). +apt-get update +apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + git \ + libasound2 \ + libfontconfig1 \ + libglib2.0-0 \ + libsqlite3-0 \ + libssl3 \ + libwayland-client0 \ + libx11-xcb1 \ + libxkbcommon-x11-0 \ + libzstd1 + +# Install Node.js 22 LTS (needed by language servers like basedpyright). +curl -fsSL https://deb.nodesource.com/setup_22.x | bash - +apt-get install -y --no-install-recommends nodejs + +# Install uv (needed for running Python tests in SWE-bench tasks). +curl -LsSf https://astral.sh/uv/install.sh | sh +. "$HOME/.local/bin/env" +ln -sf "$HOME/.local/bin/uv" /usr/local/bin/uv +ln -sf "$HOME/.local/bin/uvx" /usr/local/bin/uvx + +{% if binary_uploaded is defined %} +# Binary was uploaded directly via setup() — just verify it works. +eval-cli --help +{% elif download_url is defined %} +curl -fsSL "{{ download_url }}" -o /usr/local/bin/eval-cli +chmod +x /usr/local/bin/eval-cli +eval-cli --help +{% else %} +echo "ERROR: No eval-cli binary provided." +echo "" +echo "Either pass binary_path= to upload a local build:" +echo " --ae binary_path=/path/to/target/release/eval-cli" +echo "" +echo "Or set download_url= / EVAL_CLI_DOWNLOAD_URL:" +echo " --ae download_url=https://example.com/eval-cli" +exit 1 +{% endif %} + +echo "INSTALL_SUCCESS" diff --git a/crates/eval_cli/zed_eval/pyproject.toml b/crates/eval_cli/zed_eval/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..416c025826eaf99ad029c914b609aa28abd56f00 --- /dev/null +++ b/crates/eval_cli/zed_eval/pyproject.toml @@ -0,0 +1,10 @@ +[project] +name = "zed-eval" +version = "0.1.0" +description = "Harbor agent wrapper for Zed's eval-cli" +requires-python = ">=3.12" +dependencies = ["harbor"] + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta"