eval: Add eval_cli crate (#50922)

Ben Brandt created

Very much wip

Release Notes:

- N/A

Change summary

Cargo.lock                              |  41 ++
Cargo.toml                              |   1 
crates/eval_cli/.gitignore              |   3 
crates/eval_cli/Cargo.toml              |  50 ++
crates/eval_cli/Dockerfile              |  62 +++
crates/eval_cli/Dockerfile.dockerignore |  21 +
crates/eval_cli/LICENSE-GPL             |   1 
crates/eval_cli/README.md               | 108 +++++
crates/eval_cli/build.rs                |  15 
crates/eval_cli/script/build-linux      |  57 ++
crates/eval_cli/src/headless.rs         | 131 ++++++
crates/eval_cli/src/main.rs             | 550 +++++++++++++++++++++++++++
crates/eval_cli/zed_eval/__init__.py    |   3 
crates/eval_cli/zed_eval/agent.py       | 161 +++++++
crates/eval_cli/zed_eval/install.sh.j2  |  49 ++
crates/eval_cli/zed_eval/pyproject.toml |  10 
16 files changed, 1,263 insertions(+)

Detailed changes

Cargo.lock 🔗

@@ -5892,6 +5892,47 @@ dependencies = [
  "watch",
 ]
 
+[[package]]
+name = "eval_cli"
+version = "0.1.0"
+dependencies = [
+ "acp_thread",
+ "agent",
+ "agent-client-protocol",
+ "agent_ui",
+ "anyhow",
+ "clap",
+ "client",
+ "ctrlc",
+ "debug_adapter_extension",
+ "env_logger 0.11.8",
+ "extension",
+ "feature_flags",
+ "fs",
+ "futures 0.3.31",
+ "gpui",
+ "gpui_platform",
+ "gpui_tokio",
+ "language",
+ "language_extension",
+ "language_model",
+ "language_models",
+ "languages",
+ "node_runtime",
+ "paths",
+ "project",
+ "prompt_store",
+ "release_channel",
+ "reqwest_client",
+ "serde",
+ "serde_json",
+ "settings",
+ "shellexpand 2.1.2",
+ "terminal_view",
+ "util",
+ "watch",
+]
+
 [[package]]
 name = "eval_utils"
 version = "0.1.0"

Cargo.toml 🔗

@@ -66,6 +66,7 @@ members = [
     "crates/encoding_selector",
     "crates/etw_tracing",
     "crates/eval",
+    "crates/eval_cli",
     "crates/eval_utils",
     "crates/explorer_command_injector",
     "crates/extension",

crates/eval_cli/Cargo.toml 🔗

@@ -0,0 +1,50 @@
+[package]
+name = "eval_cli"
+version = "0.1.0"
+publish.workspace = true
+edition.workspace = true
+license = "GPL-3.0-or-later"
+
+[lints]
+workspace = true
+
+[[bin]]
+name = "eval-cli"
+path = "src/main.rs"
+
+[dependencies]
+acp_thread.workspace = true
+agent.workspace = true
+agent-client-protocol.workspace = true
+agent_ui.workspace = true
+anyhow.workspace = true
+clap.workspace = true
+client.workspace = true
+ctrlc = { version = "3.5", features = ["termination"] }
+debug_adapter_extension.workspace = true
+env_logger.workspace = true
+extension.workspace = true
+feature_flags.workspace = true
+fs.workspace = true
+futures.workspace = true
+gpui.workspace = true
+gpui_platform.workspace = true
+gpui_tokio.workspace = true
+language.workspace = true
+language_extension.workspace = true
+language_model.workspace = true
+language_models.workspace = true
+languages = { workspace = true, features = ["load-grammars"] }
+node_runtime.workspace = true
+paths.workspace = true
+project.workspace = true
+prompt_store.workspace = true
+release_channel.workspace = true
+reqwest_client.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+settings.workspace = true
+shellexpand.workspace = true
+terminal_view.workspace = true
+util.workspace = true
+watch.workspace = true

crates/eval_cli/Dockerfile 🔗

@@ -0,0 +1,62 @@
+# Build eval-cli for Linux.
+#
+# Usage (from the zed repo root):
+#   docker build --platform linux/amd64 -f crates/eval_cli/Dockerfile -t eval-cli-builder .
+#   docker cp "$(docker create eval-cli-builder)":/eval-cli ./target/eval-cli
+#
+# Or use the helper script:
+#   crates/eval_cli/script/build-linux
+
+FROM rust:1.93.1-bookworm AS builder
+
+WORKDIR /app
+
+# Install build dependencies (subset of script/linux needed for headless GPUI).
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    cmake \
+    clang \
+    g++ \
+    libasound2-dev \
+    libfontconfig-dev \
+    libgit2-dev \
+    libglib2.0-dev \
+    libssl-dev \
+    libwayland-dev \
+    libx11-xcb-dev \
+    libxkbcommon-x11-dev \
+    libzstd-dev \
+    libsqlite3-dev \
+    build-essential \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install wild linker for faster linking (built from source to match bookworm's glibc).
+RUN cargo install --locked wild-linker --version 0.8.0 --root /usr/local
+
+# Download WASI SDK (needed by some dependencies).
+# NOTE(review): this extracts into /app/target, but the cargo build step below
+# mounts a build cache at /app/target, which shadows this layer's contents at
+# build time — confirm the SDK is still found there, or extract it to a path
+# outside the cache mount (e.g. /wasi-sdk) and point the build at it.
+ARG TARGETARCH
+RUN mkdir -p /app/target && \
+    WASI_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "arm64" || echo "x86_64") && \
+    curl -L "https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-25/wasi-sdk-25.0-${WASI_ARCH}-linux.tar.gz" \
+    | tar -xz -C /app/target && \
+    mv /app/target/wasi-sdk-25.0-${WASI_ARCH}-linux /app/target/wasi-sdk
+
+# Pre-install the toolchain specified in rust-toolchain.toml so it is cached.
+RUN rustup toolchain install 1.93 --profile minimal \
+    --component rustfmt --component clippy --component rust-analyzer --component rust-src \
+    --target wasm32-wasip2 --target wasm32-unknown-unknown --target x86_64-unknown-linux-musl
+
+COPY . .
+
+ENV CC=clang CXX=clang++
+ENV RUSTFLAGS="-C linker=clang -C link-arg=--ld-path=wild"
+
+RUN --mount=type=cache,target=/usr/local/cargo/registry \
+    --mount=type=cache,target=/usr/local/cargo/git \
+    --mount=type=cache,target=/app/target \
+    cargo build --release --package eval_cli && \
+    cp /app/target/release/eval-cli /eval-cli && \
+    strip /eval-cli
+
+FROM scratch
+COPY --from=builder /eval-cli /eval-cli

crates/eval_cli/Dockerfile.dockerignore 🔗

@@ -0,0 +1,21 @@
+.git
+.github
+**/.gitignore
+**/.gitkeep
+.gitattributes
+.mailmap
+**/target
+zed.xcworkspace
+.DS_Store
+compose.yml
+plugins/bin
+script/node_modules
+styles/node_modules
+crates/collab/static/styles.css
+vendor/bin
+assets/themes/
+**/jobs
+
+**/*.egg-info
+**/__pycache__
+**/.venv

crates/eval_cli/README.md 🔗

@@ -0,0 +1,108 @@
+# eval-cli
+
+Headless CLI binary for running Zed's agent in evaluation/benchmark
+environments. Designed to work inside containerized environments like
+[Harbor](https://harborframework.com/) where the repository is already
+checked out and API keys are provided via environment variables.
+
+Uses the same `NativeAgent` + `AcpThread` pipeline as the production Zed
+editor — full agentic loop with tool calls, subagents, and retries, just
+without a GUI.
+
+## Building
+
+### Native (for local testing on the same OS)
+
+```
+cargo build --release -p eval_cli
+```
+
+### Cross-compile for Linux x86_64 (from macOS or other hosts)
+
+Harbor containers run Linux x86_64. Use the Docker-based build script:
+
+```
+crates/eval_cli/script/build-linux
+```
+
+This produces `target/eval-cli` (an x86_64 Linux ELF binary). You can
+also specify a custom output path:
+
+```
+crates/eval_cli/script/build-linux --output ~/bin/eval-cli-linux
+```
+
+## Standalone usage
+
+```
+eval-cli \
+  --workdir /testbed \
+  --model anthropic/claude-sonnet-4-6-latest \
+  --instruction "Fix the bug described in..." \
+  --timeout 600 \
+  --output-dir /logs/agent
+```
+
+Reads API keys from environment variables (`ANTHROPIC_API_KEY`,
+`OPENAI_API_KEY`, etc.). Writes `result.json`, `thread.md`, and
+`thread.json` to the output directory.
+
+### Exit codes
+
+| Code | Meaning                            |
+| ---- | ---------------------------------- |
+| 0    | Agent finished                     |
+| 1    | Error (model/auth/runtime failure) |
+| 2    | Timeout                            |
+| 3    | Interrupted (SIGTERM/SIGINT)       |
+
+## Harbor integration
+
+The `zed_eval/` directory contains a Python package that
+implements Harbor's `BaseInstalledAgent` interface, allowing eval-cli to
+be used with `--agent-import-path` without modifying Harbor's source code.
+
+### Setup
+
+```
+pip install -e crates/eval_cli/zed_eval/
+```
+
+### Running with a local binary
+
+Build for Linux first, then pass the binary path:
+
+```
+crates/eval_cli/script/build-linux
+
+harbor run -d "swebench_verified@latest" \
+  --agent-import-path zed_eval.agent:ZedAgent \
+  --ak binary_path=target/eval-cli \
+  -m anthropic/claude-sonnet-4-6-latest
+```
+
+The agent uploads the binary into the container during setup — no
+download URL needed during local iteration.
+
+### Running with a download URL
+
+For CI or when the binary is hosted somewhere:
+
+```
+harbor run -d "swebench_verified@latest" \
+  --agent-import-path zed_eval.agent:ZedAgent \
+  --ak download_url=https://example.com/eval-cli \
+  -m anthropic/claude-sonnet-4-6-latest
+```
+
+### Setting a timeout
+
+Pass `EVAL_CLI_TIMEOUT` via `--ae`:
+
+```
+harbor run -d "swebench_verified@latest" \
+  --agent-import-path zed_eval.agent:ZedAgent \
+  --ak binary_path=target/eval-cli \
+  --ae EVAL_CLI_TIMEOUT=600 \
+  -m anthropic/claude-sonnet-4-6-latest
+```

crates/eval_cli/build.rs 🔗

@@ -0,0 +1,15 @@
+fn main() {
+    let cargo_toml =
+        std::fs::read_to_string("../zed/Cargo.toml").expect("Failed to read crates/zed/Cargo.toml");
+    let version = cargo_toml
+        .lines()
+        .find(|line| line.starts_with("version = "))
+        .expect("Version not found in crates/zed/Cargo.toml")
+        .split('=')
+        .nth(1)
+        .expect("Invalid version format")
+        .trim()
+        .trim_matches('"');
+    println!("cargo:rerun-if-changed=../zed/Cargo.toml");
+    println!("cargo:rustc-env=ZED_PKG_VERSION={}", version);
+}

crates/eval_cli/script/build-linux 🔗

@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+#
+# Build eval-cli for x86_64 Linux from any host (macOS, Linux, etc.)
+# using Docker. The resulting binary is placed at the path printed on
+# completion (default: target/eval-cli).
+#
+# Usage:
+#   crates/eval_cli/script/build-linux [--output PATH]
+#
+# Examples:
+#   crates/eval_cli/script/build-linux
+#   crates/eval_cli/script/build-linux --output ~/bin/eval-cli
+#
+# Prerequisites: Docker must be installed and running.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+OUTPUT="${REPO_ROOT}/target/eval-cli"
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --output)
+            OUTPUT="$2"
+            shift 2
+            ;;
+        *)
+            echo "Unknown option: $1" >&2
+            exit 1
+            ;;
+    esac
+done
+
+cd "$REPO_ROOT"
+
+IMAGE_TAG="eval-cli-builder"
+
+echo "Building eval-cli for x86_64-unknown-linux-gnu..."
+echo "  Repo root: $REPO_ROOT"
+echo "  Output:    $OUTPUT"
+echo ""
+
+docker build \
+    --platform linux/amd64 \
+    -f crates/eval_cli/Dockerfile \
+    -t "$IMAGE_TAG" \
+    .
+
+CONTAINER_ID=$(docker create "$IMAGE_TAG" /eval-cli)
+mkdir -p "$(dirname "$OUTPUT")"
+docker cp "$CONTAINER_ID":/eval-cli "$OUTPUT"
+docker rm "$CONTAINER_ID" > /dev/null
+
+echo ""
+echo "Built successfully: $OUTPUT"
+echo "  $(file "$OUTPUT")"

crates/eval_cli/src/headless.rs 🔗

@@ -0,0 +1,131 @@
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use client::{Client, ProxySettings, UserStore};
+use extension::ExtensionHostProxy;
+use fs::RealFs;
+use gpui::http_client::read_proxy_from_env;
+use gpui::{App, AppContext as _, Entity};
+use gpui_tokio::Tokio;
+use language::LanguageRegistry;
+use language_extension::LspAccess;
+use node_runtime::{NodeBinaryOptions, NodeRuntime};
+use project::project_settings::ProjectSettings;
+use prompt_store::PromptBuilder;
+use release_channel::{AppCommitSha, AppVersion};
+use reqwest_client::ReqwestClient;
+use settings::{Settings, SettingsStore};
+use util::ResultExt as _;
+
+pub struct AgentCliAppState {
+    pub languages: Arc<LanguageRegistry>,
+    pub client: Arc<Client>,
+    pub user_store: Entity<UserStore>,
+    pub fs: Arc<dyn fs::Fs>,
+    pub node_runtime: NodeRuntime,
+}
+
+pub fn init(cx: &mut App) -> Arc<AgentCliAppState> {
+    let app_commit_sha = option_env!("ZED_COMMIT_SHA").map(|s| AppCommitSha::new(s.to_owned()));
+
+    let app_version = AppVersion::load(
+        env!("ZED_PKG_VERSION"),
+        option_env!("ZED_BUILD_ID"),
+        app_commit_sha,
+    );
+
+    release_channel::init(app_version.clone(), cx);
+    gpui_tokio::init(cx);
+
+    let settings_store = SettingsStore::new(cx, &settings::default_settings());
+    cx.set_global(settings_store);
+
+    let user_agent = format!(
+        "Zed Agent CLI/{} ({}; {})",
+        app_version,
+        std::env::consts::OS,
+        std::env::consts::ARCH
+    );
+    let proxy_str = ProxySettings::get_global(cx).proxy.to_owned();
+    let proxy_url = proxy_str
+        .as_ref()
+        .and_then(|input| input.parse().ok())
+        .or_else(read_proxy_from_env);
+    let http = {
+        let _guard = Tokio::handle(cx).enter();
+        ReqwestClient::proxy_and_user_agent(proxy_url, &user_agent)
+            .expect("could not start HTTP client")
+    };
+    cx.set_http_client(Arc::new(http));
+
+    let client = Client::production(cx);
+    cx.set_http_client(client.http_client());
+
+    let git_binary_path = None;
+    let fs = Arc::new(RealFs::new(
+        git_binary_path,
+        cx.background_executor().clone(),
+    ));
+
+    let mut languages = LanguageRegistry::new(cx.background_executor().clone());
+    languages.set_language_server_download_dir(paths::languages_dir().clone());
+    let languages = Arc::new(languages);
+
+    let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
+
+    extension::init(cx);
+
+    let (mut node_options_tx, node_options_rx) = watch::channel(None);
+    cx.observe_global::<SettingsStore>(move |cx| {
+        let settings = &ProjectSettings::get_global(cx).node;
+        let options = NodeBinaryOptions {
+            allow_path_lookup: !settings.ignore_system_version,
+            allow_binary_download: true,
+            use_paths: settings.path.as_ref().map(|node_path| {
+                let node_path = PathBuf::from(shellexpand::tilde(node_path).as_ref());
+                let npm_path = settings
+                    .npm_path
+                    .as_ref()
+                    .map(|path| PathBuf::from(shellexpand::tilde(&path).as_ref()));
+                (
+                    node_path.clone(),
+                    npm_path.unwrap_or_else(|| {
+                        let base_path = PathBuf::new();
+                        node_path.parent().unwrap_or(&base_path).join("npm")
+                    }),
+                )
+            }),
+        };
+        node_options_tx.send(Some(options)).log_err();
+    })
+    .detach();
+    let node_runtime = NodeRuntime::new(client.http_client(), None, node_options_rx);
+
+    let extension_host_proxy = ExtensionHostProxy::global(cx);
+    debug_adapter_extension::init(extension_host_proxy.clone(), cx);
+    language_extension::init(LspAccess::Noop, extension_host_proxy, languages.clone());
+    language_model::init(client.clone(), cx);
+    language_models::init(user_store.clone(), client.clone(), cx);
+    languages::init(languages.clone(), fs.clone(), node_runtime.clone(), cx);
+    prompt_store::init(cx);
+    terminal_view::init(cx);
+
+    let stdout_is_a_pty = false;
+    let prompt_builder = PromptBuilder::load(fs.clone(), stdout_is_a_pty, cx);
+    agent_ui::init(
+        fs.clone(),
+        client.clone(),
+        prompt_builder,
+        languages.clone(),
+        true,
+        cx,
+    );
+
+    Arc::new(AgentCliAppState {
+        languages,
+        client,
+        user_store,
+        fs,
+        node_runtime,
+    })
+}

crates/eval_cli/src/main.rs 🔗

@@ -0,0 +1,550 @@
+//! Headless CLI binary for running Zed's agent in evaluation/benchmark environments.
+//!
+//! Designed to work inside containerized environments (like Harbor/termbench) where:
+//! - The repository is already checked out at the working directory
+//! - The model API key is provided via environment variables
+//! - Results are written to an output directory (default: `/logs/agent/`)
+//!
+//! ## Usage
+//!
+//! ```text
+//! eval-cli --workdir /testbed --model anthropic/claude-sonnet-4-6-latest \
+//!          --instruction "Fix the bug described in..." --timeout 600
+//! ```
+//!
+//! ## Output
+//!
+//! Writes to `--output-dir` (default `/logs/agent/`):
+//!   - `result.json`  — structured result with status, timing, and token usage
+//!   - `thread.md`    — full conversation as markdown
+//!   - `thread.json`  — raw thread state as JSON
+//!
+//! ## Exit codes
+//!
+//! | Code | Meaning |
+//! |------|---------|
+//! | 0    | Agent finished |
+//! | 1    | Error (model/auth/runtime failure) |
+//! | 2    | Timeout |
+//! | 3    | Interrupted (SIGTERM/SIGINT) |
+
+mod headless;
+
+use std::path::PathBuf;
+use std::process;
+use std::rc::Rc;
+use std::str::FromStr;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::time::{Duration, Instant};
+
+use acp_thread::AgentConnection as _;
+use agent::{NativeAgent, NativeAgentConnection, Templates, ThreadStore};
+use agent_client_protocol as acp;
+use anyhow::{Context, Result};
+use clap::Parser;
+use feature_flags::FeatureFlagAppExt as _;
+
+use futures::{FutureExt, select_biased};
+use gpui::{AppContext as _, AsyncApp, Entity, UpdateGlobal};
+use language_model::{LanguageModelRegistry, SelectedModel};
+use project::Project;
+use settings::SettingsStore;
+
+use crate::headless::AgentCliAppState;
+
+#[derive(Parser, Debug)]
+#[command(
+    name = "eval-cli",
+    about = "Run Zed's agent headlessly in evaluation/benchmark environments"
+)]
+struct Args {
+    /// Output current environment variables as JSON to stdout.
+    /// Used internally by Zed's shell environment capture.
+    #[arg(long, hide = true)]
+    printenv: bool,
+
+    /// Path to the repository working directory. Defaults to the current directory.
+    #[arg(long, default_value = ".")]
+    workdir: PathBuf,
+
+    /// Instruction/prompt text. If omitted, read from --instruction-file or stdin.
+    #[arg(long)]
+    instruction: Option<String>,
+
+    /// Language model to use, in `provider/model` format.
+    #[arg(long, default_value = "anthropic/claude-sonnet-4-6-latest")]
+    model: String,
+
+    /// Maximum wall-clock time in seconds for the agent run.
+    #[arg(long)]
+    timeout: Option<u64>,
+
+    /// Directory for output artifacts (result.json, thread.md, thread.json).
+    #[arg(long, default_value = "/logs/agent")]
+    output_dir: PathBuf,
+}
+
+enum AgentOutcome {
+    Completed,
+    Timeout { seconds: u64 },
+    Interrupted,
+}
+
+#[derive(serde::Serialize)]
+struct EvalResult {
+    status: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    error: Option<String>,
+    duration_secs: f64,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    timeout_secs: Option<u64>,
+    model: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    input_tokens: Option<u64>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    output_tokens: Option<u64>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    cache_creation_input_tokens: Option<u64>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    cache_read_input_tokens: Option<u64>,
+}
+
+const EXIT_OK: i32 = 0;
+const EXIT_ERROR: i32 = 1;
+const EXIT_TIMEOUT: i32 = 2;
+const EXIT_INTERRUPTED: i32 = 3;
+
+static TERMINATED: AtomicBool = AtomicBool::new(false);
+
+fn main() {
+    let args = Args::parse();
+
+    if args.printenv {
+        util::shell_env::print_env();
+        return;
+    }
+
+    env_logger::init();
+
+    ctrlc::set_handler(|| {
+        TERMINATED.store(true, Ordering::SeqCst);
+    })
+    .expect("failed to set signal handler");
+
+    let instruction = read_instruction(&args).unwrap_or_else(|e| {
+        eprintln!("Error reading instruction: {e}");
+        process::exit(EXIT_ERROR);
+    });
+
+    let workdir = args.workdir.canonicalize().unwrap_or_else(|e| {
+        eprintln!("Invalid --workdir {:?}: {e}", args.workdir);
+        process::exit(EXIT_ERROR);
+    });
+
+    let output_dir = args.output_dir.clone();
+    if let Err(e) = std::fs::create_dir_all(&output_dir) {
+        eprintln!("Error creating output dir {}: {e}", output_dir.display());
+        process::exit(EXIT_ERROR);
+    }
+
+    let http_client = Arc::new(reqwest_client::ReqwestClient::new());
+    let app = gpui_platform::headless().with_http_client(http_client);
+
+    app.run(move |cx| {
+        let app_state = headless::init(cx);
+        cx.set_staff(true);
+
+        let auth_tasks = LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
+            registry
+                .providers()
+                .iter()
+                .map(|p| p.authenticate(cx))
+                .collect::<Vec<_>>()
+        });
+
+        let model_name = args.model.clone();
+        let timeout = args.timeout;
+
+        cx.spawn(async move |cx| {
+            futures::future::join_all(auth_tasks).await;
+
+            let start = Instant::now();
+
+            let (outcome, token_usage) = run_agent(
+                &app_state,
+                &workdir,
+                &instruction,
+                &model_name,
+                timeout,
+                Some(&output_dir),
+                cx,
+            )
+            .await;
+
+            let duration = start.elapsed();
+
+            let (status, error, exit_code) = match &outcome {
+                Ok(AgentOutcome::Completed) => ("completed".to_string(), None, EXIT_OK),
+                Ok(AgentOutcome::Timeout { seconds }) => {
+                    eprintln!("Timeout: agent exceeded {seconds}s time limit");
+                    ("timeout".to_string(), None, EXIT_TIMEOUT)
+                }
+                Ok(AgentOutcome::Interrupted) => {
+                    eprintln!("Interrupted: received SIGTERM, saved partial output");
+                    ("interrupted".to_string(), None, EXIT_INTERRUPTED)
+                }
+                Err(e) => {
+                    eprintln!("Error: {e:#}");
+                    ("error".to_string(), Some(format!("{e:#}")), EXIT_ERROR)
+                }
+            };
+
+            let result = EvalResult {
+                status,
+                error,
+                duration_secs: duration.as_secs_f64(),
+                timeout_secs: timeout,
+                model: model_name.clone(),
+                input_tokens: token_usage.as_ref().map(|u| u.input_tokens),
+                output_tokens: token_usage.as_ref().map(|u| u.output_tokens),
+                cache_creation_input_tokens: token_usage
+                    .as_ref()
+                    .filter(|u| u.cache_creation_input_tokens > 0)
+                    .map(|u| u.cache_creation_input_tokens),
+                cache_read_input_tokens: token_usage
+                    .as_ref()
+                    .filter(|u| u.cache_read_input_tokens > 0)
+                    .map(|u| u.cache_read_input_tokens),
+            };
+
+            match serde_json::to_string_pretty(&result) {
+                Ok(json) => {
+                    if let Err(e) = std::fs::write(output_dir.join("result.json"), &json) {
+                        eprintln!("Error writing result.json: {e:#}");
+                    }
+                    eprintln!("[eval-cli] result: {json}");
+                }
+                Err(e) => eprintln!("Error serializing result: {e:#}"),
+            }
+
+            cx.update(|cx| cx.quit());
+            process::exit(exit_code);
+        })
+        .detach();
+    });
+}
+
+fn read_instruction(args: &Args) -> Result<String> {
+    let text = if let Some(text) = &args.instruction {
+        text.clone()
+    } else {
+        use std::io::Read;
+        let mut buf = String::new();
+        std::io::stdin()
+            .read_to_string(&mut buf)
+            .context("reading instruction from stdin")?;
+        buf
+    };
+    anyhow::ensure!(!text.trim().is_empty(), "instruction is empty");
+    Ok(text)
+}
+
+async fn run_agent(
+    app_state: &Arc<AgentCliAppState>,
+    workdir: &std::path::Path,
+    instruction: &str,
+    model_name: &str,
+    timeout: Option<u64>,
+    output_dir: Option<&std::path::Path>,
+    cx: &mut AsyncApp,
+) -> (Result<AgentOutcome>, Option<language_model::TokenUsage>) {
+    let setup_result: Result<()> = cx.update(|cx| {
+        let selected = SelectedModel::from_str(model_name).map_err(|e| anyhow::anyhow!("{e}"))?;
+        let registry = LanguageModelRegistry::global(cx);
+        let model = registry
+            .read(cx)
+            .available_models(cx)
+            .find(|m| m.id() == selected.model && m.provider_id() == selected.provider)
+            .ok_or_else(|| {
+                let available = registry
+                    .read(cx)
+                    .available_models(cx)
+                    .map(|m| format!("{}/{}", m.provider_id().0, m.id().0))
+                    .collect::<Vec<_>>()
+                    .join(", ");
+                anyhow::anyhow!("Model {model_name} not found. Available: {available}")
+            })?;
+
+        let supports_thinking = model.supports_thinking();
+
+        registry.update(cx, |registry, cx| {
+            registry.set_default_model(
+                Some(language_model::ConfiguredModel {
+                    provider: registry
+                        .provider(&model.provider_id())
+                        .context("Provider not found")?,
+                    model,
+                }),
+                cx,
+            );
+            anyhow::Ok(())
+        })?;
+
+        let (enable_thinking, effort) = if supports_thinking {
+            (true, "\"high\"")
+        } else {
+            (false, "null")
+        };
+        let provider_id = selected.provider.0.to_string();
+        let model_id = selected.model.0.to_string();
+        SettingsStore::update_global(cx, |store, cx| {
+            let settings = format!(
+                r#"{{
+                    "agent": {{
+                        "tool_permissions": {{"default": "allow"}},
+                        "default_model": {{
+                            "provider": "{provider_id}",
+                            "model": "{model_id}",
+                            "enable_thinking": {enable_thinking},
+                            "effort": {effort}
+                        }}
+                    }},
+                    "autosave": "off",
+                    "format_on_save": "off"
+                }}"#
+            );
+            store.set_user_settings(&settings, cx).ok();
+        });
+
+        anyhow::Ok(())
+    });
+
+    if let Err(e) = setup_result {
+        return (Err(e), None);
+    }
+
+    let project = cx.update(|cx| {
+        Project::local(
+            app_state.client.clone(),
+            app_state.node_runtime.clone(),
+            app_state.user_store.clone(),
+            app_state.languages.clone(),
+            app_state.fs.clone(),
+            None,
+            project::LocalProjectFlags {
+                init_worktree_trust: false,
+                ..Default::default()
+            },
+            cx,
+        )
+    });
+
+    let worktree = project.update(cx, |project, cx| project.create_worktree(workdir, true, cx));
+    let worktree = match worktree.await {
+        Ok(w) => w,
+        Err(e) => return (Err(e).context("creating worktree"), None),
+    };
+
+    let scan_result = worktree.update(cx, |tree, _cx| {
+        tree.as_local()
+            .context("expected local worktree")
+            .map(|local| local.scan_complete())
+    });
+    match scan_result {
+        Ok(future) => future.await,
+        Err(e) => return (Err(e), None),
+    };
+
+    let thread_store = cx.new(|cx| ThreadStore::new(cx));
+    let agent = match NativeAgent::new(
+        project.clone(),
+        thread_store,
+        Templates::new(),
+        None,
+        app_state.fs.clone(),
+        cx,
+    )
+    .await
+    {
+        Ok(a) => a,
+        Err(e) => return (Err(e).context("creating agent"), None),
+    };
+
+    let connection = Rc::new(NativeAgentConnection(agent.clone()));
+    let acp_thread = match cx
+        .update(|cx| connection.clone().new_session(project, workdir, cx))
+        .await
+    {
+        Ok(t) => t,
+        Err(e) => return (Err(e).context("creating ACP session"), None),
+    };
+
+    let _subscription = cx.subscribe(&acp_thread, |acp_thread, event, cx| {
+        log_acp_thread_event(&acp_thread, event, cx);
+    });
+
+    let message = vec![acp::ContentBlock::Text(acp::TextContent::new(
+        instruction.to_string(),
+    ))];
+
+    let send_future = acp_thread.update(cx, |acp_thread: &mut acp_thread::AcpThread, cx| {
+        acp_thread.send(message, cx)
+    });
+
+    let timeout_future = if let Some(timeout_secs) = timeout {
+        futures::future::Either::Left(
+            cx.background_executor()
+                .timer(Duration::from_secs(timeout_secs)),
+        )
+    } else {
+        futures::future::Either::Right(futures::future::pending::<()>())
+    };
+
+    let sigterm_future = {
+        let executor = cx.background_executor().clone();
+        async move {
+            while !TERMINATED.load(Ordering::Relaxed) {
+                executor.timer(Duration::from_millis(100)).await;
+            }
+        }
+    };
+
+    let outcome = select_biased! {
+        result = send_future.fuse() => match result {
+            Ok(Some(response)) => {
+                eprintln!("[eval-cli] stopped: {:?}", response.stop_reason);
+                if response.stop_reason == acp::StopReason::MaxTokens {
+                    Err(anyhow::anyhow!("Model hit maximum token limit"))
+                } else {
+                    Ok(AgentOutcome::Completed)
+                }
+            }
+            Ok(None) => {
+                eprintln!("[eval-cli] completed (no response)");
+                Ok(AgentOutcome::Completed)
+            }
+            Err(e) => Err(e).context("agent run failed"),
+        },
+        _ = sigterm_future.fuse() => {
+            eprintln!("[eval-cli] received SIGTERM, cancelling...");
+            acp_thread.update(cx, |t: &mut acp_thread::AcpThread, cx| t.cancel(cx)).await;
+            Ok(AgentOutcome::Interrupted)
+        },
+        _ = timeout_future.fuse() => {
+            acp_thread.update(cx, |t: &mut acp_thread::AcpThread, cx| t.cancel(cx)).await;
+            Ok(AgentOutcome::Timeout { seconds: timeout.unwrap_or(0) })
+        }
+    };
+
+    let thread = cx.update(|cx| {
+        let session_id = acp_thread.read(cx).session_id().clone();
+        connection.thread(&session_id, cx)
+    });
+
+    let cumulative_usage = if let Some(thread) = &thread {
+        let db_thread = thread.read_with(cx, |thread, cx| thread.to_db(cx));
+        let db_thread = db_thread.await;
+        let usage = db_thread.cumulative_token_usage;
+        if usage.input_tokens > 0 || usage.output_tokens > 0 {
+            Some(usage)
+        } else {
+            None
+        }
+    } else {
+        None
+    };
+
+    let acp_usage = cx.update(|cx| {
+        acp_thread
+            .read(cx)
+            .token_usage()
+            .map(|usage| language_model::TokenUsage {
+                input_tokens: usage.input_tokens,
+                output_tokens: usage.output_tokens,
+                ..Default::default()
+            })
+    });
+
+    let final_usage = cumulative_usage.or(acp_usage);
+
+    if let (Some(thread), Some(dir)) = (&thread, output_dir) {
+        let markdown = thread.read_with(cx, |thread, _cx| thread.to_markdown());
+        if let Err(e) = std::fs::write(dir.join("thread.md"), markdown) {
+            eprintln!("Error writing thread.md: {e:#}");
+        }
+
+        let db_thread = thread.read_with(cx, |thread, cx| thread.to_db(cx));
+        let db_thread = db_thread.await;
+        match serde_json::to_string_pretty(&db_thread) {
+            Ok(json) => {
+                if let Err(e) = std::fs::write(dir.join("thread.json"), json) {
+                    eprintln!("Error writing thread.json: {e:#}");
+                }
+            }
+            Err(e) => eprintln!("Error serializing thread.json: {e:#}"),
+        }
+    }
+
+    (outcome, final_usage)
+}
+
+fn log_acp_thread_event(
+    acp_thread: &Entity<acp_thread::AcpThread>,
+    event: &acp_thread::AcpThreadEvent,
+    cx: &mut gpui::App,
+) {
+    match event {
+        acp_thread::AcpThreadEvent::NewEntry => {
+            let entries = acp_thread.read(cx).entries();
+            if let Some(acp_thread::AgentThreadEntry::AssistantMessage(message)) = entries.last() {
+                for chunk in &message.chunks {
+                    if let acp_thread::AssistantMessageChunk::Message { block } = chunk {
+                        if let acp_thread::ContentBlock::Markdown { markdown } = block {
+                            let text = markdown.read(cx).source().to_string();
+                            if !text.is_empty() {
+                                eprint!("{text}");
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        acp_thread::AcpThreadEvent::EntryUpdated(index) => {
+            let entries = acp_thread.read(cx).entries();
+            if let Some(acp_thread::AgentThreadEntry::ToolCall(tool_call)) = entries.get(*index) {
+                if let Some(name) = &tool_call.tool_name {
+                    match &tool_call.status {
+                        acp_thread::ToolCallStatus::Completed => {
+                            eprintln!("[tool] {name} ✓");
+                        }
+                        acp_thread::ToolCallStatus::Failed => {
+                            eprintln!("[tool] {name} ✗");
+                        }
+                        acp_thread::ToolCallStatus::Rejected => {
+                            eprintln!("[tool] {name} rejected");
+                        }
+                        acp_thread::ToolCallStatus::Canceled => {
+                            eprintln!("[tool] {name} canceled");
+                        }
+                        _ => {}
+                    }
+                }
+            }
+        }
+        acp_thread::AcpThreadEvent::Stopped(reason) => {
+            eprintln!("\n[eval-cli] stopped: {reason:?}");
+        }
+        acp_thread::AcpThreadEvent::Error => {
+            eprintln!("[eval-cli] error event");
+        }
+        acp_thread::AcpThreadEvent::Retry(status) => {
+            eprintln!("[eval-cli] retry: {status:?}");
+        }
+        acp_thread::AcpThreadEvent::SubagentSpawned(session_id) => {
+            eprintln!("[eval-cli] subagent spawned: {session_id}");
+        }
+        _ => {}
+    }
+}

crates/eval_cli/zed_eval/agent.py 🔗

@@ -0,0 +1,161 @@
+"""Harbor agent wrapper for Zed's eval-cli binary.
+
+Usage:
+    # Build eval-cli locally first:
+    cargo build --release -p eval_cli
+
+    # Run via Harbor with a local binary:
+    harbor run -d "dataset@version" \
+        --agent-import-path zed_eval.agent:ZedAgent \
+        --ae binary_path=/path/to/target/release/eval-cli \
+        --agent-model anthropic/claude-sonnet-4-6-latest
+
+    # Or with a download URL (for CI):
+    harbor run -d "dataset@version" \
+        --agent-import-path zed_eval.agent:ZedAgent \
+        --ae download_url=https://example.com/eval-cli \
+        --agent-model anthropic/claude-sonnet-4-6-latest
+"""
+
+import json
+import os
+import shlex
+from pathlib import Path
+
+from harbor.agents.installed.base import BaseInstalledAgent, ExecInput
+from harbor.environments.base import BaseEnvironment
+from harbor.models.agent.context import AgentContext
+
+
+class ZedAgent(BaseInstalledAgent):
+    """Runs Zed's headless AI agent (eval-cli) to solve tasks.
+
+    The eval-cli binary boots a headless GPUI application and uses the same
+    NativeAgent + AcpThread pipeline as the production Zed editor, driving
+    the full agentic loop (tool calls, subagents, retries) without a GUI.
+    """
+
+    def __init__(
+        self,
+        logs_dir: Path,
+        binary_path: str | None = None,
+        download_url: str | None = None,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(logs_dir, *args, **kwargs)
+        self._binary_path = binary_path
+        self._download_url = download_url or os.environ.get("EVAL_CLI_DOWNLOAD_URL")
+
+    @staticmethod
+    def name() -> str:
+        return "zed"
+
+    @property
+    def _install_agent_template_path(self) -> Path:
+        return Path(__file__).parent / "install.sh.j2"
+
+    async def setup(self, environment: BaseEnvironment) -> None:
+        await environment.exec(command="mkdir -p /installed-agent")
+
+        if self._binary_path:
+            binary = Path(self._binary_path)
+            if not binary.exists():
+                raise FileNotFoundError(
+                    f"eval-cli binary not found at {binary}. "
+                    "Build it with: cargo build --release -p eval_cli"
+                )
+            await environment.upload_file(
+                source_path=binary,
+                target_path="/usr/local/bin/eval-cli",
+            )
+            await environment.exec(command="chmod +x /usr/local/bin/eval-cli")
+
+        await super().setup(environment)
+
+    @property
+    def _template_variables(self) -> dict[str, str]:
+        variables = super()._template_variables
+        if self._binary_path:
+            variables["binary_uploaded"] = "true"
+        if self._download_url:
+            variables["download_url"] = self._download_url
+        return variables
+
+    def populate_context_post_run(self, context: AgentContext) -> None:
+        result_data = None
+        for json_file in self.logs_dir.rglob("result.json"):
+            try:
+                result_data = json.loads(json_file.read_text())
+                break
+            except (json.JSONDecodeError, OSError):
+                continue
+
+        if result_data is None:
+            self.logger.warning("Could not find or parse result.json from eval-cli")
+            return
+
+        if result_data.get("input_tokens") is not None:
+            context.n_input_tokens = result_data["input_tokens"]
+        if result_data.get("output_tokens") is not None:
+            context.n_output_tokens = result_data["output_tokens"]
+        if result_data.get("cache_read_input_tokens") is not None:
+            context.n_cache_tokens = result_data["cache_read_input_tokens"]
+
+        context.metadata = {
+            "status": result_data.get("status"),
+            "duration_secs": result_data.get("duration_secs"),
+            "model": result_data.get("model"),
+        }
+
+    def _get_api_env(self) -> dict[str, str]:
+        env: dict[str, str] = {}
+        if not self.model_name or "/" not in self.model_name:
+            return env
+
+        provider = self.model_name.split("/", 1)[0]
+        provider_env_map = {
+            "anthropic": "ANTHROPIC_API_KEY",
+            "openai": "OPENAI_API_KEY",
+            "google": "GEMINI_API_KEY",
+            "gemini": "GEMINI_API_KEY",
+            "deepseek": "DEEPSEEK_API_KEY",
+            "mistral": "MISTRAL_API_KEY",
+        }
+
+        env_var = provider_env_map.get(provider)
+        if env_var:
+            api_key = os.environ.get(env_var, "")
+            if api_key:
+                env[env_var] = api_key
+
+        return env
+
+    def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
+        escaped_instruction = shlex.quote(instruction)
+        env = self._get_api_env()
+
+        parts = ["eval-cli", "--workdir /testbed", "--output-dir /logs/agent"]
+
+        if self.model_name:
+            parts.append(f"--model {self.model_name}")
+
+        timeout = self._extra_env.get("EVAL_CLI_TIMEOUT")
+        if timeout:
+            parts.append(f"--timeout {timeout}")
+
+        parts.append(f"--instruction {escaped_instruction}")
+
+        eval_cli_command = " ".join(parts) + " 2>&1 | stdbuf -oL tee /logs/agent/eval-cli.txt"
+
+        patch_command = (
+            "cd /testbed && "
+            "git add -A && "
+            "git diff --cached HEAD > /logs/agent/patch.diff && "
+            "echo \"Patch size: $(wc -c < /logs/agent/patch.diff) bytes\""
+        )
+
+        return [
+            ExecInput(command=eval_cli_command, env=env),
+            ExecInput(command=patch_command),
+        ]

crates/eval_cli/zed_eval/install.sh.j2 🔗

@@ -0,0 +1,49 @@
+#!/bin/bash
+# Rendered by Harbor (Jinja2) and executed inside the task environment to
+# install eval-cli and its runtime dependencies. Template variables:
+#   binary_uploaded - set when setup() already uploaded the binary
+#   download_url    - URL to fetch the binary from otherwise
+set -euo pipefail
+
+# Install runtime dependencies needed by the eval-cli binary (dynamically linked
+# against glibc + these shared libraries from its GPUI/terminal/language stacks).
+apt-get update
+apt-get install -y --no-install-recommends \
+    ca-certificates \
+    curl \
+    git \
+    libasound2 \
+    libfontconfig1 \
+    libglib2.0-0 \
+    libsqlite3-0 \
+    libssl3 \
+    libwayland-client0 \
+    libx11-xcb1 \
+    libxkbcommon-x11-0 \
+    libzstd1
+
+# Install Node.js 22 LTS (needed by language servers like basedpyright).
+curl -fsSL https://deb.nodesource.com/setup_22.x | bash -
+apt-get install -y --no-install-recommends nodejs
+
+# Install uv (needed for running Python tests in SWE-bench tasks).
+curl -LsSf https://astral.sh/uv/install.sh | sh
+. "$HOME/.local/bin/env"
+# Symlink into /usr/local/bin so uv/uvx are on PATH for every later shell,
+# not just ones that source the env file above.
+ln -sf "$HOME/.local/bin/uv" /usr/local/bin/uv
+ln -sf "$HOME/.local/bin/uvx" /usr/local/bin/uvx
+
+{% if binary_uploaded is defined %}
+# Binary was uploaded directly via setup() — just verify it works.
+eval-cli --help
+{% elif download_url is defined %}
+# Fetch the prebuilt binary (CI path) and verify it runs.
+curl -fsSL "{{ download_url }}" -o /usr/local/bin/eval-cli
+chmod +x /usr/local/bin/eval-cli
+eval-cli --help
+{% else %}
+# No binary source configured — fail the install with guidance.
+echo "ERROR: No eval-cli binary provided."
+echo ""
+echo "Either pass binary_path= to upload a local build:"
+echo "  --ae binary_path=/path/to/target/release/eval-cli"
+echo ""
+echo "Or set download_url= / EVAL_CLI_DOWNLOAD_URL:"
+echo "  --ae download_url=https://example.com/eval-cli"
+exit 1
+{% endif %}
+
+# Sentinel line the caller can grep for to confirm the install completed.
+echo "INSTALL_SUCCESS"

crates/eval_cli/zed_eval/pyproject.toml 🔗

@@ -0,0 +1,10 @@
+# Packaging for the Harbor agent wrapper (zed_eval). Installed into the
+# Harbor runner's environment, not shipped with Zed itself.
+[project]
+name = "zed-eval"
+version = "0.1.0"
+description = "Harbor agent wrapper for Zed's eval-cli"
+requires-python = ">=3.12"
+# harbor provides BaseInstalledAgent / ExecInput / AgentContext used by agent.py.
+dependencies = ["harbor"]
+
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"