Detailed changes
@@ -5892,6 +5892,47 @@ dependencies = [
"watch",
]
+[[package]]
+name = "eval_cli"
+version = "0.1.0"
+dependencies = [
+ "acp_thread",
+ "agent",
+ "agent-client-protocol",
+ "agent_ui",
+ "anyhow",
+ "clap",
+ "client",
+ "ctrlc",
+ "debug_adapter_extension",
+ "env_logger 0.11.8",
+ "extension",
+ "feature_flags",
+ "fs",
+ "futures 0.3.31",
+ "gpui",
+ "gpui_platform",
+ "gpui_tokio",
+ "language",
+ "language_extension",
+ "language_model",
+ "language_models",
+ "languages",
+ "node_runtime",
+ "paths",
+ "project",
+ "prompt_store",
+ "release_channel",
+ "reqwest_client",
+ "serde",
+ "serde_json",
+ "settings",
+ "shellexpand 2.1.2",
+ "terminal_view",
+ "util",
+ "watch",
+]
+
[[package]]
name = "eval_utils"
version = "0.1.0"
@@ -66,6 +66,7 @@ members = [
"crates/encoding_selector",
"crates/etw_tracing",
"crates/eval",
+ "crates/eval_cli",
"crates/eval_utils",
"crates/explorer_command_injector",
"crates/extension",
@@ -0,0 +1,3 @@
+**/jobs
+**/*.egg-info
+**/__pycache__
@@ -0,0 +1,50 @@
+[package]
+name = "eval_cli"
+version = "0.1.0"
+publish.workspace = true
+edition.workspace = true
+license = "GPL-3.0-or-later"
+
+[lints]
+workspace = true
+
+[[bin]]
+name = "eval-cli"
+path = "src/main.rs"
+
+[dependencies]
+acp_thread.workspace = true
+agent.workspace = true
+agent-client-protocol.workspace = true
+agent_ui.workspace = true
+anyhow.workspace = true
+clap.workspace = true
+client.workspace = true
+ctrlc = { version = "3.5", features = ["termination"] }
+debug_adapter_extension.workspace = true
+env_logger.workspace = true
+extension.workspace = true
+feature_flags.workspace = true
+fs.workspace = true
+futures.workspace = true
+gpui.workspace = true
+gpui_platform.workspace = true
+gpui_tokio.workspace = true
+language.workspace = true
+language_extension.workspace = true
+language_model.workspace = true
+language_models.workspace = true
+languages = { workspace = true, features = ["load-grammars"] }
+node_runtime.workspace = true
+paths.workspace = true
+project.workspace = true
+prompt_store.workspace = true
+release_channel.workspace = true
+reqwest_client.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+settings.workspace = true
+shellexpand.workspace = true
+terminal_view.workspace = true
+util.workspace = true
+watch.workspace = true
@@ -0,0 +1,62 @@
+# Build eval-cli for Linux.
+#
+# Usage (from the zed repo root):
+# docker build --platform linux/amd64 -f crates/eval_cli/Dockerfile -t eval-cli-builder .
+# docker cp "$(docker create eval-cli-builder)":/eval-cli ./target/eval-cli
+#
+# Or use the helper script:
+# crates/eval_cli/script/build-linux
+
+FROM rust:1.93.1-bookworm AS builder
+
+WORKDIR /app
+
+# Install build dependencies (subset of script/linux needed for headless GPUI).
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ cmake \
+ clang \
+ g++ \
+ libasound2-dev \
+ libfontconfig-dev \
+ libgit2-dev \
+ libglib2.0-dev \
+ libssl-dev \
+ libwayland-dev \
+ libx11-xcb-dev \
+ libxkbcommon-x11-dev \
+ libzstd-dev \
+ libsqlite3-dev \
+ build-essential \
+ curl \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install wild linker for faster linking (built from source to match bookworm's glibc).
+RUN cargo install --locked wild-linker --version 0.8.0 --root /usr/local
+
+# Download WASI SDK (needed by some dependencies). NOTE(review): /app/target is shadowed by the cache mount in the build RUN below — confirm the SDK is still visible (or re-downloaded) at build time.
+ARG TARGETARCH
+RUN mkdir -p /app/target && \
+ WASI_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "arm64" || echo "x86_64") && \
+ curl -L "https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-25/wasi-sdk-25.0-${WASI_ARCH}-linux.tar.gz" \
+ | tar -xz -C /app/target && \
+ mv /app/target/wasi-sdk-25.0-${WASI_ARCH}-linux /app/target/wasi-sdk
+
+# Pre-install the toolchain specified in rust-toolchain.toml so it is cached.
+RUN rustup toolchain install 1.93 --profile minimal \
+ --component rustfmt --component clippy --component rust-analyzer --component rust-src \
+ --target wasm32-wasip2 --target wasm32-unknown-unknown --target x86_64-unknown-linux-musl
+
+COPY . .
+
+ENV CC=clang CXX=clang++
+ENV RUSTFLAGS="-C linker=clang -C link-arg=--ld-path=wild"
+
+RUN --mount=type=cache,target=/usr/local/cargo/registry \
+ --mount=type=cache,target=/usr/local/cargo/git \
+ --mount=type=cache,target=/app/target \
+ cargo build --release --package eval_cli && \
+ cp /app/target/release/eval-cli /eval-cli && \
+ strip /eval-cli
+
+FROM scratch
+COPY --from=builder /eval-cli /eval-cli
@@ -0,0 +1,21 @@
+.git
+.github
+**/.gitignore
+**/.gitkeep
+.gitattributes
+.mailmap
+**/target
+zed.xcworkspace
+.DS_Store
+compose.yml
+plugins/bin
+script/node_modules
+styles/node_modules
+crates/collab/static/styles.css
+vendor/bin
+assets/themes/
+**/jobs
+
+**/*.egg-info
+**/__pycache__
+**/.venv
@@ -0,0 +1 @@
+../../LICENSE-GPL
@@ -0,0 +1,108 @@
+# eval-cli
+
+Headless CLI binary for running Zed's agent in evaluation/benchmark
+environments. Designed to work inside containerized environments like
+[Harbor](https://harborframework.com/) where the repository is already
+checked out and API keys are provided via environment variables.
+
+Uses the same `NativeAgent` + `AcpThread` pipeline as the production Zed
+editor — full agentic loop with tool calls, subagents, and retries, just
+without a GUI.
+
+## Building
+
+### Native (for local testing on the same OS)
+
+```
+cargo build --release -p eval_cli
+```
+
+### Cross-compile for Linux x86_64 (from macOS or other hosts)
+
+Harbor containers run Linux x86_64. Use the Docker-based build script:
+
+```
+crates/eval_cli/script/build-linux
+```
+
+This produces `target/eval-cli` (an x86_64 Linux ELF binary). You can
+also specify a custom output path:
+
+```
+crates/eval_cli/script/build-linux --output ~/bin/eval-cli-linux
+```
+
+## Standalone usage
+
+```
+eval-cli \
+ --workdir /testbed \
+ --model anthropic/claude-sonnet-4-6-latest \
+ --instruction "Fix the bug described in..." \
+ --timeout 600 \
+ --output-dir /logs/agent
+```
+
+Reads API keys from environment variables (`ANTHROPIC_API_KEY`,
+`OPENAI_API_KEY`, etc.). Writes `result.json`, `thread.md`, and
+`thread.json` to the output directory.
+
+### Exit codes
+
+| Code | Meaning |
+| ---- | ---------------------------------- |
+| 0 | Agent finished |
+| 1 | Error (model/auth/runtime failure) |
+| 2 | Timeout |
+| 3 | Interrupted (SIGTERM/SIGINT) |
+
+## Harbor integration
+
+The `zed_eval/` directory contains a Python package that
+implements Harbor's `BaseInstalledAgent` interface, allowing eval-cli to
+be used with `--agent-import-path` without modifying Harbor's source code.
+
+### Setup
+
+```
+pip install -e crates/eval_cli/harbor/
+```
+
+### Running with a local binary
+
+Build for Linux first, then pass the binary path:
+
+```
+crates/eval_cli/script/build-linux
+
+harbor run -d "swebench_verified@latest" \
+ --agent-import-path zed_eval.agent:ZedAgent \
+  --ak binary_path=target/eval-cli \
+ -m anthropic/claude-sonnet-4-6-latest
+```
+
+The agent uploads the binary into the container during setup — no
+download URL needed during local iteration.
+
+### Running with a download URL
+
+For CI or when the binary is hosted somewhere:
+
+```
+harbor run -d "swebench_verified@latest" \
+ --agent-import-path zed_eval.agent:ZedAgent \
+ --ak download_url=https://example.com/eval-cli \
+ -m anthropic/claude-sonnet-4-6-latest
+```
+
+### Setting a timeout
+
+Pass `EVAL_CLI_TIMEOUT` via `--ae`:
+
+```
+harbor run -d "swebench_verified@latest" \
+ --agent-import-path zed_eval.agent:ZedAgent \
+ --ak binary_path=target/eval-cli \
+ --ae EVAL_CLI_TIMEOUT=600 \
+ -m anthropic/claude-sonnet-4-6-latest
+```
@@ -0,0 +1,15 @@
+fn main() {
+ let cargo_toml =
+ std::fs::read_to_string("../zed/Cargo.toml").expect("Failed to read crates/zed/Cargo.toml");
+ let version = cargo_toml
+ .lines()
+ .find(|line| line.starts_with("version = "))
+ .expect("Version not found in crates/zed/Cargo.toml")
+ .split('=')
+ .nth(1)
+ .expect("Invalid version format")
+ .trim()
+ .trim_matches('"');
+ println!("cargo:rerun-if-changed=../zed/Cargo.toml");
+ println!("cargo:rustc-env=ZED_PKG_VERSION={}", version);
+}
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+#
+# Build eval-cli for x86_64 Linux from any host (macOS, Linux, etc.)
+# using Docker. The resulting binary is placed at the path printed on
+# completion (default: target/eval-cli).
+#
+# Usage:
+# crates/eval_cli/script/build-linux [--output PATH]
+#
+# Examples:
+# crates/eval_cli/script/build-linux
+# crates/eval_cli/script/build-linux --output ~/bin/eval-cli
+#
+# Prerequisites: Docker must be installed and running.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+OUTPUT="${REPO_ROOT}/target/eval-cli"
+
+while [[ $# -gt 0 ]]; do
+ case $1 in
+ --output)
+ OUTPUT="$2"
+ shift 2
+ ;;
+ *)
+ echo "Unknown option: $1" >&2
+ exit 1
+ ;;
+ esac
+done
+
+cd "$REPO_ROOT"
+
+IMAGE_TAG="eval-cli-builder"
+
+echo "Building eval-cli for x86_64-unknown-linux-gnu..."
+echo " Repo root: $REPO_ROOT"
+echo " Output: $OUTPUT"
+echo ""
+
+docker build \
+ --platform linux/amd64 \
+ -f crates/eval_cli/Dockerfile \
+ -t "$IMAGE_TAG" \
+ .
+
+CONTAINER_ID=$(docker create "$IMAGE_TAG" /eval-cli)
+mkdir -p "$(dirname "$OUTPUT")"
+docker cp "$CONTAINER_ID":/eval-cli "$OUTPUT"
+docker rm "$CONTAINER_ID" > /dev/null
+
+echo ""
+echo "Built successfully: $OUTPUT"
+echo " $(file "$OUTPUT")"
@@ -0,0 +1,131 @@
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use client::{Client, ProxySettings, UserStore};
+use extension::ExtensionHostProxy;
+use fs::RealFs;
+use gpui::http_client::read_proxy_from_env;
+use gpui::{App, AppContext as _, Entity};
+use gpui_tokio::Tokio;
+use language::LanguageRegistry;
+use language_extension::LspAccess;
+use node_runtime::{NodeBinaryOptions, NodeRuntime};
+use project::project_settings::ProjectSettings;
+use prompt_store::PromptBuilder;
+use release_channel::{AppCommitSha, AppVersion};
+use reqwest_client::ReqwestClient;
+use settings::{Settings, SettingsStore};
+use util::ResultExt as _;
+
+pub struct AgentCliAppState {
+ pub languages: Arc<LanguageRegistry>,
+ pub client: Arc<Client>,
+ pub user_store: Entity<UserStore>,
+ pub fs: Arc<dyn fs::Fs>,
+ pub node_runtime: NodeRuntime,
+}
+
+pub fn init(cx: &mut App) -> Arc<AgentCliAppState> {
+ let app_commit_sha = option_env!("ZED_COMMIT_SHA").map(|s| AppCommitSha::new(s.to_owned()));
+
+ let app_version = AppVersion::load(
+ env!("ZED_PKG_VERSION"),
+ option_env!("ZED_BUILD_ID"),
+ app_commit_sha,
+ );
+
+ release_channel::init(app_version.clone(), cx);
+ gpui_tokio::init(cx);
+
+ let settings_store = SettingsStore::new(cx, &settings::default_settings());
+ cx.set_global(settings_store);
+
+ let user_agent = format!(
+ "Zed Agent CLI/{} ({}; {})",
+ app_version,
+ std::env::consts::OS,
+ std::env::consts::ARCH
+ );
+ let proxy_str = ProxySettings::get_global(cx).proxy.to_owned();
+ let proxy_url = proxy_str
+ .as_ref()
+ .and_then(|input| input.parse().ok())
+ .or_else(read_proxy_from_env);
+ let http = {
+ let _guard = Tokio::handle(cx).enter();
+ ReqwestClient::proxy_and_user_agent(proxy_url, &user_agent)
+ .expect("could not start HTTP client")
+ };
+ cx.set_http_client(Arc::new(http));
+
+ let client = Client::production(cx);
+ cx.set_http_client(client.http_client());
+
+ let git_binary_path = None;
+ let fs = Arc::new(RealFs::new(
+ git_binary_path,
+ cx.background_executor().clone(),
+ ));
+
+ let mut languages = LanguageRegistry::new(cx.background_executor().clone());
+ languages.set_language_server_download_dir(paths::languages_dir().clone());
+ let languages = Arc::new(languages);
+
+ let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
+
+ extension::init(cx);
+
+ let (mut node_options_tx, node_options_rx) = watch::channel(None);
+ cx.observe_global::<SettingsStore>(move |cx| {
+ let settings = &ProjectSettings::get_global(cx).node;
+ let options = NodeBinaryOptions {
+ allow_path_lookup: !settings.ignore_system_version,
+ allow_binary_download: true,
+ use_paths: settings.path.as_ref().map(|node_path| {
+ let node_path = PathBuf::from(shellexpand::tilde(node_path).as_ref());
+ let npm_path = settings
+ .npm_path
+ .as_ref()
+ .map(|path| PathBuf::from(shellexpand::tilde(&path).as_ref()));
+ (
+ node_path.clone(),
+ npm_path.unwrap_or_else(|| {
+ let base_path = PathBuf::new();
+ node_path.parent().unwrap_or(&base_path).join("npm")
+ }),
+ )
+ }),
+ };
+ node_options_tx.send(Some(options)).log_err();
+ })
+ .detach();
+ let node_runtime = NodeRuntime::new(client.http_client(), None, node_options_rx);
+
+ let extension_host_proxy = ExtensionHostProxy::global(cx);
+ debug_adapter_extension::init(extension_host_proxy.clone(), cx);
+ language_extension::init(LspAccess::Noop, extension_host_proxy, languages.clone());
+ language_model::init(client.clone(), cx);
+ language_models::init(user_store.clone(), client.clone(), cx);
+ languages::init(languages.clone(), fs.clone(), node_runtime.clone(), cx);
+ prompt_store::init(cx);
+ terminal_view::init(cx);
+
+ let stdout_is_a_pty = false;
+ let prompt_builder = PromptBuilder::load(fs.clone(), stdout_is_a_pty, cx);
+ agent_ui::init(
+ fs.clone(),
+ client.clone(),
+ prompt_builder,
+ languages.clone(),
+ true,
+ cx,
+ );
+
+ Arc::new(AgentCliAppState {
+ languages,
+ client,
+ user_store,
+ fs,
+ node_runtime,
+ })
+}
@@ -0,0 +1,550 @@
+//! Headless CLI binary for running Zed's agent in evaluation/benchmark environments.
+//!
+//! Designed to work inside containerized environments (like Harbor/termbench) where:
+//! - The repository is already checked out at the working directory
+//! - The model API key is provided via environment variables
+//! - Results are written to an output directory (default: `/logs/agent/`)
+//!
+//! ## Usage
+//!
+//! ```text
+//! eval-cli --workdir /testbed --model anthropic/claude-sonnet-4-6-latest \
+//! --instruction "Fix the bug described in..." --timeout 600
+//! ```
+//!
+//! ## Output
+//!
+//! Writes to `--output-dir` (default `/logs/agent/`):
+//! - `result.json` — structured result with status, timing, and token usage
+//! - `thread.md` — full conversation as markdown
+//! - `thread.json` — raw thread state as JSON
+//!
+//! ## Exit codes
+//!
+//! | Code | Meaning |
+//! |------|---------|
+//! | 0 | Agent finished |
+//! | 1 | Error (model/auth/runtime failure) |
+//! | 2 | Timeout |
+//! | 3 | Interrupted (SIGTERM/SIGINT) |
+
+mod headless;
+
+use std::path::PathBuf;
+use std::process;
+use std::rc::Rc;
+use std::str::FromStr;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::time::{Duration, Instant};
+
+use acp_thread::AgentConnection as _;
+use agent::{NativeAgent, NativeAgentConnection, Templates, ThreadStore};
+use agent_client_protocol as acp;
+use anyhow::{Context, Result};
+use clap::Parser;
+use feature_flags::FeatureFlagAppExt as _;
+
+use futures::{FutureExt, select_biased};
+use gpui::{AppContext as _, AsyncApp, Entity, UpdateGlobal};
+use language_model::{LanguageModelRegistry, SelectedModel};
+use project::Project;
+use settings::SettingsStore;
+
+use crate::headless::AgentCliAppState;
+
+#[derive(Parser, Debug)]
+#[command(
+ name = "eval-cli",
+ about = "Run Zed's agent headlessly in evaluation/benchmark environments"
+)]
+struct Args {
+ /// Output current environment variables as JSON to stdout.
+ /// Used internally by Zed's shell environment capture.
+ #[arg(long, hide = true)]
+ printenv: bool,
+
+ /// Path to the repository working directory. Defaults to the current directory.
+ #[arg(long, default_value = ".")]
+ workdir: PathBuf,
+
+    /// Instruction/prompt text. If omitted, read from stdin.
+ #[arg(long)]
+ instruction: Option<String>,
+
+ /// Language model to use, in `provider/model` format.
+ #[arg(long, default_value = "anthropic/claude-sonnet-4-6-latest")]
+ model: String,
+
+ /// Maximum wall-clock time in seconds for the agent run.
+ #[arg(long)]
+ timeout: Option<u64>,
+
+ /// Directory for output artifacts (result.json, thread.md, thread.json).
+ #[arg(long, default_value = "/logs/agent")]
+ output_dir: PathBuf,
+}
+
+enum AgentOutcome {
+ Completed,
+ Timeout { seconds: u64 },
+ Interrupted,
+}
+
+#[derive(serde::Serialize)]
+struct EvalResult {
+ status: String,
+ #[serde(skip_serializing_if = "Option::is_none")]
+ error: Option<String>,
+ duration_secs: f64,
+ #[serde(skip_serializing_if = "Option::is_none")]
+ timeout_secs: Option<u64>,
+ model: String,
+ #[serde(skip_serializing_if = "Option::is_none")]
+ input_tokens: Option<u64>,
+ #[serde(skip_serializing_if = "Option::is_none")]
+ output_tokens: Option<u64>,
+ #[serde(skip_serializing_if = "Option::is_none")]
+ cache_creation_input_tokens: Option<u64>,
+ #[serde(skip_serializing_if = "Option::is_none")]
+ cache_read_input_tokens: Option<u64>,
+}
+
+const EXIT_OK: i32 = 0;
+const EXIT_ERROR: i32 = 1;
+const EXIT_TIMEOUT: i32 = 2;
+const EXIT_INTERRUPTED: i32 = 3;
+
+static TERMINATED: AtomicBool = AtomicBool::new(false);
+
+fn main() {
+ let args = Args::parse();
+
+ if args.printenv {
+ util::shell_env::print_env();
+ return;
+ }
+
+ env_logger::init();
+
+ ctrlc::set_handler(|| {
+ TERMINATED.store(true, Ordering::SeqCst);
+ })
+ .expect("failed to set signal handler");
+
+ let instruction = read_instruction(&args).unwrap_or_else(|e| {
+ eprintln!("Error reading instruction: {e}");
+ process::exit(EXIT_ERROR);
+ });
+
+ let workdir = args.workdir.canonicalize().unwrap_or_else(|e| {
+ eprintln!("Invalid --workdir {:?}: {e}", args.workdir);
+ process::exit(EXIT_ERROR);
+ });
+
+ let output_dir = args.output_dir.clone();
+ if let Err(e) = std::fs::create_dir_all(&output_dir) {
+ eprintln!("Error creating output dir {}: {e}", output_dir.display());
+ process::exit(EXIT_ERROR);
+ }
+
+ let http_client = Arc::new(reqwest_client::ReqwestClient::new());
+ let app = gpui_platform::headless().with_http_client(http_client);
+
+ app.run(move |cx| {
+ let app_state = headless::init(cx);
+ cx.set_staff(true);
+
+ let auth_tasks = LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
+ registry
+ .providers()
+ .iter()
+ .map(|p| p.authenticate(cx))
+ .collect::<Vec<_>>()
+ });
+
+ let model_name = args.model.clone();
+ let timeout = args.timeout;
+
+ cx.spawn(async move |cx| {
+ futures::future::join_all(auth_tasks).await;
+
+ let start = Instant::now();
+
+ let (outcome, token_usage) = run_agent(
+ &app_state,
+ &workdir,
+ &instruction,
+ &model_name,
+ timeout,
+ Some(&output_dir),
+ cx,
+ )
+ .await;
+
+ let duration = start.elapsed();
+
+ let (status, error, exit_code) = match &outcome {
+ Ok(AgentOutcome::Completed) => ("completed".to_string(), None, EXIT_OK),
+ Ok(AgentOutcome::Timeout { seconds }) => {
+ eprintln!("Timeout: agent exceeded {seconds}s time limit");
+ ("timeout".to_string(), None, EXIT_TIMEOUT)
+ }
+ Ok(AgentOutcome::Interrupted) => {
+ eprintln!("Interrupted: received SIGTERM, saved partial output");
+ ("interrupted".to_string(), None, EXIT_INTERRUPTED)
+ }
+ Err(e) => {
+ eprintln!("Error: {e:#}");
+ ("error".to_string(), Some(format!("{e:#}")), EXIT_ERROR)
+ }
+ };
+
+ let result = EvalResult {
+ status,
+ error,
+ duration_secs: duration.as_secs_f64(),
+ timeout_secs: timeout,
+ model: model_name.clone(),
+ input_tokens: token_usage.as_ref().map(|u| u.input_tokens),
+ output_tokens: token_usage.as_ref().map(|u| u.output_tokens),
+ cache_creation_input_tokens: token_usage
+ .as_ref()
+ .filter(|u| u.cache_creation_input_tokens > 0)
+ .map(|u| u.cache_creation_input_tokens),
+ cache_read_input_tokens: token_usage
+ .as_ref()
+ .filter(|u| u.cache_read_input_tokens > 0)
+ .map(|u| u.cache_read_input_tokens),
+ };
+
+ match serde_json::to_string_pretty(&result) {
+ Ok(json) => {
+ if let Err(e) = std::fs::write(output_dir.join("result.json"), &json) {
+ eprintln!("Error writing result.json: {e:#}");
+ }
+ eprintln!("[eval-cli] result: {json}");
+ }
+ Err(e) => eprintln!("Error serializing result: {e:#}"),
+ }
+
+ cx.update(|cx| cx.quit());
+ process::exit(exit_code);
+ })
+ .detach();
+ });
+}
+
+fn read_instruction(args: &Args) -> Result<String> {
+ let text = if let Some(text) = &args.instruction {
+ text.clone()
+ } else {
+ use std::io::Read;
+ let mut buf = String::new();
+ std::io::stdin()
+ .read_to_string(&mut buf)
+ .context("reading instruction from stdin")?;
+ buf
+ };
+ anyhow::ensure!(!text.trim().is_empty(), "instruction is empty");
+ Ok(text)
+}
+
+async fn run_agent(
+ app_state: &Arc<AgentCliAppState>,
+ workdir: &std::path::Path,
+ instruction: &str,
+ model_name: &str,
+ timeout: Option<u64>,
+ output_dir: Option<&std::path::Path>,
+ cx: &mut AsyncApp,
+) -> (Result<AgentOutcome>, Option<language_model::TokenUsage>) {
+ let setup_result: Result<()> = cx.update(|cx| {
+ let selected = SelectedModel::from_str(model_name).map_err(|e| anyhow::anyhow!("{e}"))?;
+ let registry = LanguageModelRegistry::global(cx);
+ let model = registry
+ .read(cx)
+ .available_models(cx)
+ .find(|m| m.id() == selected.model && m.provider_id() == selected.provider)
+ .ok_or_else(|| {
+ let available = registry
+ .read(cx)
+ .available_models(cx)
+ .map(|m| format!("{}/{}", m.provider_id().0, m.id().0))
+ .collect::<Vec<_>>()
+ .join(", ");
+ anyhow::anyhow!("Model {model_name} not found. Available: {available}")
+ })?;
+
+ let supports_thinking = model.supports_thinking();
+
+ registry.update(cx, |registry, cx| {
+ registry.set_default_model(
+ Some(language_model::ConfiguredModel {
+ provider: registry
+ .provider(&model.provider_id())
+ .context("Provider not found")?,
+ model,
+ }),
+ cx,
+ );
+ anyhow::Ok(())
+ })?;
+
+ let (enable_thinking, effort) = if supports_thinking {
+ (true, "\"high\"")
+ } else {
+ (false, "null")
+ };
+ let provider_id = selected.provider.0.to_string();
+ let model_id = selected.model.0.to_string();
+ SettingsStore::update_global(cx, |store, cx| {
+ let settings = format!(
+ r#"{{
+ "agent": {{
+ "tool_permissions": {{"default": "allow"}},
+ "default_model": {{
+ "provider": "{provider_id}",
+ "model": "{model_id}",
+ "enable_thinking": {enable_thinking},
+ "effort": {effort}
+ }}
+ }},
+ "autosave": "off",
+ "format_on_save": "off"
+ }}"
+ "#
+ );
+ store.set_user_settings(&settings, cx).ok();
+ });
+
+ anyhow::Ok(())
+ });
+
+ if let Err(e) = setup_result {
+ return (Err(e), None);
+ }
+
+ let project = cx.update(|cx| {
+ Project::local(
+ app_state.client.clone(),
+ app_state.node_runtime.clone(),
+ app_state.user_store.clone(),
+ app_state.languages.clone(),
+ app_state.fs.clone(),
+ None,
+ project::LocalProjectFlags {
+ init_worktree_trust: false,
+ ..Default::default()
+ },
+ cx,
+ )
+ });
+
+ let worktree = project.update(cx, |project, cx| project.create_worktree(workdir, true, cx));
+ let worktree = match worktree.await {
+ Ok(w) => w,
+ Err(e) => return (Err(e).context("creating worktree"), None),
+ };
+
+ let scan_result = worktree.update(cx, |tree, _cx| {
+ tree.as_local()
+ .context("expected local worktree")
+ .map(|local| local.scan_complete())
+ });
+ match scan_result {
+ Ok(future) => future.await,
+ Err(e) => return (Err(e), None),
+ };
+
+ let thread_store = cx.new(|cx| ThreadStore::new(cx));
+ let agent = match NativeAgent::new(
+ project.clone(),
+ thread_store,
+ Templates::new(),
+ None,
+ app_state.fs.clone(),
+ cx,
+ )
+ .await
+ {
+ Ok(a) => a,
+ Err(e) => return (Err(e).context("creating agent"), None),
+ };
+
+ let connection = Rc::new(NativeAgentConnection(agent.clone()));
+ let acp_thread = match cx
+ .update(|cx| connection.clone().new_session(project, workdir, cx))
+ .await
+ {
+ Ok(t) => t,
+ Err(e) => return (Err(e).context("creating ACP session"), None),
+ };
+
+ let _subscription = cx.subscribe(&acp_thread, |acp_thread, event, cx| {
+ log_acp_thread_event(&acp_thread, event, cx);
+ });
+
+ let message = vec![acp::ContentBlock::Text(acp::TextContent::new(
+ instruction.to_string(),
+ ))];
+
+ let send_future = acp_thread.update(cx, |acp_thread: &mut acp_thread::AcpThread, cx| {
+ acp_thread.send(message, cx)
+ });
+
+ let timeout_future = if let Some(timeout_secs) = timeout {
+ futures::future::Either::Left(
+ cx.background_executor()
+ .timer(Duration::from_secs(timeout_secs)),
+ )
+ } else {
+ futures::future::Either::Right(futures::future::pending::<()>())
+ };
+
+ let sigterm_future = {
+ let executor = cx.background_executor().clone();
+ async move {
+ while !TERMINATED.load(Ordering::Relaxed) {
+ executor.timer(Duration::from_millis(100)).await;
+ }
+ }
+ };
+
+ let outcome = select_biased! {
+ result = send_future.fuse() => match result {
+ Ok(Some(response)) => {
+ eprintln!("[eval-cli] stopped: {:?}", response.stop_reason);
+ if response.stop_reason == acp::StopReason::MaxTokens {
+ Err(anyhow::anyhow!("Model hit maximum token limit"))
+ } else {
+ Ok(AgentOutcome::Completed)
+ }
+ }
+ Ok(None) => {
+ eprintln!("[eval-cli] completed (no response)");
+ Ok(AgentOutcome::Completed)
+ }
+ Err(e) => Err(e).context("agent run failed"),
+ },
+ _ = sigterm_future.fuse() => {
+ eprintln!("[eval-cli] received SIGTERM, cancelling...");
+ acp_thread.update(cx, |t: &mut acp_thread::AcpThread, cx| t.cancel(cx)).await;
+ Ok(AgentOutcome::Interrupted)
+ },
+ _ = timeout_future.fuse() => {
+ acp_thread.update(cx, |t: &mut acp_thread::AcpThread, cx| t.cancel(cx)).await;
+ Ok(AgentOutcome::Timeout { seconds: timeout.unwrap_or(0) })
+ }
+ };
+
+ let thread = cx.update(|cx| {
+ let session_id = acp_thread.read(cx).session_id().clone();
+ connection.thread(&session_id, cx)
+ });
+
+ let cumulative_usage = if let Some(thread) = &thread {
+ let db_thread = thread.read_with(cx, |thread, cx| thread.to_db(cx));
+ let db_thread = db_thread.await;
+ let usage = db_thread.cumulative_token_usage;
+ if usage.input_tokens > 0 || usage.output_tokens > 0 {
+ Some(usage)
+ } else {
+ None
+ }
+ } else {
+ None
+ };
+
+ let acp_usage = cx.update(|cx| {
+ acp_thread
+ .read(cx)
+ .token_usage()
+ .map(|usage| language_model::TokenUsage {
+ input_tokens: usage.input_tokens,
+ output_tokens: usage.output_tokens,
+ ..Default::default()
+ })
+ });
+
+ let final_usage = cumulative_usage.or(acp_usage);
+
+ if let (Some(thread), Some(dir)) = (&thread, output_dir) {
+ let markdown = thread.read_with(cx, |thread, _cx| thread.to_markdown());
+ if let Err(e) = std::fs::write(dir.join("thread.md"), markdown) {
+ eprintln!("Error writing thread.md: {e:#}");
+ }
+
+ let db_thread = thread.read_with(cx, |thread, cx| thread.to_db(cx));
+ let db_thread = db_thread.await;
+ match serde_json::to_string_pretty(&db_thread) {
+ Ok(json) => {
+ if let Err(e) = std::fs::write(dir.join("thread.json"), json) {
+ eprintln!("Error writing thread.json: {e:#}");
+ }
+ }
+ Err(e) => eprintln!("Error serializing thread.json: {e:#}"),
+ }
+ }
+
+ (outcome, final_usage)
+}
+
+fn log_acp_thread_event(
+ acp_thread: &Entity<acp_thread::AcpThread>,
+ event: &acp_thread::AcpThreadEvent,
+ cx: &mut gpui::App,
+) {
+ match event {
+ acp_thread::AcpThreadEvent::NewEntry => {
+ let entries = acp_thread.read(cx).entries();
+ if let Some(acp_thread::AgentThreadEntry::AssistantMessage(message)) = entries.last() {
+ for chunk in &message.chunks {
+ if let acp_thread::AssistantMessageChunk::Message { block } = chunk {
+ if let acp_thread::ContentBlock::Markdown { markdown } = block {
+ let text = markdown.read(cx).source().to_string();
+ if !text.is_empty() {
+ eprint!("{text}");
+ }
+ }
+ }
+ }
+ }
+ }
+ acp_thread::AcpThreadEvent::EntryUpdated(index) => {
+ let entries = acp_thread.read(cx).entries();
+ if let Some(acp_thread::AgentThreadEntry::ToolCall(tool_call)) = entries.get(*index) {
+ if let Some(name) = &tool_call.tool_name {
+ match &tool_call.status {
+ acp_thread::ToolCallStatus::Completed => {
+ eprintln!("[tool] {name} ✓");
+ }
+ acp_thread::ToolCallStatus::Failed => {
+ eprintln!("[tool] {name} ✗");
+ }
+ acp_thread::ToolCallStatus::Rejected => {
+ eprintln!("[tool] {name} rejected");
+ }
+ acp_thread::ToolCallStatus::Canceled => {
+ eprintln!("[tool] {name} canceled");
+ }
+ _ => {}
+ }
+ }
+ }
+ }
+ acp_thread::AcpThreadEvent::Stopped(reason) => {
+ eprintln!("\n[eval-cli] stopped: {reason:?}");
+ }
+ acp_thread::AcpThreadEvent::Error => {
+ eprintln!("[eval-cli] error event");
+ }
+ acp_thread::AcpThreadEvent::Retry(status) => {
+ eprintln!("[eval-cli] retry: {status:?}");
+ }
+ acp_thread::AcpThreadEvent::SubagentSpawned(session_id) => {
+ eprintln!("[eval-cli] subagent spawned: {session_id}");
+ }
+ _ => {}
+ }
+}
@@ -0,0 +1,3 @@
+from zed_eval.agent import ZedAgent
+
+__all__ = ["ZedAgent"]
@@ -0,0 +1,161 @@
+"""Harbor agent wrapper for Zed's eval-cli binary.
+
+Usage:
+ # Build eval-cli locally first:
+ cargo build --release -p eval_cli
+
+ # Run via Harbor with a local binary:
+ harbor run -d "dataset@version" \
+ --agent-import-path zed_eval.agent:ZedAgent \
+        --ak binary_path=/path/to/target/release/eval-cli \
+ --agent-model anthropic/claude-sonnet-4-6-latest
+
+ # Or with a download URL (for CI):
+ harbor run -d "dataset@version" \
+ --agent-import-path zed_eval.agent:ZedAgent \
+        --ak download_url=https://example.com/eval-cli \
+ --agent-model anthropic/claude-sonnet-4-6-latest
+"""
+
+import json
+import os
+import shlex
+from pathlib import Path
+
+from harbor.agents.installed.base import BaseInstalledAgent, ExecInput
+from harbor.environments.base import BaseEnvironment
+from harbor.models.agent.context import AgentContext
+
+
+class ZedAgent(BaseInstalledAgent):
+ """Runs Zed's headless AI agent (eval-cli) to solve tasks.
+
+ The eval-cli binary boots a headless GPUI application and uses the same
+ NativeAgent + AcpThread pipeline as the production Zed editor, driving
+ the full agentic loop (tool calls, subagents, retries) without a GUI.
+ """
+
+ def __init__(
+ self,
+ logs_dir: Path,
+ binary_path: str | None = None,
+ download_url: str | None = None,
+ *args,
+ **kwargs,
+ ):
+ super().__init__(logs_dir, *args, **kwargs)
+ self._binary_path = binary_path
+ self._download_url = download_url or os.environ.get("EVAL_CLI_DOWNLOAD_URL")
+
+ @staticmethod
+ def name() -> str:
+ return "zed"
+
+ @property
+ def _install_agent_template_path(self) -> Path:
+ return Path(__file__).parent / "install.sh.j2"
+
+ async def setup(self, environment: BaseEnvironment) -> None:
+ await environment.exec(command="mkdir -p /installed-agent")
+
+ if self._binary_path:
+ binary = Path(self._binary_path)
+ if not binary.exists():
+ raise FileNotFoundError(
+ f"eval-cli binary not found at {binary}. "
+ "Build it with: cargo build --release -p eval_cli"
+ )
+ await environment.upload_file(
+ source_path=binary,
+ target_path="/usr/local/bin/eval-cli",
+ )
+ await environment.exec(command="chmod +x /usr/local/bin/eval-cli")
+
+ await super().setup(environment)
+
+ @property
+ def _template_variables(self) -> dict[str, str]:
+ variables = super()._template_variables
+ if self._binary_path:
+ variables["binary_uploaded"] = "true"
+ if self._download_url:
+ variables["download_url"] = self._download_url
+ return variables
+
+ def populate_context_post_run(self, context: AgentContext) -> None:
+ result_data = None
+ for json_file in self.logs_dir.rglob("result.json"):
+ try:
+ result_data = json.loads(json_file.read_text())
+ break
+ except (json.JSONDecodeError, OSError):
+ continue
+
+ if result_data is None:
+ self.logger.warning("Could not find or parse result.json from eval-cli")
+ return
+
+ if result_data.get("input_tokens") is not None:
+ context.n_input_tokens = result_data["input_tokens"]
+ if result_data.get("output_tokens") is not None:
+ context.n_output_tokens = result_data["output_tokens"]
+ if result_data.get("cache_read_input_tokens") is not None:
+ context.n_cache_tokens = result_data["cache_read_input_tokens"]
+
+ context.metadata = {
+ "status": result_data.get("status"),
+ "duration_secs": result_data.get("duration_secs"),
+ "model": result_data.get("model"),
+ }
+
+ def _get_api_env(self) -> dict[str, str]:
+ env: dict[str, str] = {}
+ if not self.model_name or "/" not in self.model_name:
+ return env
+
+ provider = self.model_name.split("/", 1)[0]
+ provider_env_map = {
+ "anthropic": "ANTHROPIC_API_KEY",
+ "openai": "OPENAI_API_KEY",
+ "google": "GEMINI_API_KEY",
+ "gemini": "GEMINI_API_KEY",
+ "deepseek": "DEEPSEEK_API_KEY",
+ "mistral": "MISTRAL_API_KEY",
+ }
+
+ env_var = provider_env_map.get(provider)
+ if env_var:
+ api_key = os.environ.get(env_var, "")
+ if api_key:
+ env[env_var] = api_key
+
+ return env
+
+ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
+ escaped_instruction = shlex.quote(instruction)
+ env = self._get_api_env()
+
+ parts = ["eval-cli", "--workdir /testbed", "--output-dir /logs/agent"]
+
+ if self.model_name:
+ parts.append(f"--model {self.model_name}")
+
+ timeout = self._extra_env.get("EVAL_CLI_TIMEOUT")
+ if timeout:
+ parts.append(f"--timeout {timeout}")
+
+ parts.append(f"--instruction {escaped_instruction}")
+
+ eval_cli_command = " ".join(parts) + " 2>&1 | stdbuf -oL tee /logs/agent/eval-cli.txt"
+
+ patch_command = (
+ "cd /testbed && "
+ "git add -A && "
+ "git diff --cached HEAD > /logs/agent/patch.diff && "
+ "echo \"Patch size: $(wc -c < /logs/agent/patch.diff) bytes\""
+ )
+
+ return [
+ ExecInput(command=eval_cli_command, env=env),
+ ExecInput(command=patch_command),
+ ]
@@ -0,0 +1,49 @@
+#!/bin/bash
+set -euo pipefail
+
+# Install runtime dependencies needed by the eval-cli binary (dynamically linked
+# against glibc + these shared libraries from its GPUI/terminal/language stacks).
+apt-get update
+apt-get install -y --no-install-recommends \
+ ca-certificates \
+ curl \
+ git \
+ libasound2 \
+ libfontconfig1 \
+ libglib2.0-0 \
+ libsqlite3-0 \
+ libssl3 \
+ libwayland-client0 \
+ libx11-xcb1 \
+ libxkbcommon-x11-0 \
+ libzstd1
+
+# Install Node.js 22 LTS (needed by language servers like basedpyright).
+curl -fsSL https://deb.nodesource.com/setup_22.x | bash -
+apt-get install -y --no-install-recommends nodejs
+
+# Install uv (needed for running Python tests in SWE-bench tasks).
+curl -LsSf https://astral.sh/uv/install.sh | sh
+. "$HOME/.local/bin/env"
+ln -sf "$HOME/.local/bin/uv" /usr/local/bin/uv
+ln -sf "$HOME/.local/bin/uvx" /usr/local/bin/uvx
+
+{% if binary_uploaded is defined %}
+# Binary was uploaded directly via setup() — just verify it works.
+eval-cli --help
+{% elif download_url is defined %}
+curl -fsSL "{{ download_url }}" -o /usr/local/bin/eval-cli
+chmod +x /usr/local/bin/eval-cli
+eval-cli --help
+{% else %}
+echo "ERROR: No eval-cli binary provided."
+echo ""
+echo "Either pass binary_path= to upload a local build:"
+echo " --ae binary_path=/path/to/target/release/eval-cli"
+echo ""
+echo "Or set download_url= / EVAL_CLI_DOWNLOAD_URL:"
+echo " --ae download_url=https://example.com/eval-cli"
+exit 1
+{% endif %}
+
+echo "INSTALL_SUCCESS"
@@ -0,0 +1,10 @@
+[project]
+name = "zed-eval"
+version = "0.1.0"
+description = "Harbor agent wrapper for Zed's eval-cli"
+requires-python = ">=3.12"
+dependencies = ["harbor"]
+
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"