1//! Headless CLI binary for running Zed's agent in evaluation/benchmark environments.
2//!
3//! Designed to work inside containerized environments (like Harbor/termbench) where:
4//! - The repository is already checked out at the working directory
5//! - The model API key is provided via environment variables
6//! - Results are written to an output directory (default: `/logs/agent/`)
7//!
8//! ## Usage
9//!
10//! ```text
11//! eval-cli --workdir /testbed --model anthropic/claude-sonnet-4-6-latest \
12//! --instruction "Fix the bug described in..." --timeout 600
13//! ```
14//!
15//! ## Output
16//!
17//! Writes to `--output-dir` (default `/logs/agent/`):
18//! - `result.json` — structured result with status, timing, and token usage
19//! - `thread.md` — full conversation as markdown
20//! - `thread.json` — raw thread state as JSON
21//!
22//! ## Exit codes
23//!
24//! | Code | Meaning |
25//! |------|---------|
26//! | 0 | Agent finished |
27//! | 1 | Error (model/auth/runtime failure) |
28//! | 2 | Timeout |
29//! | 3 | Interrupted (SIGTERM/SIGINT) |
30
31mod headless;
32
33use std::path::PathBuf;
34use std::process;
35use std::rc::Rc;
36use std::str::FromStr;
37use std::sync::Arc;
38use std::sync::atomic::{AtomicBool, Ordering};
39use std::time::{Duration, Instant};
40
41use acp_thread::AgentConnection as _;
42use agent::{NativeAgent, NativeAgentConnection, Templates, ThreadStore};
43use agent_client_protocol as acp;
44use anyhow::{Context, Result};
45use clap::Parser;
46use feature_flags::FeatureFlagAppExt as _;
47
48use futures::{FutureExt, select_biased};
49use gpui::{AppContext as _, AsyncApp, Entity, UpdateGlobal};
50use language_model::{LanguageModelRegistry, SelectedModel};
51use project::Project;
52use settings::SettingsStore;
53use util::path_list::PathList;
54
55use crate::headless::AgentCliAppState;
56
57#[derive(Parser, Debug)]
58#[command(
59 name = "eval-cli",
60 about = "Run Zed's agent headlessly in evaluation/benchmark environments"
61)]
62struct Args {
63 /// Output current environment variables as JSON to stdout.
64 /// Used internally by Zed's shell environment capture.
65 #[arg(long, hide = true)]
66 printenv: bool,
67
68 /// Path to the repository working directory. Defaults to the current directory.
69 #[arg(long, default_value = ".")]
70 workdir: PathBuf,
71
72 /// Instruction/prompt text. If omitted, read from --instruction-file or stdin.
73 #[arg(long)]
74 instruction: Option<String>,
75
76 /// Language model to use, in `provider/model` format.
77 #[arg(long, default_value = "anthropic/claude-sonnet-4-6-latest")]
78 model: String,
79
80 /// Maximum wall-clock time in seconds for the agent run.
81 #[arg(long)]
82 timeout: Option<u64>,
83
84 /// Directory for output artifacts (result.json, thread.md, thread.json).
85 #[arg(long, default_value = "/logs/agent")]
86 output_dir: PathBuf,
87}
88
/// How an agent run ended when it did not fail with an error.
///
/// Errors are carried separately, as the `Err` side of the `Result` that wraps
/// this enum.
enum AgentOutcome {
    /// The prompt was sent and the agent's turn finished.
    Completed,
    /// The configured time limit elapsed first; `seconds` is that limit.
    Timeout { seconds: u64 },
    /// The process-wide termination flag was observed before the turn finished.
    Interrupted,
}
94
95#[derive(serde::Serialize)]
96struct EvalResult {
97 status: String,
98 #[serde(skip_serializing_if = "Option::is_none")]
99 error: Option<String>,
100 duration_secs: f64,
101 #[serde(skip_serializing_if = "Option::is_none")]
102 timeout_secs: Option<u64>,
103 model: String,
104 #[serde(skip_serializing_if = "Option::is_none")]
105 input_tokens: Option<u64>,
106 #[serde(skip_serializing_if = "Option::is_none")]
107 output_tokens: Option<u64>,
108 #[serde(skip_serializing_if = "Option::is_none")]
109 cache_creation_input_tokens: Option<u64>,
110 #[serde(skip_serializing_if = "Option::is_none")]
111 cache_read_input_tokens: Option<u64>,
112}
113
/// Exit code: the agent finished its turn.
const EXIT_OK: i32 = 0;
/// Exit code: the run failed (bad arguments, setup failure, or agent error).
const EXIT_ERROR: i32 = 1;
/// Exit code: the `--timeout` limit was reached.
const EXIT_TIMEOUT: i32 = 2;
/// Exit code: the run was interrupted by a signal.
const EXIT_INTERRUPTED: i32 = 3;

/// Set to `true` by the signal handler installed in `main`; polled by the
/// agent run so it can cancel and still write partial output.
static TERMINATED: AtomicBool = AtomicBool::new(false);
120
121fn main() {
122 let args = Args::parse();
123
124 if args.printenv {
125 util::shell_env::print_env();
126 return;
127 }
128
129 env_logger::init();
130
131 ctrlc::set_handler(|| {
132 TERMINATED.store(true, Ordering::SeqCst);
133 })
134 .expect("failed to set signal handler");
135
136 let instruction = read_instruction(&args).unwrap_or_else(|e| {
137 eprintln!("Error reading instruction: {e}");
138 process::exit(EXIT_ERROR);
139 });
140
141 let workdir = args.workdir.canonicalize().unwrap_or_else(|e| {
142 eprintln!("Invalid --workdir {:?}: {e}", args.workdir);
143 process::exit(EXIT_ERROR);
144 });
145
146 let output_dir = args.output_dir.clone();
147 if let Err(e) = std::fs::create_dir_all(&output_dir) {
148 eprintln!("Error creating output dir {}: {e}", output_dir.display());
149 process::exit(EXIT_ERROR);
150 }
151
152 let http_client = Arc::new(reqwest_client::ReqwestClient::new());
153 let app = gpui_platform::headless().with_http_client(http_client);
154
155 app.run(move |cx| {
156 let app_state = headless::init(cx);
157 cx.set_staff(true);
158
159 let auth_tasks = LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
160 registry
161 .providers()
162 .iter()
163 .map(|p| p.authenticate(cx))
164 .collect::<Vec<_>>()
165 });
166
167 let model_name = args.model.clone();
168 let timeout = args.timeout;
169
170 cx.spawn(async move |cx| {
171 futures::future::join_all(auth_tasks).await;
172
173 let start = Instant::now();
174
175 let (outcome, token_usage) = run_agent(
176 &app_state,
177 &workdir,
178 &instruction,
179 &model_name,
180 timeout,
181 Some(&output_dir),
182 cx,
183 )
184 .await;
185
186 let duration = start.elapsed();
187
188 let (status, error, exit_code) = match &outcome {
189 Ok(AgentOutcome::Completed) => ("completed".to_string(), None, EXIT_OK),
190 Ok(AgentOutcome::Timeout { seconds }) => {
191 eprintln!("Timeout: agent exceeded {seconds}s time limit");
192 ("timeout".to_string(), None, EXIT_TIMEOUT)
193 }
194 Ok(AgentOutcome::Interrupted) => {
195 eprintln!("Interrupted: received SIGTERM, saved partial output");
196 ("interrupted".to_string(), None, EXIT_INTERRUPTED)
197 }
198 Err(e) => {
199 eprintln!("Error: {e:#}");
200 ("error".to_string(), Some(format!("{e:#}")), EXIT_ERROR)
201 }
202 };
203
204 let result = EvalResult {
205 status,
206 error,
207 duration_secs: duration.as_secs_f64(),
208 timeout_secs: timeout,
209 model: model_name.clone(),
210 input_tokens: token_usage.as_ref().map(|u| u.input_tokens),
211 output_tokens: token_usage.as_ref().map(|u| u.output_tokens),
212 cache_creation_input_tokens: token_usage
213 .as_ref()
214 .filter(|u| u.cache_creation_input_tokens > 0)
215 .map(|u| u.cache_creation_input_tokens),
216 cache_read_input_tokens: token_usage
217 .as_ref()
218 .filter(|u| u.cache_read_input_tokens > 0)
219 .map(|u| u.cache_read_input_tokens),
220 };
221
222 match serde_json::to_string_pretty(&result) {
223 Ok(json) => {
224 if let Err(e) = std::fs::write(output_dir.join("result.json"), &json) {
225 eprintln!("Error writing result.json: {e:#}");
226 }
227 eprintln!("[eval-cli] result: {json}");
228 }
229 Err(e) => eprintln!("Error serializing result: {e:#}"),
230 }
231
232 cx.update(|cx| cx.quit());
233 process::exit(exit_code);
234 })
235 .detach();
236 });
237}
238
239fn read_instruction(args: &Args) -> Result<String> {
240 let text = if let Some(text) = &args.instruction {
241 text.clone()
242 } else {
243 use std::io::Read;
244 let mut buf = String::new();
245 std::io::stdin()
246 .read_to_string(&mut buf)
247 .context("reading instruction from stdin")?;
248 buf
249 };
250 anyhow::ensure!(!text.trim().is_empty(), "instruction is empty");
251 Ok(text)
252}
253
254async fn run_agent(
255 app_state: &Arc<AgentCliAppState>,
256 workdir: &std::path::Path,
257 instruction: &str,
258 model_name: &str,
259 timeout: Option<u64>,
260 output_dir: Option<&std::path::Path>,
261 cx: &mut AsyncApp,
262) -> (Result<AgentOutcome>, Option<language_model::TokenUsage>) {
263 let setup_result: Result<()> = cx.update(|cx| {
264 let selected = SelectedModel::from_str(model_name).map_err(|e| anyhow::anyhow!("{e}"))?;
265 let registry = LanguageModelRegistry::global(cx);
266 let model = registry
267 .read(cx)
268 .available_models(cx)
269 .find(|m| m.id() == selected.model && m.provider_id() == selected.provider)
270 .ok_or_else(|| {
271 let available = registry
272 .read(cx)
273 .available_models(cx)
274 .map(|m| format!("{}/{}", m.provider_id().0, m.id().0))
275 .collect::<Vec<_>>()
276 .join(", ");
277 anyhow::anyhow!("Model {model_name} not found. Available: {available}")
278 })?;
279
280 let supports_thinking = model.supports_thinking();
281
282 registry.update(cx, |registry, cx| {
283 registry.set_default_model(
284 Some(language_model::ConfiguredModel {
285 provider: registry
286 .provider(&model.provider_id())
287 .context("Provider not found")?,
288 model,
289 }),
290 cx,
291 );
292 anyhow::Ok(())
293 })?;
294
295 let (enable_thinking, effort) = if supports_thinking {
296 (true, "\"high\"")
297 } else {
298 (false, "null")
299 };
300 let provider_id = selected.provider.0.to_string();
301 let model_id = selected.model.0.to_string();
302 SettingsStore::update_global(cx, |store, cx| {
303 let settings = format!(
304 r#"{{
305 "agent": {{
306 "tool_permissions": {{"default": "allow"}},
307 "default_model": {{
308 "provider": "{provider_id}",
309 "model": "{model_id}",
310 "enable_thinking": {enable_thinking},
311 "effort": {effort}
312 }}
313 }},
314 "autosave": "off",
315 "format_on_save": "off"
316 }}"
317 "#
318 );
319 store.set_user_settings(&settings, cx).ok();
320 });
321
322 anyhow::Ok(())
323 });
324
325 if let Err(e) = setup_result {
326 return (Err(e), None);
327 }
328
329 let project = cx.update(|cx| {
330 Project::local(
331 app_state.client.clone(),
332 app_state.node_runtime.clone(),
333 app_state.user_store.clone(),
334 app_state.languages.clone(),
335 app_state.fs.clone(),
336 None,
337 project::LocalProjectFlags {
338 init_worktree_trust: false,
339 ..Default::default()
340 },
341 cx,
342 )
343 });
344
345 let worktree = project.update(cx, |project, cx| project.create_worktree(workdir, true, cx));
346 let worktree = match worktree.await {
347 Ok(w) => w,
348 Err(e) => return (Err(e).context("creating worktree"), None),
349 };
350
351 let scan_result = worktree.update(cx, |tree, _cx| {
352 tree.as_local()
353 .context("expected local worktree")
354 .map(|local| local.scan_complete())
355 });
356 match scan_result {
357 Ok(future) => future.await,
358 Err(e) => return (Err(e), None),
359 };
360
361 let agent = cx.update(|cx| {
362 let thread_store = cx.new(|cx| ThreadStore::new(cx));
363 NativeAgent::new(
364 thread_store,
365 Templates::new(),
366 None,
367 app_state.fs.clone(),
368 cx,
369 )
370 });
371
372 let connection = Rc::new(NativeAgentConnection(agent.clone()));
373 let acp_thread = match cx
374 .update(|cx| {
375 connection
376 .clone()
377 .new_session(project, PathList::new(&[workdir]), cx)
378 })
379 .await
380 {
381 Ok(t) => t,
382 Err(e) => return (Err(e).context("creating ACP session"), None),
383 };
384
385 let _subscription = cx.subscribe(&acp_thread, |acp_thread, event, cx| {
386 log_acp_thread_event(&acp_thread, event, cx);
387 });
388
389 let message = vec![acp::ContentBlock::Text(acp::TextContent::new(
390 instruction.to_string(),
391 ))];
392
393 let send_future = acp_thread.update(cx, |acp_thread: &mut acp_thread::AcpThread, cx| {
394 acp_thread.send(message, cx)
395 });
396
397 let timeout_future = if let Some(timeout_secs) = timeout {
398 futures::future::Either::Left(
399 cx.background_executor()
400 .timer(Duration::from_secs(timeout_secs)),
401 )
402 } else {
403 futures::future::Either::Right(futures::future::pending::<()>())
404 };
405
406 let sigterm_future = {
407 let executor = cx.background_executor().clone();
408 async move {
409 while !TERMINATED.load(Ordering::Relaxed) {
410 executor.timer(Duration::from_millis(100)).await;
411 }
412 }
413 };
414
415 let outcome = select_biased! {
416 result = send_future.fuse() => match result {
417 Ok(Some(response)) => {
418 eprintln!("[eval-cli] stopped: {:?}", response.stop_reason);
419 if response.stop_reason == acp::StopReason::MaxTokens {
420 Err(anyhow::anyhow!("Model hit maximum token limit"))
421 } else {
422 Ok(AgentOutcome::Completed)
423 }
424 }
425 Ok(None) => {
426 eprintln!("[eval-cli] completed (no response)");
427 Ok(AgentOutcome::Completed)
428 }
429 Err(e) => Err(e).context("agent run failed"),
430 },
431 _ = sigterm_future.fuse() => {
432 eprintln!("[eval-cli] received SIGTERM, cancelling...");
433 acp_thread.update(cx, |t: &mut acp_thread::AcpThread, cx| t.cancel(cx)).await;
434 Ok(AgentOutcome::Interrupted)
435 },
436 _ = timeout_future.fuse() => {
437 acp_thread.update(cx, |t: &mut acp_thread::AcpThread, cx| t.cancel(cx)).await;
438 Ok(AgentOutcome::Timeout { seconds: timeout.unwrap_or(0) })
439 }
440 };
441
442 let thread = cx.update(|cx| {
443 let session_id = acp_thread.read(cx).session_id().clone();
444 connection.thread(&session_id, cx)
445 });
446
447 let cumulative_usage = if let Some(thread) = &thread {
448 let db_thread = thread.read_with(cx, |thread, cx| thread.to_db(cx));
449 let db_thread = db_thread.await;
450 let usage = db_thread.cumulative_token_usage;
451 if usage.input_tokens > 0 || usage.output_tokens > 0 {
452 Some(usage)
453 } else {
454 None
455 }
456 } else {
457 None
458 };
459
460 let acp_usage = cx.update(|cx| {
461 acp_thread
462 .read(cx)
463 .token_usage()
464 .map(|usage| language_model::TokenUsage {
465 input_tokens: usage.input_tokens,
466 output_tokens: usage.output_tokens,
467 ..Default::default()
468 })
469 });
470
471 let final_usage = cumulative_usage.or(acp_usage);
472
473 if let (Some(thread), Some(dir)) = (&thread, output_dir) {
474 let markdown = thread.read_with(cx, |thread, _cx| thread.to_markdown());
475 if let Err(e) = std::fs::write(dir.join("thread.md"), markdown) {
476 eprintln!("Error writing thread.md: {e:#}");
477 }
478
479 let db_thread = thread.read_with(cx, |thread, cx| thread.to_db(cx));
480 let db_thread = db_thread.await;
481 match serde_json::to_string_pretty(&db_thread) {
482 Ok(json) => {
483 if let Err(e) = std::fs::write(dir.join("thread.json"), json) {
484 eprintln!("Error writing thread.json: {e:#}");
485 }
486 }
487 Err(e) => eprintln!("Error serializing thread.json: {e:#}"),
488 }
489 }
490
491 (outcome, final_usage)
492}
493
494fn log_acp_thread_event(
495 acp_thread: &Entity<acp_thread::AcpThread>,
496 event: &acp_thread::AcpThreadEvent,
497 cx: &mut gpui::App,
498) {
499 match event {
500 acp_thread::AcpThreadEvent::NewEntry => {
501 let entries = acp_thread.read(cx).entries();
502 if let Some(acp_thread::AgentThreadEntry::AssistantMessage(message)) = entries.last() {
503 for chunk in &message.chunks {
504 if let acp_thread::AssistantMessageChunk::Message { block } = chunk {
505 if let acp_thread::ContentBlock::Markdown { markdown } = block {
506 let text = markdown.read(cx).source().to_string();
507 if !text.is_empty() {
508 eprint!("{text}");
509 }
510 }
511 }
512 }
513 }
514 }
515 acp_thread::AcpThreadEvent::EntryUpdated(index) => {
516 let entries = acp_thread.read(cx).entries();
517 if let Some(acp_thread::AgentThreadEntry::ToolCall(tool_call)) = entries.get(*index) {
518 if let Some(name) = &tool_call.tool_name {
519 match &tool_call.status {
520 acp_thread::ToolCallStatus::Completed => {
521 eprintln!("[tool] {name} ✓");
522 }
523 acp_thread::ToolCallStatus::Failed => {
524 eprintln!("[tool] {name} ✗");
525 }
526 acp_thread::ToolCallStatus::Rejected => {
527 eprintln!("[tool] {name} rejected");
528 }
529 acp_thread::ToolCallStatus::Canceled => {
530 eprintln!("[tool] {name} canceled");
531 }
532 _ => {}
533 }
534 }
535 }
536 }
537 acp_thread::AcpThreadEvent::Stopped(reason) => {
538 eprintln!("\n[eval-cli] stopped: {reason:?}");
539 }
540 acp_thread::AcpThreadEvent::Error => {
541 eprintln!("[eval-cli] error event");
542 }
543 acp_thread::AcpThreadEvent::Retry(status) => {
544 eprintln!("[eval-cli] retry: {status:?}");
545 }
546 acp_thread::AcpThreadEvent::SubagentSpawned(session_id) => {
547 eprintln!("[eval-cli] subagent spawned: {session_id}");
548 }
549 _ => {}
550 }
551}