1//! Headless CLI binary for running Zed's agent in evaluation/benchmark environments.
2//!
3//! Designed to work inside containerized environments (like Harbor/termbench) where:
4//! - The repository is already checked out at the working directory
5//! - The model API key is provided via environment variables
6//! - Results are written to an output directory (default: `/logs/agent/`)
7//!
8//! ## Usage
9//!
10//! ```text
11//! eval-cli --workdir /testbed --model anthropic/claude-sonnet-4-6-latest \
12//! --instruction "Fix the bug described in..." --timeout 600
13//! ```
14//!
15//! ## Output
16//!
17//! Writes to `--output-dir` (default `/logs/agent/`):
18//! - `result.json` — structured result with status, timing, and token usage
19//! - `thread.md` — full conversation as markdown
20//! - `thread.json` — raw thread state as JSON
21//!
22//! ## Exit codes
23//!
24//! | Code | Meaning |
25//! |------|---------|
26//! | 0 | Agent finished |
27//! | 1 | Error (model/auth/runtime failure) |
28//! | 2 | Timeout |
29//! | 3 | Interrupted (SIGTERM/SIGINT) |
30
31mod headless;
32
33use std::path::PathBuf;
34use std::process;
35use std::rc::Rc;
36use std::str::FromStr;
37use std::sync::Arc;
38use std::sync::atomic::{AtomicBool, Ordering};
39use std::time::{Duration, Instant};
40
41use acp_thread::AgentConnection as _;
42use agent::{NativeAgent, NativeAgentConnection, Templates, ThreadStore};
43use agent_client_protocol as acp;
44use anyhow::{Context, Result};
45use clap::Parser;
46use feature_flags::FeatureFlagAppExt as _;
47
48use futures::{FutureExt, select_biased};
49use gpui::{AppContext as _, AsyncApp, Entity, UpdateGlobal};
50use language_model::{LanguageModelRegistry, SelectedModel};
51use project::Project;
52use settings::SettingsStore;
53
54use crate::headless::AgentCliAppState;
55
56#[derive(Parser, Debug)]
57#[command(
58 name = "eval-cli",
59 about = "Run Zed's agent headlessly in evaluation/benchmark environments"
60)]
61struct Args {
62 /// Output current environment variables as JSON to stdout.
63 /// Used internally by Zed's shell environment capture.
64 #[arg(long, hide = true)]
65 printenv: bool,
66
67 /// Path to the repository working directory. Defaults to the current directory.
68 #[arg(long, default_value = ".")]
69 workdir: PathBuf,
70
71 /// Instruction/prompt text. If omitted, read from --instruction-file or stdin.
72 #[arg(long)]
73 instruction: Option<String>,
74
75 /// Language model to use, in `provider/model` format.
76 #[arg(long, default_value = "anthropic/claude-sonnet-4-6-latest")]
77 model: String,
78
79 /// Maximum wall-clock time in seconds for the agent run.
80 #[arg(long)]
81 timeout: Option<u64>,
82
83 /// Directory for output artifacts (result.json, thread.md, thread.json).
84 #[arg(long, default_value = "/logs/agent")]
85 output_dir: PathBuf,
86}
87
/// How an agent run ended when it did not fail with an error.
enum AgentOutcome {
    /// The send future resolved before the timeout or an interrupt fired.
    Completed,
    /// The wall-clock limit elapsed first; `seconds` is the configured limit.
    Timeout { seconds: u64 },
    /// The `TERMINATED` flag was observed while the run was still in flight.
    Interrupted,
}
93
/// Run summary that `main` serializes to `result.json` in the output directory.
///
/// Every `Option` field is left out of the JSON when it is `None`.
#[derive(serde::Serialize)]
struct EvalResult {
    /// One of `"completed"`, `"timeout"`, `"interrupted"`, or `"error"`.
    status: String,
    /// Formatted error chain; `main` fills this only for the `"error"` status.
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
    /// Wall-clock time spent inside `run_agent`, in seconds.
    duration_secs: f64,
    /// The `--timeout` value, if one was given.
    #[serde(skip_serializing_if = "Option::is_none")]
    timeout_secs: Option<u64>,
    /// The `--model` argument, verbatim.
    model: String,
    /// Input token count, when usage information was available.
    #[serde(skip_serializing_if = "Option::is_none")]
    input_tokens: Option<u64>,
    /// Output token count, when usage information was available.
    #[serde(skip_serializing_if = "Option::is_none")]
    output_tokens: Option<u64>,
    /// Cache-creation token count; `main` only sets it when it is non-zero.
    #[serde(skip_serializing_if = "Option::is_none")]
    cache_creation_input_tokens: Option<u64>,
    /// Cache-read token count; `main` only sets it when it is non-zero.
    #[serde(skip_serializing_if = "Option::is_none")]
    cache_read_input_tokens: Option<u64>,
}
112
// Process exit codes; they mirror the "Exit codes" table in the module docs.

/// The agent finished.
const EXIT_OK: i32 = 0;
/// Any error: bad arguments, model/auth failure, or a runtime failure.
const EXIT_ERROR: i32 = 1;
/// The `--timeout` limit elapsed before the agent finished.
const EXIT_TIMEOUT: i32 = 2;
/// The run was interrupted by a signal.
const EXIT_INTERRUPTED: i32 = 3;

/// Set to `true` by the signal handler installed in `main`; `run_agent` polls
/// it to cancel an in-flight run.
static TERMINATED: AtomicBool = AtomicBool::new(false);
119
120fn main() {
121 let args = Args::parse();
122
123 if args.printenv {
124 util::shell_env::print_env();
125 return;
126 }
127
128 env_logger::init();
129
130 ctrlc::set_handler(|| {
131 TERMINATED.store(true, Ordering::SeqCst);
132 })
133 .expect("failed to set signal handler");
134
135 let instruction = read_instruction(&args).unwrap_or_else(|e| {
136 eprintln!("Error reading instruction: {e}");
137 process::exit(EXIT_ERROR);
138 });
139
140 let workdir = args.workdir.canonicalize().unwrap_or_else(|e| {
141 eprintln!("Invalid --workdir {:?}: {e}", args.workdir);
142 process::exit(EXIT_ERROR);
143 });
144
145 let output_dir = args.output_dir.clone();
146 if let Err(e) = std::fs::create_dir_all(&output_dir) {
147 eprintln!("Error creating output dir {}: {e}", output_dir.display());
148 process::exit(EXIT_ERROR);
149 }
150
151 let http_client = Arc::new(reqwest_client::ReqwestClient::new());
152 let app = gpui_platform::headless().with_http_client(http_client);
153
154 app.run(move |cx| {
155 let app_state = headless::init(cx);
156 cx.set_staff(true);
157
158 let auth_tasks = LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
159 registry
160 .providers()
161 .iter()
162 .map(|p| p.authenticate(cx))
163 .collect::<Vec<_>>()
164 });
165
166 let model_name = args.model.clone();
167 let timeout = args.timeout;
168
169 cx.spawn(async move |cx| {
170 futures::future::join_all(auth_tasks).await;
171
172 let start = Instant::now();
173
174 let (outcome, token_usage) = run_agent(
175 &app_state,
176 &workdir,
177 &instruction,
178 &model_name,
179 timeout,
180 Some(&output_dir),
181 cx,
182 )
183 .await;
184
185 let duration = start.elapsed();
186
187 let (status, error, exit_code) = match &outcome {
188 Ok(AgentOutcome::Completed) => ("completed".to_string(), None, EXIT_OK),
189 Ok(AgentOutcome::Timeout { seconds }) => {
190 eprintln!("Timeout: agent exceeded {seconds}s time limit");
191 ("timeout".to_string(), None, EXIT_TIMEOUT)
192 }
193 Ok(AgentOutcome::Interrupted) => {
194 eprintln!("Interrupted: received SIGTERM, saved partial output");
195 ("interrupted".to_string(), None, EXIT_INTERRUPTED)
196 }
197 Err(e) => {
198 eprintln!("Error: {e:#}");
199 ("error".to_string(), Some(format!("{e:#}")), EXIT_ERROR)
200 }
201 };
202
203 let result = EvalResult {
204 status,
205 error,
206 duration_secs: duration.as_secs_f64(),
207 timeout_secs: timeout,
208 model: model_name.clone(),
209 input_tokens: token_usage.as_ref().map(|u| u.input_tokens),
210 output_tokens: token_usage.as_ref().map(|u| u.output_tokens),
211 cache_creation_input_tokens: token_usage
212 .as_ref()
213 .filter(|u| u.cache_creation_input_tokens > 0)
214 .map(|u| u.cache_creation_input_tokens),
215 cache_read_input_tokens: token_usage
216 .as_ref()
217 .filter(|u| u.cache_read_input_tokens > 0)
218 .map(|u| u.cache_read_input_tokens),
219 };
220
221 match serde_json::to_string_pretty(&result) {
222 Ok(json) => {
223 if let Err(e) = std::fs::write(output_dir.join("result.json"), &json) {
224 eprintln!("Error writing result.json: {e:#}");
225 }
226 eprintln!("[eval-cli] result: {json}");
227 }
228 Err(e) => eprintln!("Error serializing result: {e:#}"),
229 }
230
231 cx.update(|cx| cx.quit());
232 process::exit(exit_code);
233 })
234 .detach();
235 });
236}
237
238fn read_instruction(args: &Args) -> Result<String> {
239 let text = if let Some(text) = &args.instruction {
240 text.clone()
241 } else {
242 use std::io::Read;
243 let mut buf = String::new();
244 std::io::stdin()
245 .read_to_string(&mut buf)
246 .context("reading instruction from stdin")?;
247 buf
248 };
249 anyhow::ensure!(!text.trim().is_empty(), "instruction is empty");
250 Ok(text)
251}
252
253async fn run_agent(
254 app_state: &Arc<AgentCliAppState>,
255 workdir: &std::path::Path,
256 instruction: &str,
257 model_name: &str,
258 timeout: Option<u64>,
259 output_dir: Option<&std::path::Path>,
260 cx: &mut AsyncApp,
261) -> (Result<AgentOutcome>, Option<language_model::TokenUsage>) {
262 let setup_result: Result<()> = cx.update(|cx| {
263 let selected = SelectedModel::from_str(model_name).map_err(|e| anyhow::anyhow!("{e}"))?;
264 let registry = LanguageModelRegistry::global(cx);
265 let model = registry
266 .read(cx)
267 .available_models(cx)
268 .find(|m| m.id() == selected.model && m.provider_id() == selected.provider)
269 .ok_or_else(|| {
270 let available = registry
271 .read(cx)
272 .available_models(cx)
273 .map(|m| format!("{}/{}", m.provider_id().0, m.id().0))
274 .collect::<Vec<_>>()
275 .join(", ");
276 anyhow::anyhow!("Model {model_name} not found. Available: {available}")
277 })?;
278
279 let supports_thinking = model.supports_thinking();
280
281 registry.update(cx, |registry, cx| {
282 registry.set_default_model(
283 Some(language_model::ConfiguredModel {
284 provider: registry
285 .provider(&model.provider_id())
286 .context("Provider not found")?,
287 model,
288 }),
289 cx,
290 );
291 anyhow::Ok(())
292 })?;
293
294 let (enable_thinking, effort) = if supports_thinking {
295 (true, "\"high\"")
296 } else {
297 (false, "null")
298 };
299 let provider_id = selected.provider.0.to_string();
300 let model_id = selected.model.0.to_string();
301 SettingsStore::update_global(cx, |store, cx| {
302 let settings = format!(
303 r#"{{
304 "agent": {{
305 "tool_permissions": {{"default": "allow"}},
306 "default_model": {{
307 "provider": "{provider_id}",
308 "model": "{model_id}",
309 "enable_thinking": {enable_thinking},
310 "effort": {effort}
311 }}
312 }},
313 "autosave": "off",
314 "format_on_save": "off"
315 }}"
316 "#
317 );
318 store.set_user_settings(&settings, cx).ok();
319 });
320
321 anyhow::Ok(())
322 });
323
324 if let Err(e) = setup_result {
325 return (Err(e), None);
326 }
327
328 let project = cx.update(|cx| {
329 Project::local(
330 app_state.client.clone(),
331 app_state.node_runtime.clone(),
332 app_state.user_store.clone(),
333 app_state.languages.clone(),
334 app_state.fs.clone(),
335 None,
336 project::LocalProjectFlags {
337 init_worktree_trust: false,
338 ..Default::default()
339 },
340 cx,
341 )
342 });
343
344 let worktree = project.update(cx, |project, cx| project.create_worktree(workdir, true, cx));
345 let worktree = match worktree.await {
346 Ok(w) => w,
347 Err(e) => return (Err(e).context("creating worktree"), None),
348 };
349
350 let scan_result = worktree.update(cx, |tree, _cx| {
351 tree.as_local()
352 .context("expected local worktree")
353 .map(|local| local.scan_complete())
354 });
355 match scan_result {
356 Ok(future) => future.await,
357 Err(e) => return (Err(e), None),
358 };
359
360 let agent = cx.update(|cx| {
361 let thread_store = cx.new(|cx| ThreadStore::new(cx));
362 NativeAgent::new(
363 thread_store,
364 Templates::new(),
365 None,
366 app_state.fs.clone(),
367 cx,
368 )
369 });
370
371 let connection = Rc::new(NativeAgentConnection(agent.clone()));
372 let acp_thread = match cx
373 .update(|cx| connection.clone().new_session(project, workdir, cx))
374 .await
375 {
376 Ok(t) => t,
377 Err(e) => return (Err(e).context("creating ACP session"), None),
378 };
379
380 let _subscription = cx.subscribe(&acp_thread, |acp_thread, event, cx| {
381 log_acp_thread_event(&acp_thread, event, cx);
382 });
383
384 let message = vec![acp::ContentBlock::Text(acp::TextContent::new(
385 instruction.to_string(),
386 ))];
387
388 let send_future = acp_thread.update(cx, |acp_thread: &mut acp_thread::AcpThread, cx| {
389 acp_thread.send(message, cx)
390 });
391
392 let timeout_future = if let Some(timeout_secs) = timeout {
393 futures::future::Either::Left(
394 cx.background_executor()
395 .timer(Duration::from_secs(timeout_secs)),
396 )
397 } else {
398 futures::future::Either::Right(futures::future::pending::<()>())
399 };
400
401 let sigterm_future = {
402 let executor = cx.background_executor().clone();
403 async move {
404 while !TERMINATED.load(Ordering::Relaxed) {
405 executor.timer(Duration::from_millis(100)).await;
406 }
407 }
408 };
409
410 let outcome = select_biased! {
411 result = send_future.fuse() => match result {
412 Ok(Some(response)) => {
413 eprintln!("[eval-cli] stopped: {:?}", response.stop_reason);
414 if response.stop_reason == acp::StopReason::MaxTokens {
415 Err(anyhow::anyhow!("Model hit maximum token limit"))
416 } else {
417 Ok(AgentOutcome::Completed)
418 }
419 }
420 Ok(None) => {
421 eprintln!("[eval-cli] completed (no response)");
422 Ok(AgentOutcome::Completed)
423 }
424 Err(e) => Err(e).context("agent run failed"),
425 },
426 _ = sigterm_future.fuse() => {
427 eprintln!("[eval-cli] received SIGTERM, cancelling...");
428 acp_thread.update(cx, |t: &mut acp_thread::AcpThread, cx| t.cancel(cx)).await;
429 Ok(AgentOutcome::Interrupted)
430 },
431 _ = timeout_future.fuse() => {
432 acp_thread.update(cx, |t: &mut acp_thread::AcpThread, cx| t.cancel(cx)).await;
433 Ok(AgentOutcome::Timeout { seconds: timeout.unwrap_or(0) })
434 }
435 };
436
437 let thread = cx.update(|cx| {
438 let session_id = acp_thread.read(cx).session_id().clone();
439 connection.thread(&session_id, cx)
440 });
441
442 let cumulative_usage = if let Some(thread) = &thread {
443 let db_thread = thread.read_with(cx, |thread, cx| thread.to_db(cx));
444 let db_thread = db_thread.await;
445 let usage = db_thread.cumulative_token_usage;
446 if usage.input_tokens > 0 || usage.output_tokens > 0 {
447 Some(usage)
448 } else {
449 None
450 }
451 } else {
452 None
453 };
454
455 let acp_usage = cx.update(|cx| {
456 acp_thread
457 .read(cx)
458 .token_usage()
459 .map(|usage| language_model::TokenUsage {
460 input_tokens: usage.input_tokens,
461 output_tokens: usage.output_tokens,
462 ..Default::default()
463 })
464 });
465
466 let final_usage = cumulative_usage.or(acp_usage);
467
468 if let (Some(thread), Some(dir)) = (&thread, output_dir) {
469 let markdown = thread.read_with(cx, |thread, _cx| thread.to_markdown());
470 if let Err(e) = std::fs::write(dir.join("thread.md"), markdown) {
471 eprintln!("Error writing thread.md: {e:#}");
472 }
473
474 let db_thread = thread.read_with(cx, |thread, cx| thread.to_db(cx));
475 let db_thread = db_thread.await;
476 match serde_json::to_string_pretty(&db_thread) {
477 Ok(json) => {
478 if let Err(e) = std::fs::write(dir.join("thread.json"), json) {
479 eprintln!("Error writing thread.json: {e:#}");
480 }
481 }
482 Err(e) => eprintln!("Error serializing thread.json: {e:#}"),
483 }
484 }
485
486 (outcome, final_usage)
487}
488
489fn log_acp_thread_event(
490 acp_thread: &Entity<acp_thread::AcpThread>,
491 event: &acp_thread::AcpThreadEvent,
492 cx: &mut gpui::App,
493) {
494 match event {
495 acp_thread::AcpThreadEvent::NewEntry => {
496 let entries = acp_thread.read(cx).entries();
497 if let Some(acp_thread::AgentThreadEntry::AssistantMessage(message)) = entries.last() {
498 for chunk in &message.chunks {
499 if let acp_thread::AssistantMessageChunk::Message { block } = chunk {
500 if let acp_thread::ContentBlock::Markdown { markdown } = block {
501 let text = markdown.read(cx).source().to_string();
502 if !text.is_empty() {
503 eprint!("{text}");
504 }
505 }
506 }
507 }
508 }
509 }
510 acp_thread::AcpThreadEvent::EntryUpdated(index) => {
511 let entries = acp_thread.read(cx).entries();
512 if let Some(acp_thread::AgentThreadEntry::ToolCall(tool_call)) = entries.get(*index) {
513 if let Some(name) = &tool_call.tool_name {
514 match &tool_call.status {
515 acp_thread::ToolCallStatus::Completed => {
516 eprintln!("[tool] {name} ✓");
517 }
518 acp_thread::ToolCallStatus::Failed => {
519 eprintln!("[tool] {name} ✗");
520 }
521 acp_thread::ToolCallStatus::Rejected => {
522 eprintln!("[tool] {name} rejected");
523 }
524 acp_thread::ToolCallStatus::Canceled => {
525 eprintln!("[tool] {name} canceled");
526 }
527 _ => {}
528 }
529 }
530 }
531 }
532 acp_thread::AcpThreadEvent::Stopped(reason) => {
533 eprintln!("\n[eval-cli] stopped: {reason:?}");
534 }
535 acp_thread::AcpThreadEvent::Error => {
536 eprintln!("[eval-cli] error event");
537 }
538 acp_thread::AcpThreadEvent::Retry(status) => {
539 eprintln!("[eval-cli] retry: {status:?}");
540 }
541 acp_thread::AcpThreadEvent::SubagentSpawned(session_id) => {
542 eprintln!("[eval-cli] subagent spawned: {session_id}");
543 }
544 _ => {}
545 }
546}