1//! Headless CLI binary for running Zed's agent in evaluation/benchmark environments.
2//!
3//! Designed to work inside containerized environments (like Harbor/termbench) where:
4//! - The repository is already checked out at the working directory
5//! - The model API key is provided via environment variables
//! - Results are written to an output directory (`--output-dir`, default: the current directory)
7//!
8//! ## Usage
9//!
10//! ```text
11//! eval-cli --workdir /testbed --model anthropic/claude-sonnet-4-6-latest \
12//! --instruction "Fix the bug described in..." --timeout 600
13//! ```
14//!
15//! ## Output
16//!
//! Writes to `--output-dir` (default: the current directory, `.`):
18//! - `result.json` — structured result with status, timing, and token usage
19//! - `thread.md` — full conversation as markdown
20//! - `thread.json` — raw thread state as JSON
21//!
22//! ## Exit codes
23//!
24//! | Code | Meaning |
25//! |------|---------|
26//! | 0 | Agent finished |
27//! | 1 | Error (model/auth/runtime failure) |
28//! | 2 | Timeout |
29//! | 3 | Interrupted (SIGTERM/SIGINT) |
30
31mod headless;
32
33use std::path::PathBuf;
34use std::process;
35use std::rc::Rc;
36use std::str::FromStr;
37use std::sync::Arc;
38use std::sync::atomic::{AtomicBool, Ordering};
39use std::time::{Duration, Instant};
40
41use acp_thread::AgentConnection as _;
42use agent::{NativeAgent, NativeAgentConnection, Templates, ThreadStore};
43use agent_client_protocol as acp;
44use anyhow::{Context, Result};
45use clap::Parser;
46use feature_flags::FeatureFlagAppExt as _;
47
48use futures::{FutureExt, select_biased};
49use gpui::{AppContext as _, AsyncApp, Entity, UpdateGlobal};
50use language_model::{LanguageModelRegistry, SelectedModel};
51use project::Project;
52use settings::SettingsStore;
53use util::path_list::PathList;
54
55use crate::headless::AgentCliAppState;
56
57#[derive(Parser, Debug)]
58#[command(
59 name = "eval-cli",
60 about = "Run Zed's agent headlessly in evaluation/benchmark environments"
61)]
62struct Args {
63 /// Output current environment variables as JSON to stdout.
64 /// Used internally by Zed's shell environment capture.
65 #[arg(long, hide = true)]
66 printenv: bool,
67
68 /// Path to the repository working directory. Defaults to the current directory.
69 #[arg(long, default_value = ".")]
70 workdir: PathBuf,
71
72 /// Instruction/prompt text. If omitted, read from --instruction-file or stdin.
73 #[arg(long)]
74 instruction: Option<String>,
75
76 /// Language model to use, in `provider/model` format.
77 #[arg(long, default_value = "anthropic/claude-sonnet-4-6-latest")]
78 model: String,
79
80 /// Maximum wall-clock time in seconds for the agent run.
81 #[arg(long)]
82 timeout: Option<u64>,
83
84 /// Directory for output artifacts (result.json, thread.md, thread.json).
85 #[arg(long, default_value = ".")]
86 output_dir: PathBuf,
87
88 /// Disable staff mode (staff mode is enabled by default).
89 #[arg(long)]
90 no_staff: bool,
91
92 /// Reasoning effort level for models that support thinking (low, medium, high).
93 /// Defaults to "high" for thinking-capable models.
94 #[arg(long)]
95 reasoning_effort: Option<String>,
96
97 /// Enable or disable extended thinking. Defaults to model auto-detection if omitted.
98 #[arg(long)]
99 thinking: Option<bool>,
100}
101
/// How an agent run ended when it did not fail with an error.
///
/// Errors are reported separately through the surrounding `Result`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum AgentOutcome {
    /// The agent's turn finished on its own.
    Completed,
    /// The wall-clock limit elapsed before the agent finished.
    Timeout {
        /// The configured limit, in seconds.
        seconds: u64,
    },
    /// A termination signal was observed while the agent was running.
    Interrupted,
}
107
/// Summary of a single agent run, serialized as `result.json` in the output
/// directory. Optional fields are omitted from the JSON when they are `None`.
#[derive(serde::Serialize)]
struct EvalResult {
    /// One of `"completed"`, `"timeout"`, `"interrupted"` or `"error"`.
    status: String,
    /// Formatted error chain; only set when `status` is `"error"`.
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
    /// Wall-clock duration of the agent run, in seconds.
    duration_secs: f64,
    /// The `--timeout` value the run was started with, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    timeout_secs: Option<u64>,
    /// The model string the run was started with (`--model`).
    model: String,
    /// Input tokens reported for the run, when usage is available.
    #[serde(skip_serializing_if = "Option::is_none")]
    input_tokens: Option<u64>,
    /// Output tokens reported for the run, when usage is available.
    #[serde(skip_serializing_if = "Option::is_none")]
    output_tokens: Option<u64>,
    /// Cache-creation input tokens; only populated when greater than zero.
    #[serde(skip_serializing_if = "Option::is_none")]
    cache_creation_input_tokens: Option<u64>,
    /// Cache-read input tokens; only populated when greater than zero.
    #[serde(skip_serializing_if = "Option::is_none")]
    cache_read_input_tokens: Option<u64>,
}
126
// Process exit codes; these mirror the "Exit codes" table in the module docs.

/// The agent finished.
const EXIT_OK: i32 = 0;
/// The run failed with an error (also used for invalid CLI input).
const EXIT_ERROR: i32 = 1;
/// The run exceeded the `--timeout` limit.
const EXIT_TIMEOUT: i32 = 2;
/// The run was interrupted by a signal.
const EXIT_INTERRUPTED: i32 = 3;
131
/// Set to `true` by the signal handler installed in `main`; polled by
/// `run_agent` to cancel the in-flight agent turn.
static TERMINATED: AtomicBool = AtomicBool::new(false);
133
134fn main() {
135 let args = Args::parse();
136
137 if args.printenv {
138 util::shell_env::print_env();
139 return;
140 }
141
142 env_logger::init();
143
144 ctrlc::set_handler(|| {
145 TERMINATED.store(true, Ordering::SeqCst);
146 })
147 .expect("failed to set signal handler");
148
149 let instruction = read_instruction(&args).unwrap_or_else(|e| {
150 eprintln!("Error reading instruction: {e}");
151 process::exit(EXIT_ERROR);
152 });
153
154 let workdir = args.workdir.canonicalize().unwrap_or_else(|e| {
155 eprintln!("Invalid --workdir {:?}: {e}", args.workdir);
156 process::exit(EXIT_ERROR);
157 });
158
159 let output_dir = args.output_dir.clone();
160 if let Err(e) = std::fs::create_dir_all(&output_dir) {
161 eprintln!("Error creating output dir {}: {e}", output_dir.display());
162 process::exit(EXIT_ERROR);
163 }
164
165 let http_client = Arc::new(reqwest_client::ReqwestClient::new());
166 let app = gpui_platform::headless().with_http_client(http_client);
167
168 app.run(move |cx| {
169 let app_state = headless::init(cx);
170 cx.set_staff(!args.no_staff);
171
172 let auth_tasks = LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
173 registry
174 .providers()
175 .iter()
176 .map(|p| p.authenticate(cx))
177 .collect::<Vec<_>>()
178 });
179
180 let model_name = args.model.clone();
181 let timeout = args.timeout;
182 let thinking_override = args.thinking;
183 let reasoning_effort = args.reasoning_effort.clone();
184
185 cx.spawn(async move |cx| {
186 futures::future::join_all(auth_tasks).await;
187
188 let start = Instant::now();
189
190 let (outcome, token_usage) = run_agent(
191 &app_state,
192 &workdir,
193 &instruction,
194 &model_name,
195 timeout,
196 thinking_override,
197 reasoning_effort.as_deref(),
198 Some(&output_dir),
199 cx,
200 )
201 .await;
202
203 let duration = start.elapsed();
204
205 let (status, error, exit_code) = match &outcome {
206 Ok(AgentOutcome::Completed) => ("completed".to_string(), None, EXIT_OK),
207 Ok(AgentOutcome::Timeout { seconds }) => {
208 eprintln!("Timeout: agent exceeded {seconds}s time limit");
209 ("timeout".to_string(), None, EXIT_TIMEOUT)
210 }
211 Ok(AgentOutcome::Interrupted) => {
212 eprintln!("Interrupted: received SIGTERM, saved partial output");
213 ("interrupted".to_string(), None, EXIT_INTERRUPTED)
214 }
215 Err(e) => {
216 eprintln!("Error: {e:#}");
217 ("error".to_string(), Some(format!("{e:#}")), EXIT_ERROR)
218 }
219 };
220
221 let result = EvalResult {
222 status,
223 error,
224 duration_secs: duration.as_secs_f64(),
225 timeout_secs: timeout,
226 model: model_name.clone(),
227 input_tokens: token_usage.as_ref().map(|u| u.input_tokens),
228 output_tokens: token_usage.as_ref().map(|u| u.output_tokens),
229 cache_creation_input_tokens: token_usage
230 .as_ref()
231 .filter(|u| u.cache_creation_input_tokens > 0)
232 .map(|u| u.cache_creation_input_tokens),
233 cache_read_input_tokens: token_usage
234 .as_ref()
235 .filter(|u| u.cache_read_input_tokens > 0)
236 .map(|u| u.cache_read_input_tokens),
237 };
238
239 match serde_json::to_string_pretty(&result) {
240 Ok(json) => {
241 if let Err(e) = std::fs::write(output_dir.join("result.json"), &json) {
242 eprintln!("Error writing result.json: {e:#}");
243 }
244 eprintln!("[eval-cli] result: {json}");
245 }
246 Err(e) => eprintln!("Error serializing result: {e:#}"),
247 }
248
249 cx.update(|cx| cx.quit());
250 process::exit(exit_code);
251 })
252 .detach();
253 });
254}
255
256fn read_instruction(args: &Args) -> Result<String> {
257 let text = if let Some(text) = &args.instruction {
258 text.clone()
259 } else {
260 use std::io::Read;
261 let mut buf = String::new();
262 std::io::stdin()
263 .read_to_string(&mut buf)
264 .context("reading instruction from stdin")?;
265 buf
266 };
267 anyhow::ensure!(!text.trim().is_empty(), "instruction is empty");
268 Ok(text)
269}
270
271async fn run_agent(
272 app_state: &Arc<AgentCliAppState>,
273 workdir: &std::path::Path,
274 instruction: &str,
275 model_name: &str,
276 timeout: Option<u64>,
277 thinking_override: Option<bool>,
278 reasoning_effort: Option<&str>,
279 output_dir: Option<&std::path::Path>,
280 cx: &mut AsyncApp,
281) -> (Result<AgentOutcome>, Option<language_model::TokenUsage>) {
282 let setup_result: Result<()> = cx.update(|cx| {
283 let selected = SelectedModel::from_str(model_name).map_err(|e| anyhow::anyhow!("{e}"))?;
284 let registry = LanguageModelRegistry::global(cx);
285 let model = registry
286 .read(cx)
287 .available_models(cx)
288 .find(|m| m.id() == selected.model && m.provider_id() == selected.provider)
289 .ok_or_else(|| {
290 let available = registry
291 .read(cx)
292 .available_models(cx)
293 .map(|m| format!("{}/{}", m.provider_id().0, m.id().0))
294 .collect::<Vec<_>>()
295 .join(", ");
296 anyhow::anyhow!("Model {model_name} not found. Available: {available}")
297 })?;
298
299 let supports_thinking = model.supports_thinking();
300
301 registry.update(cx, |registry, cx| {
302 registry.set_default_model(
303 Some(language_model::ConfiguredModel {
304 provider: registry
305 .provider(&model.provider_id())
306 .context("Provider not found")?,
307 model,
308 }),
309 cx,
310 );
311 anyhow::Ok(())
312 })?;
313
314 let enable_thinking = thinking_override.unwrap_or(supports_thinking);
315 let effort = if enable_thinking {
316 match reasoning_effort {
317 Some(level) => format!("\"{level}\""),
318 None => "\"high\"".to_string(),
319 }
320 } else {
321 "null".to_string()
322 };
323 let provider_id = selected.provider.0.to_string();
324 let model_id = selected.model.0.to_string();
325 SettingsStore::update_global(cx, |store, cx| {
326 let settings = format!(
327 r#"{{
328 "agent": {{
329 "tool_permissions": {{"default": "allow"}},
330 "default_model": {{
331 "provider": "{provider_id}",
332 "model": "{model_id}",
333 "enable_thinking": {enable_thinking},
334 "effort": {effort}
335 }}
336 }},
337 "autosave": "off",
338 "format_on_save": "off"
339 }}"
340 "#
341 );
342 store.set_user_settings(&settings, cx).ok();
343 });
344
345 anyhow::Ok(())
346 });
347
348 if let Err(e) = setup_result {
349 return (Err(e), None);
350 }
351
352 let project = cx.update(|cx| {
353 Project::local(
354 app_state.client.clone(),
355 app_state.node_runtime.clone(),
356 app_state.user_store.clone(),
357 app_state.languages.clone(),
358 app_state.fs.clone(),
359 None,
360 project::LocalProjectFlags {
361 init_worktree_trust: false,
362 ..Default::default()
363 },
364 cx,
365 )
366 });
367
368 let worktree = project.update(cx, |project, cx| project.create_worktree(workdir, true, cx));
369 let worktree = match worktree.await {
370 Ok(w) => w,
371 Err(e) => return (Err(e).context("creating worktree"), None),
372 };
373
374 let scan_result = worktree.update(cx, |tree, _cx| {
375 tree.as_local()
376 .context("expected local worktree")
377 .map(|local| local.scan_complete())
378 });
379 match scan_result {
380 Ok(future) => future.await,
381 Err(e) => return (Err(e), None),
382 };
383
384 let agent = cx.update(|cx| {
385 let thread_store = cx.new(|cx| ThreadStore::new(cx));
386 NativeAgent::new(
387 thread_store,
388 Templates::new(),
389 None,
390 app_state.fs.clone(),
391 cx,
392 )
393 });
394
395 let connection = Rc::new(NativeAgentConnection(agent.clone()));
396 let acp_thread = match cx
397 .update(|cx| {
398 connection
399 .clone()
400 .new_session(project, PathList::new(&[workdir]), cx)
401 })
402 .await
403 {
404 Ok(t) => t,
405 Err(e) => return (Err(e).context("creating ACP session"), None),
406 };
407
408 let _subscription = cx.subscribe(&acp_thread, |acp_thread, event, cx| {
409 log_acp_thread_event(&acp_thread, event, cx);
410 });
411
412 let message = vec![acp::ContentBlock::Text(acp::TextContent::new(
413 instruction.to_string(),
414 ))];
415
416 let send_future = acp_thread.update(cx, |acp_thread: &mut acp_thread::AcpThread, cx| {
417 acp_thread.send(message, cx)
418 });
419
420 let timeout_future = if let Some(timeout_secs) = timeout {
421 futures::future::Either::Left(
422 cx.background_executor()
423 .timer(Duration::from_secs(timeout_secs)),
424 )
425 } else {
426 futures::future::Either::Right(futures::future::pending::<()>())
427 };
428
429 let sigterm_future = {
430 let executor = cx.background_executor().clone();
431 async move {
432 while !TERMINATED.load(Ordering::Relaxed) {
433 executor.timer(Duration::from_millis(100)).await;
434 }
435 }
436 };
437
438 let outcome = select_biased! {
439 result = send_future.fuse() => match result {
440 Ok(Some(response)) => {
441 eprintln!("[eval-cli] stopped: {:?}", response.stop_reason);
442 if response.stop_reason == acp::StopReason::MaxTokens {
443 Err(anyhow::anyhow!("Model hit maximum token limit"))
444 } else {
445 Ok(AgentOutcome::Completed)
446 }
447 }
448 Ok(None) => {
449 eprintln!("[eval-cli] completed (no response)");
450 Ok(AgentOutcome::Completed)
451 }
452 Err(e) => Err(e).context("agent run failed"),
453 },
454 _ = sigterm_future.fuse() => {
455 eprintln!("[eval-cli] received SIGTERM, cancelling...");
456 acp_thread.update(cx, |t: &mut acp_thread::AcpThread, cx| t.cancel(cx)).await;
457 Ok(AgentOutcome::Interrupted)
458 },
459 _ = timeout_future.fuse() => {
460 acp_thread.update(cx, |t: &mut acp_thread::AcpThread, cx| t.cancel(cx)).await;
461 Ok(AgentOutcome::Timeout { seconds: timeout.unwrap_or(0) })
462 }
463 };
464
465 let thread = cx.update(|cx| {
466 let session_id = acp_thread.read(cx).session_id().clone();
467 connection.thread(&session_id, cx)
468 });
469
470 let cumulative_usage = if let Some(thread) = &thread {
471 let db_thread = thread.read_with(cx, |thread, cx| thread.to_db(cx));
472 let db_thread = db_thread.await;
473 let usage = db_thread.cumulative_token_usage;
474 if usage.input_tokens > 0 || usage.output_tokens > 0 {
475 Some(usage)
476 } else {
477 None
478 }
479 } else {
480 None
481 };
482
483 let acp_usage = cx.update(|cx| {
484 acp_thread
485 .read(cx)
486 .token_usage()
487 .map(|usage| language_model::TokenUsage {
488 input_tokens: usage.input_tokens,
489 output_tokens: usage.output_tokens,
490 ..Default::default()
491 })
492 });
493
494 let final_usage = cumulative_usage.or(acp_usage);
495
496 if let (Some(thread), Some(dir)) = (&thread, output_dir) {
497 let markdown = thread.read_with(cx, |thread, _cx| thread.to_markdown());
498 if let Err(e) = std::fs::write(dir.join("thread.md"), markdown) {
499 eprintln!("Error writing thread.md: {e:#}");
500 }
501
502 let db_thread = thread.read_with(cx, |thread, cx| thread.to_db(cx));
503 let db_thread = db_thread.await;
504 match serde_json::to_string_pretty(&db_thread) {
505 Ok(json) => {
506 if let Err(e) = std::fs::write(dir.join("thread.json"), json) {
507 eprintln!("Error writing thread.json: {e:#}");
508 }
509 }
510 Err(e) => eprintln!("Error serializing thread.json: {e:#}"),
511 }
512 }
513
514 (outcome, final_usage)
515}
516
517fn log_acp_thread_event(
518 acp_thread: &Entity<acp_thread::AcpThread>,
519 event: &acp_thread::AcpThreadEvent,
520 cx: &mut gpui::App,
521) {
522 match event {
523 acp_thread::AcpThreadEvent::NewEntry => {
524 let entries = acp_thread.read(cx).entries();
525 if let Some(acp_thread::AgentThreadEntry::AssistantMessage(message)) = entries.last() {
526 for chunk in &message.chunks {
527 if let acp_thread::AssistantMessageChunk::Message { block } = chunk {
528 if let acp_thread::ContentBlock::Markdown { markdown } = block {
529 let text = markdown.read(cx).source().to_string();
530 if !text.is_empty() {
531 eprint!("{text}");
532 }
533 }
534 }
535 }
536 }
537 }
538 acp_thread::AcpThreadEvent::EntryUpdated(index) => {
539 let entries = acp_thread.read(cx).entries();
540 if let Some(acp_thread::AgentThreadEntry::ToolCall(tool_call)) = entries.get(*index) {
541 if let Some(name) = &tool_call.tool_name {
542 match &tool_call.status {
543 acp_thread::ToolCallStatus::Completed => {
544 eprintln!("[tool] {name} ✓");
545 }
546 acp_thread::ToolCallStatus::Failed => {
547 eprintln!("[tool] {name} ✗");
548 }
549 acp_thread::ToolCallStatus::Rejected => {
550 eprintln!("[tool] {name} rejected");
551 }
552 acp_thread::ToolCallStatus::Canceled => {
553 eprintln!("[tool] {name} canceled");
554 }
555 _ => {}
556 }
557 }
558 }
559 }
560 acp_thread::AcpThreadEvent::Stopped(reason) => {
561 eprintln!("\n[eval-cli] stopped: {reason:?}");
562 }
563 acp_thread::AcpThreadEvent::Error => {
564 eprintln!("[eval-cli] error event");
565 }
566 acp_thread::AcpThreadEvent::Retry(status) => {
567 eprintln!("[eval-cli] retry: {status:?}");
568 }
569 acp_thread::AcpThreadEvent::SubagentSpawned(session_id) => {
570 eprintln!("[eval-cli] subagent spawned: {session_id}");
571 }
572 _ => {}
573 }
574}