eval_cli: Improve setup for the eval_cli args (#52209)

Ben Brandt created

## Context

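Fixes some issues I ran into when running this on a remote machine: `eval_cli` now registers an `AppDatabase` global during headless init, defaults `--output-dir` to the current directory instead of `/logs/agent`, and adds `--no-staff`, `--reasoning-effort`, and `--thinking` arguments. The install script also preinstalls basedpyright to avoid first-run npm install latency.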

## Self-Review Checklist

<!-- Check before requesting review: -->
- [x] I've reviewed my own diff for quality, security, and reliability
- [x] Unsafe blocks (if any) have justifying comments
- [x] The content is consistent with the [UI/UX
checklist](https://github.com/zed-industries/zed/blob/main/CONTRIBUTING.md#uiux-checklist)
- [x] Tests cover the new/changed behavior
- [x] Performance impact has been considered and is acceptable

Release Notes:

- N/A

Change summary

Cargo.lock                             |  1 
crates/eval_cli/Cargo.toml             |  1 
crates/eval_cli/src/headless.rs        |  4 +++
crates/eval_cli/src/main.rs            | 33 +++++++++++++++++++++++----
crates/eval_cli/zed_eval/agent.py      | 21 ++++++++++++++++-
crates/eval_cli/zed_eval/install.sh.j2 |  6 +++++
6 files changed, 59 insertions(+), 7 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -5912,6 +5912,7 @@ dependencies = [
  "clap",
  "client",
  "ctrlc",
+ "db",
  "debug_adapter_extension",
  "env_logger 0.11.8",
  "extension",

crates/eval_cli/Cargo.toml 🔗

@@ -21,6 +21,7 @@ anyhow.workspace = true
 clap.workspace = true
 client.workspace = true
 ctrlc = { version = "3.5", features = ["termination"] }
+db.workspace = true
 debug_adapter_extension.workspace = true
 env_logger.workspace = true
 extension.workspace = true

crates/eval_cli/src/headless.rs 🔗

@@ -2,6 +2,7 @@ use std::path::PathBuf;
 use std::sync::Arc;
 
 use client::{Client, ProxySettings, UserStore};
+use db::AppDatabase;
 use extension::ExtensionHostProxy;
 use fs::RealFs;
 use gpui::http_client::read_proxy_from_env;
@@ -61,6 +62,9 @@ pub fn init(cx: &mut App) -> Arc<AgentCliAppState> {
     let client = Client::production(cx);
     cx.set_http_client(client.http_client());
 
+    let app_db = AppDatabase::new();
+    cx.set_global(app_db);
+
     let git_binary_path = None;
     let fs = Arc::new(RealFs::new(
         git_binary_path,

crates/eval_cli/src/main.rs 🔗

@@ -82,8 +82,21 @@ struct Args {
     timeout: Option<u64>,
 
     /// Directory for output artifacts (result.json, thread.md, thread.json).
-    #[arg(long, default_value = "/logs/agent")]
+    #[arg(long, default_value = ".")]
     output_dir: PathBuf,
+
+    /// Disable staff mode (staff mode is enabled by default).
+    #[arg(long)]
+    no_staff: bool,
+
+    /// Reasoning effort level for models that support thinking (low, medium, high).
+    /// Defaults to "high" when thinking is enabled; ignored when thinking is disabled.
+    #[arg(long)]
+    reasoning_effort: Option<String>,
+
+    /// Enable or disable extended thinking. Defaults to model auto-detection if omitted.
+    #[arg(long)]
+    thinking: Option<bool>,
 }
 
 enum AgentOutcome {
@@ -154,7 +167,7 @@ fn main() {
 
     app.run(move |cx| {
         let app_state = headless::init(cx);
-        cx.set_staff(true);
+        cx.set_staff(!args.no_staff);
 
         let auth_tasks = LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
             registry
@@ -166,6 +179,8 @@ fn main() {
 
         let model_name = args.model.clone();
         let timeout = args.timeout;
+        let thinking_override = args.thinking;
+        let reasoning_effort = args.reasoning_effort.clone();
 
         cx.spawn(async move |cx| {
             futures::future::join_all(auth_tasks).await;
@@ -178,6 +193,8 @@ fn main() {
                 &instruction,
                 &model_name,
                 timeout,
+                thinking_override,
+                reasoning_effort.as_deref(),
                 Some(&output_dir),
                 cx,
             )
@@ -257,6 +274,8 @@ async fn run_agent(
     instruction: &str,
     model_name: &str,
     timeout: Option<u64>,
+    thinking_override: Option<bool>,
+    reasoning_effort: Option<&str>,
     output_dir: Option<&std::path::Path>,
     cx: &mut AsyncApp,
 ) -> (Result<AgentOutcome>, Option<language_model::TokenUsage>) {
@@ -292,10 +311,14 @@ async fn run_agent(
             anyhow::Ok(())
         })?;
 
-        let (enable_thinking, effort) = if supports_thinking {
-            (true, "\"high\"")
+        let enable_thinking = thinking_override.unwrap_or(supports_thinking);
+        let effort = if enable_thinking {
+            match reasoning_effort {
+                Some(level) => format!("\"{level}\""),
+                None => "\"high\"".to_string(),
+            }
         } else {
-            (false, "null")
+            "null".to_string()
         };
         let provider_id = selected.provider.0.to_string();
         let model_id = selected.model.0.to_string();

crates/eval_cli/zed_eval/agent.py 🔗

@@ -144,15 +144,32 @@ class ZedAgent(BaseInstalledAgent):
         if timeout:
             parts.append(f"--timeout {timeout}")
 
+        staff = self._extra_env.get("EVAL_CLI_STAFF")
+        if staff and staff.lower() == "false":
+            parts.append("--no-staff")
+
+        reasoning_effort = self._extra_env.get("EVAL_CLI_REASONING_EFFORT")
+        if reasoning_effort:
+            parts.append(f"--reasoning-effort {shlex.quote(reasoning_effort)}")
+
+        enable_thinking = self._extra_env.get("EVAL_CLI_ENABLE_THINKING")
+        if enable_thinking:
+            if enable_thinking.lower() == "true":
+                parts.append("--enable-thinking")
+            elif enable_thinking.lower() == "false":
+                parts.append("--disable-thinking")
+
         parts.append(f"--instruction {escaped_instruction}")
 
-        eval_cli_command = " ".join(parts) + " 2>&1 | stdbuf -oL tee /logs/agent/eval-cli.txt"
+        eval_cli_command = (
+            " ".join(parts) + " 2>&1 | stdbuf -oL tee /logs/agent/eval-cli.txt"
+        )
 
         patch_command = (
             "cd /testbed && "
             "git add -A && "
             "git diff --cached HEAD > /logs/agent/patch.diff && "
-            "echo \"Patch size: $(wc -c < /logs/agent/patch.diff) bytes\""
+            'echo "Patch size: $(wc -c < /logs/agent/patch.diff) bytes"'
         )
 
         return [

crates/eval_cli/zed_eval/install.sh.j2 🔗

@@ -22,6 +22,12 @@ apt-get install -y --no-install-recommends \
 curl -fsSL https://deb.nodesource.com/setup_22.x | bash -
 apt-get install -y --no-install-recommends nodejs
 
+# Preinstall basedpyright in Zed's language server cache to avoid first-run npm install latency.
+ZED_DATA_DIR="${XDG_DATA_HOME:-$HOME/.local/share}/zed"
+BASEDPYRIGHT_DIR="$ZED_DATA_DIR/languages/basedpyright"
+mkdir -p "$BASEDPYRIGHT_DIR"
+npm install --prefix "$BASEDPYRIGHT_DIR" --save-exact basedpyright
+
 # Install uv (needed for running Python tests in SWE-bench tasks).
 curl -LsSf https://astral.sh/uv/install.sh | sh
 . "$HOME/.local/bin/env"