diff --git a/Cargo.lock b/Cargo.lock index 194c544cdab73b4e64d0fb3e085f7c55072c1589..e6feea12955d8d4ce2bceba9ed108fa4fe40b49c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5912,6 +5912,7 @@ dependencies = [ "clap", "client", "ctrlc", + "db", "debug_adapter_extension", "env_logger 0.11.8", "extension", diff --git a/crates/eval_cli/Cargo.toml b/crates/eval_cli/Cargo.toml index d8f52992e2ae9512e694bb11c491fd8b60c0c947..cac5dc6aa28fa9dfa9b7d41caf0db125daf596dc 100644 --- a/crates/eval_cli/Cargo.toml +++ b/crates/eval_cli/Cargo.toml @@ -21,6 +21,7 @@ anyhow.workspace = true clap.workspace = true client.workspace = true ctrlc = { version = "3.5", features = ["termination"] } +db.workspace = true debug_adapter_extension.workspace = true env_logger.workspace = true extension.workspace = true diff --git a/crates/eval_cli/src/headless.rs b/crates/eval_cli/src/headless.rs index 54f14ee1938d4b58bdc32acbd07eced8d8a86406..0e2e40835fa3507ee20e6f1c6cf01226724451c1 100644 --- a/crates/eval_cli/src/headless.rs +++ b/crates/eval_cli/src/headless.rs @@ -2,6 +2,7 @@ use std::path::PathBuf; use std::sync::Arc; use client::{Client, ProxySettings, UserStore}; +use db::AppDatabase; use extension::ExtensionHostProxy; use fs::RealFs; use gpui::http_client::read_proxy_from_env; @@ -61,6 +62,9 @@ pub fn init(cx: &mut App) -> Arc { let client = Client::production(cx); cx.set_http_client(client.http_client()); + let app_db = AppDatabase::new(); + cx.set_global(app_db); + let git_binary_path = None; let fs = Arc::new(RealFs::new( git_binary_path, diff --git a/crates/eval_cli/src/main.rs b/crates/eval_cli/src/main.rs index b49cc4d53f50eeb5ea10216867257332c5354cb4..f9ab1835f94327c72462ba7014bf7517d12ac55d 100644 --- a/crates/eval_cli/src/main.rs +++ b/crates/eval_cli/src/main.rs @@ -82,8 +82,21 @@ struct Args { timeout: Option, /// Directory for output artifacts (result.json, thread.md, thread.json). 
- #[arg(long, default_value = "/logs/agent")] + #[arg(long, default_value = ".")] output_dir: PathBuf, + + /// Disable staff mode (staff mode is enabled by default). + #[arg(long)] + no_staff: bool, + + /// Reasoning effort level for models that support thinking (low, medium, high). + /// Defaults to "high" for thinking-capable models. + #[arg(long)] + reasoning_effort: Option, + + /// Enable or disable extended thinking. Defaults to model auto-detection if omitted. + #[arg(long)] + thinking: Option, } enum AgentOutcome { @@ -154,7 +167,7 @@ fn main() { app.run(move |cx| { let app_state = headless::init(cx); - cx.set_staff(true); + cx.set_staff(!args.no_staff); let auth_tasks = LanguageModelRegistry::global(cx).update(cx, |registry, cx| { registry @@ -166,6 +179,8 @@ fn main() { let model_name = args.model.clone(); let timeout = args.timeout; + let thinking_override = args.thinking; + let reasoning_effort = args.reasoning_effort.clone(); cx.spawn(async move |cx| { futures::future::join_all(auth_tasks).await; @@ -178,6 +193,8 @@ fn main() { &instruction, &model_name, timeout, + thinking_override, + reasoning_effort.as_deref(), Some(&output_dir), cx, ) @@ -257,6 +274,8 @@ async fn run_agent( instruction: &str, model_name: &str, timeout: Option, + thinking_override: Option, + reasoning_effort: Option<&str>, output_dir: Option<&std::path::Path>, cx: &mut AsyncApp, ) -> (Result, Option) { @@ -292,10 +311,14 @@ async fn run_agent( anyhow::Ok(()) })?; - let (enable_thinking, effort) = if supports_thinking { - (true, "\"high\"") + let enable_thinking = thinking_override.unwrap_or(supports_thinking); + let effort = if enable_thinking { + match reasoning_effort { + Some(level) => format!("\"{level}\""), + None => "\"high\"".to_string(), + } } else { - (false, "null") + "null".to_string() }; let provider_id = selected.provider.0.to_string(); let model_id = selected.model.0.to_string(); diff --git a/crates/eval_cli/zed_eval/agent.py b/crates/eval_cli/zed_eval/agent.py index 
6214ff18d784dd9620f404a00ba1b48ce96b5707..5e70713e0dec8512c6303dc5ed7314c245fb6728 100644 --- a/crates/eval_cli/zed_eval/agent.py +++ b/crates/eval_cli/zed_eval/agent.py @@ -144,15 +144,32 @@ class ZedAgent(BaseInstalledAgent): if timeout: parts.append(f"--timeout {timeout}") + staff = self._extra_env.get("EVAL_CLI_STAFF") + if staff and staff.lower() == "false": + parts.append("--no-staff") + + reasoning_effort = self._extra_env.get("EVAL_CLI_REASONING_EFFORT") + if reasoning_effort: + parts.append(f"--reasoning-effort {shlex.quote(reasoning_effort)}") + + enable_thinking = self._extra_env.get("EVAL_CLI_ENABLE_THINKING") + if enable_thinking: + if enable_thinking.lower() == "true": + parts.append("--thinking true") + elif enable_thinking.lower() == "false": + parts.append("--thinking false") + parts.append(f"--instruction {escaped_instruction}") - eval_cli_command = " ".join(parts) + " 2>&1 | stdbuf -oL tee /logs/agent/eval-cli.txt" + eval_cli_command = ( + " ".join(parts) + " 2>&1 | stdbuf -oL tee /logs/agent/eval-cli.txt" + ) patch_command = ( "cd /testbed && " "git add -A && " "git diff --cached HEAD > /logs/agent/patch.diff && " - "echo \"Patch size: $(wc -c < /logs/agent/patch.diff) bytes\"" + 'echo "Patch size: $(wc -c < /logs/agent/patch.diff) bytes"' ) return [ diff --git a/crates/eval_cli/zed_eval/install.sh.j2 b/crates/eval_cli/zed_eval/install.sh.j2 index f7ebbe028216a1a7a0fd606e50a2f707db34c5ce..80b1fae991cf37ebf07df1784e1cefcfd8fc7209 100644 --- a/crates/eval_cli/zed_eval/install.sh.j2 +++ b/crates/eval_cli/zed_eval/install.sh.j2 @@ -22,6 +22,12 @@ apt-get install -y --no-install-recommends \ curl -fsSL https://deb.nodesource.com/setup_22.x | bash - apt-get install -y --no-install-recommends nodejs +# Preinstall basedpyright in Zed's language server cache to avoid first-run npm install latency. 
+ZED_DATA_DIR="${XDG_DATA_HOME:-$HOME/.local/share}/zed" +BASEDPYRIGHT_DIR="$ZED_DATA_DIR/languages/basedpyright" +mkdir -p "$BASEDPYRIGHT_DIR" +npm install --prefix "$BASEDPYRIGHT_DIR" --save-exact basedpyright + # Install uv (needed for running Python tests in SWE-bench tasks). curl -LsSf https://astral.sh/uv/install.sh | sh . "$HOME/.local/bin/env"