Add timeout support to terminal tool (#44895)

Nathan Sobo created 2 months ago

Adds an optional `timeout_ms` parameter to the terminal tool that allows
bounding the runtime of shell commands. When the timeout expires, the
running terminal task is killed and the tool returns with the partial
output captured so far.

## Summary

This PR adds the ability for the agent to specify a maximum runtime when
invoking the terminal tool. This helps prevent indefinite hangs when
running commands that might wait for network, user prompts, or long
builds/tests.

## Changes

- Add `timeout_ms` field to `TerminalToolInput` schema
- Extend `TerminalHandle` trait with `kill()` method
- Implement `kill()` for `AcpTerminalHandle` and `EvalTerminalHandle`
- Race terminal exit against timeout, killing on expiry
- Update system prompt to recommend using timeouts for long-running
commands
- Add test for timeout behavior
- Update `.rules` to document GPUI executor timers for tests

## Testing

- Added `test_terminal_tool_timeout_kills_handle` which verifies that
when a timeout is specified and expires, the terminal handle is killed
and the tool returns with partial output.
- All existing agent tests pass.

Release Notes:

- agent: Added optional `timeout_ms` parameter to the terminal tool,
allowing the agent to bound command runtime and prevent indefinite hangs

Change summary

.rules                                       |   6 
crates/agent/src/agent.rs                    |   9 
crates/agent/src/templates/system_prompt.hbs |   2 
crates/agent/src/tests/mod.rs                | 219 +++++++++++++++++++++
crates/agent/src/thread.rs                   |   1 
crates/agent/src/tools/terminal_tool.rs      |  33 ++
crates/eval/src/instance.rs                  |   9 
crates/terminal/src/terminal.rs              |   5 
8 files changed, 275 insertions(+), 9 deletions(-)

Detailed changes

.rules 🔗

@@ -26,6 +26,12 @@
   });
   ```
 
+# Timers in tests
+
+* In GPUI tests, prefer GPUI executor timers over `smol::Timer::after(...)` when you need timeouts, delays, or to drive `run_until_parked()`:
+  - Use `cx.background_executor().timer(duration).await` (or `cx.background_executor.timer(duration).await` in `TestAppContext`) so the work is scheduled on GPUI's dispatcher.
+  - Avoid `smol::Timer::after(...)` for test timeouts when you rely on `run_until_parked()`, because it may not be tracked by GPUI's scheduler and can lead to "nothing left to run" when pumping.
+
 # GPUI
 
 GPUI is a UI framework which also provides primitives for state and concurrency management.

crates/agent/src/agent.rs 🔗

@@ -1219,6 +1219,15 @@ impl TerminalHandle for AcpTerminalHandle {
         self.terminal
             .read_with(cx, |term, cx| term.current_output(cx))
     }
+
+    fn kill(&self, cx: &AsyncApp) -> Result<()> {
+        cx.update(|cx| {
+            self.terminal.update(cx, |terminal, cx| {
+                terminal.kill(cx);
+            });
+        })?;
+        Ok(())
+    }
 }
 
 #[cfg(test)]

crates/agent/src/templates/system_prompt.hbs 🔗

@@ -16,7 +16,7 @@ You are a highly skilled software engineer with extensive knowledge in many prog
 3. DO NOT use tools to access items that are already available in the context section.
 4. Use only the tools that are currently available.
 5. DO NOT use a tool that is not available just because it appears in the conversation. This means the user turned it off.
-6. NEVER run commands that don't terminate on their own such as web servers (like `npm run start`, `npm run dev`, `python -m http.server`, etc) or file watchers.
+6. When running commands that may run indefinitely or for a long time (such as build scripts, tests, servers, or file watchers), specify `timeout_ms` to bound runtime. If the command times out, the user can always ask you to run it again with a longer timeout or no timeout if they're willing to wait or cancel manually.
 7. Avoid HTML entity escaping - use plain characters instead.
 
 ## Searching and Reading

crates/agent/src/tests/mod.rs 🔗

@@ -9,14 +9,16 @@ use collections::IndexMap;
 use context_server::{ContextServer, ContextServerCommand, ContextServerId};
 use fs::{FakeFs, Fs};
 use futures::{
-    StreamExt,
+    FutureExt as _, StreamExt,
     channel::{
         mpsc::{self, UnboundedReceiver},
         oneshot,
     },
+    future::{Fuse, Shared},
 };
 use gpui::{
-    App, AppContext, Entity, Task, TestAppContext, UpdateGlobal, http_client::FakeHttpClient,
+    App, AppContext, AsyncApp, Entity, Task, TestAppContext, UpdateGlobal,
+    http_client::FakeHttpClient,
 };
 use indoc::indoc;
 use language_model::{
@@ -35,12 +37,109 @@ use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use serde_json::json;
 use settings::{Settings, SettingsStore};
-use std::{path::Path, rc::Rc, sync::Arc, time::Duration};
+use std::{
+    path::Path,
+    pin::Pin,
+    rc::Rc,
+    sync::{
+        Arc,
+        atomic::{AtomicBool, Ordering},
+    },
+    time::Duration,
+};
 use util::path;
 
 mod test_tools;
 use test_tools::*;
 
+fn init_test(cx: &mut TestAppContext) {
+    cx.update(|cx| {
+        let settings_store = SettingsStore::test(cx);
+        cx.set_global(settings_store);
+    });
+}
+
+struct FakeTerminalHandle {
+    killed: Arc<AtomicBool>,
+    wait_for_exit: Shared<Task<acp::TerminalExitStatus>>,
+    output: acp::TerminalOutputResponse,
+    id: acp::TerminalId,
+}
+
+impl FakeTerminalHandle {
+    fn new_never_exits(cx: &mut App) -> Self {
+        let killed = Arc::new(AtomicBool::new(false));
+
+        let killed_for_task = killed.clone();
+        let wait_for_exit = cx
+            .spawn(async move |cx| {
+                loop {
+                    if killed_for_task.load(Ordering::SeqCst) {
+                        return acp::TerminalExitStatus::new();
+                    }
+                    cx.background_executor()
+                        .timer(Duration::from_millis(1))
+                        .await;
+                }
+            })
+            .shared();
+
+        Self {
+            killed,
+            wait_for_exit,
+            output: acp::TerminalOutputResponse::new("partial output".to_string(), false),
+            id: acp::TerminalId::new("fake_terminal".to_string()),
+        }
+    }
+
+    fn was_killed(&self) -> bool {
+        self.killed.load(Ordering::SeqCst)
+    }
+}
+
+impl crate::TerminalHandle for FakeTerminalHandle {
+    fn id(&self, _cx: &AsyncApp) -> Result<acp::TerminalId> {
+        Ok(self.id.clone())
+    }
+
+    fn current_output(&self, _cx: &AsyncApp) -> Result<acp::TerminalOutputResponse> {
+        Ok(self.output.clone())
+    }
+
+    fn wait_for_exit(&self, _cx: &AsyncApp) -> Result<Shared<Task<acp::TerminalExitStatus>>> {
+        Ok(self.wait_for_exit.clone())
+    }
+
+    fn kill(&self, _cx: &AsyncApp) -> Result<()> {
+        self.killed.store(true, Ordering::SeqCst);
+        Ok(())
+    }
+}
+
+struct FakeThreadEnvironment {
+    handle: Rc<FakeTerminalHandle>,
+}
+
+impl crate::ThreadEnvironment for FakeThreadEnvironment {
+    fn create_terminal(
+        &self,
+        _command: String,
+        _cwd: Option<std::path::PathBuf>,
+        _output_byte_limit: Option<u64>,
+        _cx: &mut AsyncApp,
+    ) -> Task<Result<Rc<dyn crate::TerminalHandle>>> {
+        Task::ready(Ok(self.handle.clone() as Rc<dyn crate::TerminalHandle>))
+    }
+}
+
+fn always_allow_tools(cx: &mut TestAppContext) {
+    cx.update(|cx| {
+        let mut settings = agent_settings::AgentSettings::get_global(cx).clone();
+        settings.always_allow_tool_actions = true;
+        agent_settings::AgentSettings::override_global(settings, cx);
+    });
+}
+
 #[gpui::test]
 async fn test_echo(cx: &mut TestAppContext) {
     let ThreadTest { model, thread, .. } = setup(cx, TestModel::Fake).await;
@@ -71,6 +170,120 @@ async fn test_echo(cx: &mut TestAppContext) {
     assert_eq!(stop_events(events), vec![acp::StopReason::EndTurn]);
 }
 
+#[gpui::test]
+async fn test_terminal_tool_timeout_kills_handle(cx: &mut TestAppContext) {
+    init_test(cx);
+    always_allow_tools(cx);
+
+    let fs = FakeFs::new(cx.executor());
+    let project = Project::test(fs, [], cx).await;
+
+    let handle = Rc::new(cx.update(|cx| FakeTerminalHandle::new_never_exits(cx)));
+    let environment = Rc::new(FakeThreadEnvironment {
+        handle: handle.clone(),
+    });
+
+    #[allow(clippy::arc_with_non_send_sync)]
+    let tool = Arc::new(crate::TerminalTool::new(project, environment));
+    let (event_stream, mut rx) = crate::ToolCallEventStream::test();
+
+    let task = cx.update(|cx| {
+        tool.run(
+            crate::TerminalToolInput {
+                command: "sleep 1000".to_string(),
+                cd: ".".to_string(),
+                timeout_ms: Some(5),
+            },
+            event_stream,
+            cx,
+        )
+    });
+
+    let update = rx.expect_update_fields().await;
+    assert!(
+        update.content.iter().any(|blocks| {
+            blocks
+                .iter()
+                .any(|c| matches!(c, acp::ToolCallContent::Terminal(_)))
+        }),
+        "expected tool call update to include terminal content"
+    );
+
+    let mut task_future: Pin<Box<Fuse<Task<Result<String>>>>> = Box::pin(task.fuse());
+
+    let deadline = std::time::Instant::now() + Duration::from_millis(500);
+    loop {
+        if let Some(result) = task_future.as_mut().now_or_never() {
+            let result = result.expect("terminal tool task should complete");
+
+            assert!(
+                handle.was_killed(),
+                "expected terminal handle to be killed on timeout"
+            );
+            assert!(
+                result.contains("partial output"),
+                "expected result to include terminal output, got: {result}"
+            );
+            return;
+        }
+
+        if std::time::Instant::now() >= deadline {
+            panic!("timed out waiting for terminal tool task to complete");
+        }
+
+        cx.run_until_parked();
+        cx.background_executor.timer(Duration::from_millis(1)).await;
+    }
+}
+
+#[gpui::test]
+#[ignore]
+async fn test_terminal_tool_without_timeout_does_not_kill_handle(cx: &mut TestAppContext) {
+    init_test(cx);
+    always_allow_tools(cx);
+
+    let fs = FakeFs::new(cx.executor());
+    let project = Project::test(fs, [], cx).await;
+
+    let handle = Rc::new(cx.update(|cx| FakeTerminalHandle::new_never_exits(cx)));
+    let environment = Rc::new(FakeThreadEnvironment {
+        handle: handle.clone(),
+    });
+
+    #[allow(clippy::arc_with_non_send_sync)]
+    let tool = Arc::new(crate::TerminalTool::new(project, environment));
+    let (event_stream, mut rx) = crate::ToolCallEventStream::test();
+
+    let _task = cx.update(|cx| {
+        tool.run(
+            crate::TerminalToolInput {
+                command: "sleep 1000".to_string(),
+                cd: ".".to_string(),
+                timeout_ms: None,
+            },
+            event_stream,
+            cx,
+        )
+    });
+
+    let update = rx.expect_update_fields().await;
+    assert!(
+        update.content.iter().any(|blocks| {
+            blocks
+                .iter()
+                .any(|c| matches!(c, acp::ToolCallContent::Terminal(_)))
+        }),
+        "expected tool call update to include terminal content"
+    );
+
+    smol::Timer::after(Duration::from_millis(25)).await;
+
+    assert!(
+        !handle.was_killed(),
+        "did not expect terminal handle to be killed without a timeout"
+    );
+}
+
 #[gpui::test]
 async fn test_thinking(cx: &mut TestAppContext) {
     let ThreadTest { model, thread, .. } = setup(cx, TestModel::Fake).await;

crates/agent/src/thread.rs 🔗

@@ -530,6 +530,7 @@ pub trait TerminalHandle {
     fn id(&self, cx: &AsyncApp) -> Result<acp::TerminalId>;
     fn current_output(&self, cx: &AsyncApp) -> Result<acp::TerminalOutputResponse>;
     fn wait_for_exit(&self, cx: &AsyncApp) -> Result<Shared<Task<acp::TerminalExitStatus>>>;
+    fn kill(&self, cx: &AsyncApp) -> Result<()>;
 }
 
 pub trait ThreadEnvironment {

crates/agent/src/tools/terminal_tool.rs 🔗

@@ -1,6 +1,7 @@
 use agent_client_protocol as acp;
 use anyhow::Result;
-use gpui::{App, Entity, SharedString, Task};
+use futures::FutureExt as _;
+use gpui::{App, AppContext, Entity, SharedString, Task};
 use project::Project;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
@@ -8,6 +9,7 @@ use std::{
     path::{Path, PathBuf},
     rc::Rc,
     sync::Arc,
+    time::Duration,
 };
 use util::markdown::MarkdownInlineCode;
 
@@ -25,13 +27,17 @@ const COMMAND_OUTPUT_LIMIT: u64 = 16 * 1024;
 ///
 /// Do not use this tool for commands that run indefinitely, such as servers (like `npm run start`, `npm run dev`, `python -m http.server`, etc) or file watchers that don't terminate on their own.
 ///
+/// For potentially long-running commands, prefer specifying `timeout_ms` to bound runtime and prevent indefinite hangs.
+///
 /// Remember that each invocation of this tool will spawn a new shell process, so you can't rely on any state from previous invocations.
 #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
 pub struct TerminalToolInput {
     /// The one-liner command to execute.
-    command: String,
+    pub command: String,
     /// Working directory for the command. This must be one of the root directories of the project.
-    cd: String,
+    pub cd: String,
+    /// Optional maximum runtime (in milliseconds). If exceeded, the running terminal task is killed.
+    pub timeout_ms: Option<u64>,
 }
 
 pub struct TerminalTool {
@@ -116,7 +122,26 @@ impl AgentTool for TerminalTool {
                 acp::ToolCallContent::Terminal(acp::Terminal::new(terminal_id)),
             ]));
 
-            let exit_status = terminal.wait_for_exit(cx)?.await;
+            let timeout = input.timeout_ms.map(Duration::from_millis);
+
+            let exit_status = match timeout {
+                Some(timeout) => {
+                    let wait_for_exit = terminal.wait_for_exit(cx)?;
+                    let timeout_task = cx.background_spawn(async move {
+                        smol::Timer::after(timeout).await;
+                    });
+
+                    futures::select! {
+                        status = wait_for_exit.clone().fuse() => status,
+                        _ = timeout_task.fuse() => {
+                            terminal.kill(cx)?;
+                            wait_for_exit.await
+                        }
+                    }
+                }
+                None => terminal.wait_for_exit(cx)?.await,
+            };
+
             let output = terminal.current_output(cx)?;
 
             Ok(process_content(output, &input.command, exit_status))

crates/eval/src/instance.rs 🔗

@@ -625,6 +625,15 @@ impl agent::TerminalHandle for EvalTerminalHandle {
         self.terminal
             .read_with(cx, |term, cx| term.current_output(cx))
     }
+
+    fn kill(&self, cx: &AsyncApp) -> Result<()> {
+        cx.update(|cx| {
+            self.terminal.update(cx, |terminal, cx| {
+                terminal.kill(cx);
+            });
+        })?;
+        Ok(())
+    }
 }
 
 impl agent::ThreadEnvironment for EvalThreadEnvironment {

crates/terminal/src/terminal.rs 🔗

@@ -369,6 +369,7 @@ impl TerminalBuilder {
             last_content: Default::default(),
             last_mouse: None,
             matches: Vec::new(),
+
             selection_head: None,
             breadcrumb_text: String::new(),
             scroll_px: px(0.),
@@ -595,6 +596,7 @@ impl TerminalBuilder {
                 last_content: Default::default(),
                 last_mouse: None,
                 matches: Vec::new(),
+
                 selection_head: None,
                 breadcrumb_text: String::new(),
                 scroll_px: px(0.),
@@ -826,6 +828,7 @@ pub struct Terminal {
     pub matches: Vec<RangeInclusive<AlacPoint>>,
     pub last_content: TerminalContent,
     pub selection_head: Option<AlacPoint>,
+
     pub breadcrumb_text: String,
     title_override: Option<String>,
     scroll_px: Pixels,
@@ -939,7 +942,7 @@ impl Terminal {
             AlacTermEvent::Bell => {
                 cx.emit(Event::Bell);
             }
-            AlacTermEvent::Exit => self.register_task_finished(None, cx),
+            AlacTermEvent::Exit => self.register_task_finished(Some(9), cx),
             AlacTermEvent::MouseCursorDirty => {
                 //NOOP, Handled in render
             }