diff --git a/Cargo.lock b/Cargo.lock index 4984de3fbbb95021b67be0f9112af9f5ec7102af..b9cb4e0cf922ab085cc7df739e7cc2b05f693a91 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5019,6 +5019,7 @@ version = "0.1.0" dependencies = [ "agent", "anyhow", + "assistant_settings", "assistant_tool", "assistant_tools", "async-trait", diff --git a/crates/agent/src/active_thread.rs b/crates/agent/src/active_thread.rs index 0454a2772d758fe1bfd928027a5f760636088081..cd61b8e1b188266e70adf537eb9c15fe8a4305e1 100644 --- a/crates/agent/src/active_thread.rs +++ b/crates/agent/src/active_thread.rs @@ -1070,6 +1070,22 @@ impl ActiveThread { cx, ); } + ThreadEvent::MissingToolUse { + tool_use_id, + ui_text, + } => { + self.render_tool_use_markdown( + tool_use_id.clone(), + ui_text, + "", + self.thread + .read(cx) + .output_for_tool(tool_use_id) + .map(|output| output.clone().into()) + .unwrap_or("".into()), + cx, + ); + } } } diff --git a/crates/agent/src/agent_diff.rs b/crates/agent/src/agent_diff.rs index 6d24f8ad5078974a6390809fce02b55563c3ced9..894c0e5b933349cc668688167dc0a0bf74a4fca0 100644 --- a/crates/agent/src/agent_diff.rs +++ b/crates/agent/src/agent_diff.rs @@ -1372,6 +1372,7 @@ impl AgentDiff { | ThreadEvent::StreamedAssistantThinking(_, _) | ThreadEvent::StreamedToolUse { .. } | ThreadEvent::InvalidToolInput { .. } + | ThreadEvent::MissingToolUse { .. } | ThreadEvent::MessageAdded(_) | ThreadEvent::MessageEdited(_) | ThreadEvent::MessageDeleted(_) diff --git a/crates/agent/src/thread.rs b/crates/agent/src/thread.rs index 83266335e9c65c0c52d576b5b7cd583020f8e654..dedcdd425b3eab9844bda3aabe154adbd9b4e5ad 100644 --- a/crates/agent/src/thread.rs +++ b/crates/agent/src/thread.rs @@ -1911,12 +1911,54 @@ impl Thread { cx, ); } + } else { + self.handle_hallucinated_tool_use( + tool_use.id.clone(), + tool_use.name.clone(), + window, + cx, + ); } } pending_tool_uses } + pub fn handle_hallucinated_tool_use( + &mut self, + tool_use_id: LanguageModelToolUseId, + hallucinated_tool_name: Arc, + window: Option, + cx: &mut Context, + ) { + let available_tools = self.tools.read(cx).enabled_tools(cx); + + let tool_list = available_tools + .iter() + .map(|tool| format!("- {}: {}", tool.name(), tool.description())) + .collect::>() + .join("\n"); + + let error_message = format!( + "The tool '{}' doesn't exist or is not enabled. Available tools:\n{}", + hallucinated_tool_name, tool_list + ); + + let pending_tool_use = self.tool_use.insert_tool_output( + tool_use_id.clone(), + hallucinated_tool_name, + Err(anyhow!("Missing tool call: {error_message}")), + self.configured_model.as_ref(), + ); + + cx.emit(ThreadEvent::MissingToolUse { + tool_use_id: tool_use_id.clone(), + ui_text: error_message.into(), + }); + + self.tool_finished(tool_use_id, pending_tool_use, false, window, cx); + } + pub fn receive_invalid_tool_json( &mut self, tool_use_id: LanguageModelToolUseId, @@ -2574,6 +2616,10 @@ pub enum ThreadEvent { ui_text: Arc, input: serde_json::Value, }, + MissingToolUse { + tool_use_id: LanguageModelToolUseId, + ui_text: Arc, + }, InvalidToolInput { tool_use_id: LanguageModelToolUseId, ui_text: Arc, diff --git a/crates/eval/Cargo.toml b/crates/eval/Cargo.toml index 77fd920866f6e0c6a313780f83713c2e767be847..af7930ba51293baac741f348732455ccde4b2821 100644 --- a/crates/eval/Cargo.toml +++ b/crates/eval/Cargo.toml @@ -20,6 +20,7 @@ path = "src/explorer.rs" [dependencies] agent.workspace = true anyhow.workspace = true +assistant_settings.workspace = true assistant_tool.workspace = true assistant_tools.workspace = true async-trait.workspace = true diff --git a/crates/eval/src/example.rs b/crates/eval/src/example.rs index 5220be6de9eb5ae9ab246caa2485b0c742275647..d2478e89e3d757f8a52fb8579ee78c26d4d1dd4a 100644 --- a/crates/eval/src/example.rs +++ b/crates/eval/src/example.rs @@ -12,6 +12,7 @@ use crate::{ }; use agent::{ContextLoadResult, Thread, ThreadEvent}; use anyhow::{Result, anyhow}; +use assistant_settings::AgentProfileId; use async_trait::async_trait; use buffer_diff::DiffHunkStatus; use collections::HashMap; @@ -46,6 +47,7 @@ pub struct ExampleMetadata { pub revision: String, pub language_server: Option, pub max_assertions: Option, + pub profile_id: AgentProfileId, } #[derive(Clone, Debug)] @@ -268,6 +270,12 @@ impl ExampleContext { ThreadEvent::InvalidToolInput { .. } => { println!("{log_prefix} invalid tool input"); } + ThreadEvent::MissingToolUse { + tool_use_id: _, + ui_text, + } => { + println!("{log_prefix} {ui_text}"); + } ThreadEvent::ToolConfirmationNeeded => { panic!( "{}Bug: Tool confirmation should not be required in eval", diff --git a/crates/eval/src/examples/add_arg_to_trait_method.rs b/crates/eval/src/examples/add_arg_to_trait_method.rs index d797d08ce2ac9f2d9838f8a4a389b6d8e94a17e2..dbf2e8bd35dffb0ece0162f8092796d64b02632a 100644 --- a/crates/eval/src/examples/add_arg_to_trait_method.rs +++ b/crates/eval/src/examples/add_arg_to_trait_method.rs @@ -1,6 +1,7 @@ use std::path::Path; use anyhow::Result; +use assistant_settings::AgentProfileId; use async_trait::async_trait; use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion, LanguageServer}; @@ -19,6 +20,7 @@ impl Example for AddArgToTraitMethod { allow_preexisting_diagnostics: false, }), max_assertions: None, + profile_id: AgentProfileId::default(), } } diff --git a/crates/eval/src/examples/code_block_citations.rs b/crates/eval/src/examples/code_block_citations.rs index 90085a91acf5aadd16fda9ec9ea23d00fd0c3dbb..13fb346bf98373bf0dc13f5b46dfe89185a4585a 100644 --- a/crates/eval/src/examples/code_block_citations.rs +++ b/crates/eval/src/examples/code_block_citations.rs @@ -1,4 +1,5 @@ use anyhow::Result; +use assistant_settings::AgentProfileId; use async_trait::async_trait; use markdown::PathWithRange; @@ -20,6 +21,7 @@ impl Example for CodeBlockCitations { allow_preexisting_diagnostics: false, }), max_assertions: None, + profile_id: AgentProfileId::default(), } } diff --git a/crates/eval/src/examples/comment_translation.rs b/crates/eval/src/examples/comment_translation.rs index 8c57150ddbb1e952eaea2fd4c54cb85e3848637f..9796afaad6b5a2c5dba786b59d5d01be0e116dc4 100644 --- a/crates/eval/src/examples/comment_translation.rs +++ b/crates/eval/src/examples/comment_translation.rs @@ -1,5 +1,6 @@ use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion}; use anyhow::Result; +use assistant_settings::AgentProfileId; use assistant_tools::StreamingEditFileToolInput; use async_trait::async_trait; @@ -14,6 +15,7 @@ impl Example for CommentTranslation { revision: "504d084e29bce4f60614bc702e91af7f7d9e60ad".to_string(), language_server: None, max_assertions: Some(1), + profile_id: AgentProfileId::default(), } } diff --git a/crates/eval/src/examples/file_search.rs b/crates/eval/src/examples/file_search.rs index f4b5f752133d90379e0d848bad9c1f403aad3750..5da0d03f37844ed1e31a928338b1cfec7e3ba553 100644 --- a/crates/eval/src/examples/file_search.rs +++ b/crates/eval/src/examples/file_search.rs @@ -1,4 +1,5 @@ use anyhow::Result; +use assistant_settings::AgentProfileId; use assistant_tools::FindPathToolInput; use async_trait::async_trait; use regex::Regex; @@ -16,6 +17,7 @@ impl Example for FileSearchExample { revision: "03ecb88fe30794873f191ddb728f597935b3101c".to_string(), language_server: None, max_assertions: Some(3), + profile_id: AgentProfileId::default(), } } diff --git a/crates/eval/src/examples/hallucinated_tool_calls.toml b/crates/eval/src/examples/hallucinated_tool_calls.toml new file mode 100644 index 0000000000000000000000000000000000000000..f12f01affef576bc8ada0b34efe57709303c1e81 --- /dev/null +++ b/crates/eval/src/examples/hallucinated_tool_calls.toml @@ -0,0 +1,13 @@ +url = "https://github.com/jlowin/fastmcp" +revision = "a2c1e14e5d83af1c32b76280ab368df199c4e860" +language_extension = "py" + +prompt = "Write a LICENSE file just saying 'Apache 2.0' and nothing else" + +profile_name = "ask" + +[thread_assertions] + +no_edit_attempts = """The agent should not claim that it edited or created the file. It should not pretend making any changes.""" + +mention_insufficient_tools = """Agent should mention that it doesn't have relevant tools needed to make the change.""" diff --git a/crates/eval/src/examples/mod.rs b/crates/eval/src/examples/mod.rs index e8718b2ae4c6f9fd3e72b34b227c123fc14db0fd..d7604170d3d0df5cfb96d2ad8d34a36c0965c3bf 100644 --- a/crates/eval/src/examples/mod.rs +++ b/crates/eval/src/examples/mod.rs @@ -1,4 +1,5 @@ use anyhow::Result; +use assistant_settings::AgentProfileId; use async_trait::async_trait; use serde::Deserialize; use std::collections::BTreeMap; @@ -56,12 +57,19 @@ impl DeclarativeExample { None }; + let profile_id = if let Some(profile_name) = base.profile_name { + AgentProfileId(profile_name.into()) + } else { + AgentProfileId::default() + }; + let metadata = ExampleMetadata { name, url: base.url, revision: base.revision, language_server, max_assertions: None, + profile_id, }; Ok(DeclarativeExample { @@ -97,6 +105,8 @@ pub struct ExampleToml { pub allow_preexisting_diagnostics: bool, pub prompt: String, #[serde(default)] + pub profile_name: Option, + #[serde(default)] pub diff_assertions: BTreeMap, #[serde(default)] pub thread_assertions: BTreeMap, diff --git a/crates/eval/src/examples/planets.rs b/crates/eval/src/examples/planets.rs index c900ea87d95dc70b6b735afc00f87cd0bbcaaab6..9ccd077f949dc33c9052ef0a901f8ff72fab7c77 100644 --- a/crates/eval/src/examples/planets.rs +++ b/crates/eval/src/examples/planets.rs @@ -1,4 +1,5 @@ use anyhow::Result; +use assistant_settings::AgentProfileId; use assistant_tool::Tool; use assistant_tools::{OpenTool, TerminalTool}; use async_trait::async_trait; @@ -16,6 +17,7 @@ impl Example for Planets { revision: "59e49c75214f60b4dc4a45092292061c8c26ce27".to_string(), // so effectively a blank project. language_server: None, max_assertions: None, + profile_id: AgentProfileId::default(), } } diff --git a/crates/eval/src/instance.rs b/crates/eval/src/instance.rs index f9c9b72e306b92a26a9c5f64ae0701da37ebdfb1..44c1bb4eebacfb3ae94ab64c7d22aaba3a658438 100644 --- a/crates/eval/src/instance.rs +++ b/crates/eval/src/instance.rs @@ -307,9 +307,14 @@ impl ExampleInstance { std::fs::write(&last_diff_file_path, "")?; let thread_store = thread_store.await?; + + let profile_id = meta.profile_id.clone(); + thread_store.update(cx, |thread_store, cx| thread_store.load_profile_by_id(profile_id, cx)).expect("Failed to load profile"); + let thread = thread_store.update(cx, |thread_store, cx| thread_store.create_thread(cx))?; + thread.update(cx, |thread, _cx| { let mut request_count = 0; let previous_diff = Rc::new(RefCell::new("".to_string()));