diff --git a/assets/prompts/assistant_system_prompt.hbs b/assets/prompts/assistant_system_prompt.hbs index 0aaceb156a11ffc0574e68065d9d031208f02e68..a155dea19d08b578b561b90da5d044d40bea72db 100644 --- a/assets/prompts/assistant_system_prompt.hbs +++ b/assets/prompts/assistant_system_prompt.hbs @@ -17,13 +17,13 @@ You are a highly skilled software engineer with extensive knowledge in many prog 4. Use only the tools that are currently available. 5. DO NOT use a tool that is not available just because it appears in the conversation. This means the user turned it off. 6. NEVER run commands that don't terminate on their own such as web servers (like `npm run start`, `npm run dev`, `python -m http.server`, etc) or file watchers. +7. Avoid HTML entity escaping - use plain characters instead. ## Searching and Reading If you are unsure how to fulfill the user's request, gather more information with tool calls and/or clarifying questions. {{! TODO: If there are files, we should mention it but otherwise omit that fact }} -{{#if has_tools}} If appropriate, use tool calls to explore the current project, which contains the following root directories: {{#each worktrees}} @@ -38,7 +38,6 @@ If appropriate, use tool calls to explore the current project, which contains th - As you learn about the structure of the project, use that information to scope `grep` searches to targeted subtrees of the project. - The user might specify a partial file path. If you don't know the full path, use `find_path` (not `grep`) before you read the file. {{/if}} -{{/if}} {{else}} You are being tasked with providing a response, but you have no ability to use tools or to read or write any aspect of the user's system (other than any context the user might have provided to you). diff --git a/crates/assistant_tools/src/assistant_tools.rs b/crates/assistant_tools/src/assistant_tools.rs index de57e8ebb314ccf248764f0b327dda89ed0d5608..83312a07b625404085694194b92ee7c732a67998 100644 --- a/crates/assistant_tools/src/assistant_tools.rs +++ b/crates/assistant_tools/src/assistant_tools.rs @@ -37,13 +37,13 @@ use crate::diagnostics_tool::DiagnosticsTool; use crate::edit_file_tool::EditFileTool; use crate::fetch_tool::FetchTool; use crate::find_path_tool::FindPathTool; -use crate::grep_tool::GrepTool; use crate::list_directory_tool::ListDirectoryTool; use crate::now_tool::NowTool; use crate::thinking_tool::ThinkingTool; pub use edit_file_tool::{EditFileMode, EditFileToolInput}; pub use find_path_tool::FindPathToolInput; +pub use grep_tool::{GrepTool, GrepToolInput}; pub use open_tool::OpenTool; pub use read_file_tool::{ReadFileTool, ReadFileToolInput}; pub use terminal_tool::TerminalTool; diff --git a/crates/assistant_tools/src/grep_tool/description.md b/crates/assistant_tools/src/grep_tool/description.md index 33983e66ddc1d128a43513ab29efc40e9942c4ae..e3c0b43f31da53df49ce905e764dedcc5ea530de 100644 --- a/crates/assistant_tools/src/grep_tool/description.md +++ b/crates/assistant_tools/src/grep_tool/description.md @@ -6,3 +6,4 @@ Searches the contents of files in the project with a regular expression - Never use this tool to search for paths. Only search file contents with this tool. - Use this tool when you need to find files containing specific patterns - Results are paginated with 20 matches per page. Use the optional 'offset' parameter to request subsequent pages. +- DO NOT use HTML entities solely to escape characters in the tool parameters. diff --git a/crates/eval/src/examples/grep_params_escapement.rs b/crates/eval/src/examples/grep_params_escapement.rs new file mode 100644 index 0000000000000000000000000000000000000000..0532698ba28b45bd8111767eb51ea1336e18fa13 --- /dev/null +++ b/crates/eval/src/examples/grep_params_escapement.rs @@ -0,0 +1,59 @@ +use agent_settings::AgentProfileId; +use anyhow::Result; +use assistant_tools::GrepToolInput; +use async_trait::async_trait; + +use crate::example::{Example, ExampleContext, ExampleMetadata}; + +pub struct GrepParamsEscapementExample; + +/* + +This eval checks that the model doesn't use HTML escapement for characters like `<` and +`>` in tool parameters. + + original +system_prompt change +tool description + claude-opus-4 89% 92% 97%+ + claude-sonnet-4 100% + gpt-4.1-mini 100% + gemini-2.5-pro 98% + +*/ + +#[async_trait(?Send)] +impl Example for GrepParamsEscapementExample { + fn meta(&self) -> ExampleMetadata { + ExampleMetadata { + name: "grep_params_escapement".to_string(), + url: "https://github.com/octocat/hello-world".to_string(), + revision: "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d".to_string(), + language_server: None, + max_assertions: Some(1), + profile_id: AgentProfileId::default(), + existing_thread_json: None, + max_turns: Some(2), + } + } + + async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> { + // cx.push_user_message("How does the precedence/specificity work with Keymap contexts? I am seeing that `MessageEditor > Editor` is lower precendence than `Editor` which is surprising to me, but might be how it works"); + cx.push_user_message("Search for files containing the characters `>` or `<`"); + let response = cx.run_turns(2).await?; + let grep_input = response + .find_tool_call("grep") + .and_then(|tool_use| tool_use.parse_input::().ok()); + + cx.assert_some(grep_input.as_ref(), "`grep` tool should be called")?; + + cx.assert( + !contains_html_entities(&grep_input.unwrap().regex), + "Tool parameters should not be escaped", + ) + } +} + +fn contains_html_entities(pattern: &str) -> bool { + regex::Regex::new(r"&[a-zA-Z]+;|&#[0-9]+;|&#x[0-9a-fA-F]+;") + .unwrap() + .is_match(pattern) +} diff --git a/crates/eval/src/examples/mod.rs b/crates/eval/src/examples/mod.rs index 5968ee2fd0b599152d60702f3fc8baa045fe1e7f..26c4c95e50027cad70a3b94e596c52d8906a2dc0 100644 --- a/crates/eval/src/examples/mod.rs +++ b/crates/eval/src/examples/mod.rs @@ -16,6 +16,7 @@ mod add_arg_to_trait_method; mod code_block_citations; mod comment_translation; mod file_search; +mod grep_params_escapement; mod overwrite_file; mod planets; @@ -27,6 +28,7 @@ pub fn all(examples_dir: &Path) -> Vec> { Rc::new(planets::Planets), Rc::new(comment_translation::CommentTranslation), Rc::new(overwrite_file::FileOverwriteExample), + Rc::new(grep_params_escapement::GrepParamsEscapementExample), ]; for example_path in list_declarative_examples(examples_dir).unwrap() {