Detailed changes
@@ -17,13 +17,13 @@ You are a highly skilled software engineer with extensive knowledge in many prog
4. Use only the tools that are currently available.
5. DO NOT use a tool that is not available just because it appears in the conversation. This means the user turned it off.
6. NEVER run commands that don't terminate on their own such as web servers (like `npm run start`, `npm run dev`, `python -m http.server`, etc) or file watchers.
+7. Avoid HTML entity escaping - use plain characters instead.
## Searching and Reading
If you are unsure how to fulfill the user's request, gather more information with tool calls and/or clarifying questions.
{{! TODO: If there are files, we should mention it but otherwise omit that fact }}
-{{#if has_tools}}
If appropriate, use tool calls to explore the current project, which contains the following root directories:
{{#each worktrees}}
@@ -38,7 +38,6 @@ If appropriate, use tool calls to explore the current project, which contains th
- As you learn about the structure of the project, use that information to scope `grep` searches to targeted subtrees of the project.
- The user might specify a partial file path. If you don't know the full path, use `find_path` (not `grep`) before you read the file.
{{/if}}
-{{/if}}
{{else}}
You are being tasked with providing a response, but you have no ability to use tools or to read or write any aspect of the user's system (other than any context the user might have provided to you).
@@ -37,13 +37,13 @@ use crate::diagnostics_tool::DiagnosticsTool;
use crate::edit_file_tool::EditFileTool;
use crate::fetch_tool::FetchTool;
use crate::find_path_tool::FindPathTool;
-use crate::grep_tool::GrepTool;
use crate::list_directory_tool::ListDirectoryTool;
use crate::now_tool::NowTool;
use crate::thinking_tool::ThinkingTool;
pub use edit_file_tool::{EditFileMode, EditFileToolInput};
pub use find_path_tool::FindPathToolInput;
+pub use grep_tool::{GrepTool, GrepToolInput};
pub use open_tool::OpenTool;
pub use read_file_tool::{ReadFileTool, ReadFileToolInput};
pub use terminal_tool::TerminalTool;
@@ -6,3 +6,4 @@ Searches the contents of files in the project with a regular expression
- Never use this tool to search for paths. Only search file contents with this tool.
- Use this tool when you need to find files containing specific patterns
- Results are paginated with 20 matches per page. Use the optional 'offset' parameter to request subsequent pages.
+- DO NOT use HTML entities solely to escape characters in the tool parameters.
@@ -0,0 +1,59 @@
+use agent_settings::AgentProfileId;
+use anyhow::Result;
+use assistant_tools::GrepToolInput;
+use async_trait::async_trait;
+
+use crate::example::{Example, ExampleContext, ExampleMetadata};
+
+pub struct GrepParamsEscapementExample; // Eval: the regex the model passes to `grep` must not contain HTML entities.
+
+/*
+
+This eval checks that the model doesn't HTML-escape characters like `<` and `>` (for
+example as `&lt;` and `&gt;`) when filling in tool parameters.
+
+ original +system_prompt change +tool description
+ claude-opus-4 89% 92% 97%+
+ claude-sonnet-4 100%
+ gpt-4.1-mini 100%
+ gemini-2.5-pro 98%
+
+*/
+
+#[async_trait(?Send)]
+impl Example for GrepParamsEscapementExample {
+ fn meta(&self) -> ExampleMetadata {
+ ExampleMetadata {
+ name: "grep_params_escapement".to_string(),
+ url: "https://github.com/octocat/hello-world".to_string(),
+ revision: "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d".to_string(),
+ language_server: None,
+ max_assertions: Some(1),
+ profile_id: AgentProfileId::default(),
+ existing_thread_json: None,
+ max_turns: Some(2), // same count as the `run_turns(2)` call in `conversation`
+ }
+ }
+
+ async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
+ // cx.push_user_message("How does the precedence/specificity work with Keymap contexts? I am seeing that `MessageEditor > Editor` is lower precedence than `Editor` which is surprising to me, but might be how it works");
+ cx.push_user_message("Search for files containing the characters `>` or `<`"); // names the characters a model might wrongly emit as `&gt;` / `&lt;`
+ let response = cx.run_turns(2).await?;
+ let grep_input = response
+ .find_tool_call("grep")
+ .and_then(|tool_use| tool_use.parse_input::<GrepToolInput>().ok()); // a missing `grep` call and an unparsable input both end up as `None`
+
+ cx.assert_some(grep_input.as_ref(), "`grep` tool should be called")?; // `?` propagates a failed assertion; presumably this keeps the `unwrap` below from seeing `None` (confirm)
+
+ cx.assert(
+ !contains_html_entities(&grep_input.unwrap().regex),
+ "Tool parameters should not be escaped",
+ )
+ }
+}
+
+fn contains_html_entities(pattern: &str) -> bool { // true when `pattern` contains text shaped like an HTML entity
+ regex::Regex::new(r"&[a-zA-Z]+;|&#[0-9]+;|&#x[0-9a-fA-F]+;") // named (`&lt;`), decimal (`&#60;`) or lowercase-`x` hex (`&#x3C;`) forms
+ .unwrap() // the pattern is a fixed literal
+ .is_match(pattern)
+}
@@ -16,6 +16,7 @@ mod add_arg_to_trait_method;
mod code_block_citations;
mod comment_translation;
mod file_search;
+mod grep_params_escapement;
mod overwrite_file;
mod planets;
@@ -27,6 +28,7 @@ pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
Rc::new(planets::Planets),
Rc::new(comment_translation::CommentTranslation),
Rc::new(overwrite_file::FileOverwriteExample),
+ Rc::new(grep_params_escapement::GrepParamsEscapementExample),
];
for example_path in list_declarative_examples(examples_dir).unwrap() {