grep_params_escapement.rs

 1use agent_settings::AgentProfileId;
 2use anyhow::Result;
 3use async_trait::async_trait;
 4
 5use crate::example::{Example, ExampleContext, ExampleMetadata};
 6
/// Eval example that checks the `grep` tool is called with a regex containing
/// no HTML entities (see the `Example` impl below for the exact assertions).
pub struct GrepParamsEscapementExample;
 8
 9/*
10
This eval checks that the model doesn't HTML-escape characters like `<` and `>`
(e.g. as `&lt;` / `&gt;`) in tool parameters.
13
14                      original     +system_prompt change    +tool description
15  claude-opus-4        89%          92%                     97%+
16  claude-sonnet-4      100%
17  gpt-4.1-mini         100%
18  gemini-2.5-pro                    98%
19
20*/
21
22#[async_trait(?Send)]
23impl Example for GrepParamsEscapementExample {
24    fn meta(&self) -> ExampleMetadata {
25        ExampleMetadata {
26            name: "grep_params_escapement".to_string(),
27            url: "https://github.com/octocat/hello-world".to_string(),
28            revision: "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d".to_string(),
29            language_server: None,
30            max_assertions: Some(1),
31            profile_id: AgentProfileId::default(),
32            existing_thread_json: None,
33            max_turns: Some(2),
34        }
35    }
36
37    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
38        // cx.push_user_message("How does the precedence/specificity work with Keymap contexts? I am seeing that `MessageEditor > Editor` is lower precendence than `Editor` which is surprising to me, but might be how it works");
39        cx.push_user_message("Search for files containing the characters `>` or `<`");
40        let response = cx.run_turns(2).await?;
41        let grep_input = response
42            .find_tool_call("grep")
43            .and_then(|tool_use| tool_use.parse_input::<GrepToolInput>().ok());
44
45        cx.assert_some(grep_input.as_ref(), "`grep` tool should be called")?;
46
47        cx.assert(
48            !contains_html_entities(&grep_input.unwrap().regex),
49            "Tool parameters should not be escaped",
50        )
51    }
52}
53
54fn contains_html_entities(pattern: &str) -> bool {
55    regex::Regex::new(r"&[a-zA-Z]+;|&#[0-9]+;|&#x[0-9a-fA-F]+;")
56        .unwrap()
57        .is_match(pattern)
58}