grep_params_escapement.rs

 1use agent::GrepToolInput;
 2use agent_settings::AgentProfileId;
 3use anyhow::Result;
 4use async_trait::async_trait;
 5
 6use crate::example::{Example, ExampleContext, ExampleMetadata};
 7
 8pub struct GrepParamsEscapementExample;
 9
10/*
11
12This eval checks that the model doesn't use HTML escapement for characters like `<` and
13`>` in tool parameters.
14
15                      original     +system_prompt change    +tool description
16  claude-opus-4        89%          92%                     97%+
17  claude-sonnet-4      100%
18  gpt-4.1-mini         100%
19  gemini-2.5-pro                    98%
20
21*/
22
23#[async_trait(?Send)]
24impl Example for GrepParamsEscapementExample {
25    fn meta(&self) -> ExampleMetadata {
26        ExampleMetadata {
27            name: "grep_params_escapement".to_string(),
28            url: "https://github.com/octocat/hello-world".to_string(),
29            revision: "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d".to_string(),
30            language_server: None,
31            max_assertions: Some(1),
32            profile_id: AgentProfileId::default(),
33            existing_thread_json: None,
34            max_turns: Some(2),
35        }
36    }
37
38    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
39        let response = cx
40            .prompt_with_max_turns("Search for files containing the characters `>` or `<`", 2)
41            .await?;
42        let grep_input = response
43            .find_tool_call("grep")
44            .and_then(|tool_use| tool_use.parse_input::<GrepToolInput>().ok());
45
46        cx.assert_some(grep_input.as_ref(), "`grep` tool should be called")?;
47
48        cx.assert(
49            !contains_html_entities(&grep_input.unwrap().regex),
50            "Tool parameters should not be escaped",
51        )
52    }
53}
54
55fn contains_html_entities(pattern: &str) -> bool {
56    regex::Regex::new(r"&[a-zA-Z]+;|&#[0-9]+;|&#x[0-9a-fA-F]+;")
57        .unwrap()
58        .is_match(pattern)
59}