1use agent_settings::AgentProfileId;
2use anyhow::Result;
3use async_trait::async_trait;
4
5use crate::example::{Example, ExampleContext, ExampleMetadata};
6
/// Eval example that checks the model passes characters such as `<` and `>`
/// to tool parameters verbatim instead of HTML-escaping them.
///
/// The type carries no state; all behaviour lives in its `Example` impl.
#[derive(Debug, Clone, Copy, Default)]
pub struct GrepParamsEscapementExample;
8
9/*
10
This eval checks that the model doesn't HTML-escape characters like `<` and `>`
(e.g. as `&lt;` and `&gt;`) in tool parameters.
13
                    original    +system_prompt change    +tool description
  claude-opus-4     89%         92%                      97%+
  claude-sonnet-4   100%
  gpt-4.1-mini      100%
  gemini-2.5-pro    98%
19
20*/
21
22#[async_trait(?Send)]
23impl Example for GrepParamsEscapementExample {
24 fn meta(&self) -> ExampleMetadata {
25 ExampleMetadata {
26 name: "grep_params_escapement".to_string(),
27 url: "https://github.com/octocat/hello-world".to_string(),
28 revision: "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d".to_string(),
29 language_server: None,
30 max_assertions: Some(1),
31 profile_id: AgentProfileId::default(),
32 existing_thread_json: None,
33 max_turns: Some(2),
34 }
35 }
36
37 async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
38 // cx.push_user_message("How does the precedence/specificity work with Keymap contexts? I am seeing that `MessageEditor > Editor` is lower precendence than `Editor` which is surprising to me, but might be how it works");
39 cx.push_user_message("Search for files containing the characters `>` or `<`");
40 let response = cx.run_turns(2).await?;
41 let grep_input = response
42 .find_tool_call("grep")
43 .and_then(|tool_use| tool_use.parse_input::<GrepToolInput>().ok());
44
45 cx.assert_some(grep_input.as_ref(), "`grep` tool should be called")?;
46
47 cx.assert(
48 !contains_html_entities(&grep_input.unwrap().regex),
49 "Tool parameters should not be escaped",
50 )
51 }
52}
53
54fn contains_html_entities(pattern: &str) -> bool {
55 regex::Regex::new(r"&[a-zA-Z]+;|&#[0-9]+;|&#x[0-9a-fA-F]+;")
56 .unwrap()
57 .is_match(pattern)
58}