Add mismatched tag threshold parameter to eval function (#32190)

Ben Brandt created

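Replace the hardcoded 0.10 mismatched-tag threshold in `eval` with a
`mismatched_tag_threshold` parameter. Most evals now pass 0.05 explicitly;
the `eval_from_pixels_constructor` eval passes 0.2 because it currently
produces more mismatched tags. Also rename the Slack notification step in
the unit evals workflow to reflect that it only runs on failure.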

Release Notes:

- N/A

Change summary

.github/workflows/unit_evals.yml               |  2 +-
crates/assistant_tools/src/edit_agent/evals.rs | 20 ++++++++++++++++++--
2 files changed, 19 insertions(+), 3 deletions(-)

Detailed changes

.github/workflows/unit_evals.yml 🔗

@@ -66,7 +66,7 @@ jobs:
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
 
-      - name: Send the pull request link into the Slack channel
+      - name: Send failure message to Slack channel if needed
         if: ${{ failure() }}
         uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52
         with:

crates/assistant_tools/src/edit_agent/evals.rs 🔗

@@ -58,6 +58,7 @@ fn eval_extract_handle_command_output() {
     eval(
         100,
         0.7, // Taking the lower bar for Gemini
+        0.05,
         EvalInput::from_conversation(
             vec![
                 message(
@@ -116,6 +117,7 @@ fn eval_delete_run_git_blame() {
     eval(
         100,
         0.95,
+        0.05,
         EvalInput::from_conversation(
             vec![
                 message(
@@ -178,6 +180,7 @@ fn eval_translate_doc_comments() {
     eval(
         200,
         1.,
+        0.05,
         EvalInput::from_conversation(
             vec![
                 message(
@@ -241,6 +244,7 @@ fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
     eval(
         100,
         0.95,
+        0.05,
         EvalInput::from_conversation(
             vec![
                 message(
@@ -365,6 +369,7 @@ fn eval_disable_cursor_blinking() {
     eval(
         100,
         0.95,
+        0.05,
         EvalInput::from_conversation(
             vec![
                 message(User, [text("Let's research how to cursor blinking works.")]),
@@ -448,6 +453,9 @@ fn eval_from_pixels_constructor() {
     eval(
         100,
         0.95,
+        // For whatever reason, this eval produces more mismatched tags.
+        // Increasing for now, let's see if we can bring this down.
+        0.2,
         EvalInput::from_conversation(
             vec![
                 message(
@@ -648,6 +656,7 @@ fn eval_zode() {
     eval(
         50,
         1.,
+        0.05,
         EvalInput::from_conversation(
             vec![
                 message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
@@ -754,6 +763,7 @@ fn eval_add_overwrite_test() {
     eval(
         200,
         0.5, // TODO: make this eval better
+        0.05,
         EvalInput::from_conversation(
             vec![
                 message(
@@ -993,6 +1003,7 @@ fn eval_create_empty_file() {
     eval(
         100,
         0.99,
+        0.05,
         EvalInput::from_conversation(
             vec![
                 message(User, [text("Create a second empty todo file ")]),
@@ -1279,7 +1290,12 @@ impl EvalAssertion {
     }
 }
 
-fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
+fn eval(
+    iterations: usize,
+    expected_pass_ratio: f32,
+    mismatched_tag_threshold: f32,
+    mut eval: EvalInput,
+) {
     let mut evaluated_count = 0;
     let mut failed_count = 0;
     report_progress(evaluated_count, failed_count, iterations);
@@ -1351,7 +1367,7 @@ fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
 
     let mismatched_tag_ratio =
         cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
-    if mismatched_tag_ratio > 0.10 {
+    if mismatched_tag_ratio > mismatched_tag_threshold {
         for eval_output in eval_outputs {
             println!("{}", eval_output);
         }