evals.rs

   1use super::*;
   2use crate::{
   3    EditFileMode, EditFileToolInput, GrepToolInput, ListDirectoryToolInput, ReadFileToolInput,
   4};
   5use Role::*;
   6use client::{Client, UserStore};
   7use collections::HashMap;
   8use fs::FakeFs;
   9use futures::{FutureExt, future::LocalBoxFuture};
  10use gpui::{AppContext, TestAppContext, Timer};
  11use http_client::StatusCode;
  12use indoc::{formatdoc, indoc};
  13use language_model::{
  14    LanguageModelRegistry, LanguageModelToolResult, LanguageModelToolResultContent,
  15    LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
  16};
  17use project::Project;
  18use prompt_store::{ProjectContext, WorktreeContext};
  19use rand::prelude::*;
  20use reqwest_client::ReqwestClient;
  21use serde_json::json;
  22use std::{
  23    cmp::Reverse,
  24    fmt::{self, Display},
  25    io::Write as _,
  26    path::Path,
  27    str::FromStr,
  28    sync::mpsc,
  29    time::Duration,
  30};
  31use util::path;
  32
  33#[test]
  34#[cfg_attr(not(feature = "unit-eval"), ignore)]
  35fn eval_extract_handle_command_output() {
  36    // Test how well agent generates multiple edit hunks.
  37    //
  38    // Model                       | Pass rate
  39    // ----------------------------|----------
  40    // claude-3.7-sonnet           |  0.99 (2025-06-14)
  41    // claude-sonnet-4             |  0.97 (2025-06-14)
  42    // gemini-2.5-pro-06-05        |  0.98 (2025-06-16)
  43    // gemini-2.5-flash            |  0.11 (2025-05-22)
  44    // gpt-4.1                     |  1.00 (2025-05-22)
  45    // claude-sonnet-4.5           |  0.79 (2025-11-10)
  46
  47    let input_file_path = "root/blame.rs";
  48    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
  49    let possible_diffs = vec![
  50        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
  51        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
  52        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
  53        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
  54        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
  55        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
  56        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
  57    ];
  58    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
  59    eval(
  60        100,
  61        0.95,
  62        0.05,
  63        EvalInput::from_conversation(
  64            vec![
  65                message(
  66                    User,
  67                    [text(formatdoc! {"
  68                        Read the `{input_file_path}` file and extract a method in
  69                        the final stanza of `run_git_blame` to deal with command failures,
  70                        call it `handle_command_output` and take the std::process::Output as the only parameter.
  71                        Do not document the method and do not add any comments.
  72
  73                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
  74                    "})],
  75                ),
  76                message(
  77                    Assistant,
  78                    [tool_use(
  79                        "tool_1",
  80                        "read_file",
  81                        ReadFileToolInput {
  82                            path: input_file_path.into(),
  83                            start_line: None,
  84                            end_line: None,
  85                        },
  86                    )],
  87                ),
  88                message(
  89                    User,
  90                    [tool_result("tool_1", "read_file", input_file_content)],
  91                ),
  92                message(
  93                    Assistant,
  94                    [tool_use(
  95                        "tool_2",
  96                        "edit_file",
  97                        EditFileToolInput {
  98                            display_description: edit_description.into(),
  99                            path: input_file_path.into(),
 100                            mode: EditFileMode::Edit,
 101                        },
 102                    )],
 103                ),
 104            ],
 105            Some(input_file_content.into()),
 106            EvalAssertion::assert_diff_any(possible_diffs),
 107        ),
 108    );
 109}
 110
 111#[test]
 112#[cfg_attr(not(feature = "unit-eval"), ignore)]
 113fn eval_delete_run_git_blame() {
 114    // Model                       | Pass rate
 115    // ----------------------------|----------
 116    // claude-3.7-sonnet           | 1.0  (2025-06-14)
 117    // claude-sonnet-4             | 0.96 (2025-06-14)
 118    // gemini-2.5-pro-06-05        | 1.0  (2025-06-16)
 119    // gemini-2.5-flash            |
 120    // gpt-4.1                     |
 121
 122    let input_file_path = "root/blame.rs";
 123    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
 124    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
 125    let edit_description = "Delete the `run_git_blame` function.";
 126    eval(
 127        100,
 128        0.95,
 129        0.05,
 130        EvalInput::from_conversation(
 131            vec![
 132                message(
 133                    User,
 134                    [text(formatdoc! {"
 135                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
 136                        one function, not its usages.
 137                    "})],
 138                ),
 139                message(
 140                    Assistant,
 141                    [tool_use(
 142                        "tool_1",
 143                        "read_file",
 144                        ReadFileToolInput {
 145                            path: input_file_path.into(),
 146                            start_line: None,
 147                            end_line: None,
 148                        },
 149                    )],
 150                ),
 151                message(
 152                    User,
 153                    [tool_result("tool_1", "read_file", input_file_content)],
 154                ),
 155                message(
 156                    Assistant,
 157                    [tool_use(
 158                        "tool_2",
 159                        "edit_file",
 160                        EditFileToolInput {
 161                            display_description: edit_description.into(),
 162                            path: input_file_path.into(),
 163                            mode: EditFileMode::Edit,
 164                        },
 165                    )],
 166                ),
 167            ],
 168            Some(input_file_content.into()),
 169            EvalAssertion::assert_eq(output_file_content),
 170        ),
 171    );
 172}
 173
 174#[test]
 175#[cfg_attr(not(feature = "unit-eval"), ignore)]
 176fn eval_translate_doc_comments() {
 177    //  Model                          | Pass rate
 178    // ============================================
 179    //
 180    //  claude-3.7-sonnet              |  1.0  (2025-06-14)
 181    //  claude-sonnet-4                |  1.0  (2025-06-14)
 182    //  gemini-2.5-pro-preview-03-25   |  1.0  (2025-05-22)
 183    //  gemini-2.5-flash-preview-04-17 |
 184    //  gpt-4.1                        |
 185
 186    let input_file_path = "root/canvas.rs";
 187    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
 188    let edit_description = "Translate all doc comments to Italian";
 189    eval(
 190        200,
 191        1.,
 192        0.05,
 193        EvalInput::from_conversation(
 194            vec![
 195                message(
 196                    User,
 197                    [text(formatdoc! {"
 198                        Read the {input_file_path} file and edit it (without overwriting it),
 199                        translating all the doc comments to italian.
 200                    "})],
 201                ),
 202                message(
 203                    Assistant,
 204                    [tool_use(
 205                        "tool_1",
 206                        "read_file",
 207                        ReadFileToolInput {
 208                            path: input_file_path.into(),
 209                            start_line: None,
 210                            end_line: None,
 211                        },
 212                    )],
 213                ),
 214                message(
 215                    User,
 216                    [tool_result("tool_1", "read_file", input_file_content)],
 217                ),
 218                message(
 219                    Assistant,
 220                    [tool_use(
 221                        "tool_2",
 222                        "edit_file",
 223                        EditFileToolInput {
 224                            display_description: edit_description.into(),
 225                            path: input_file_path.into(),
 226                            mode: EditFileMode::Edit,
 227                        },
 228                    )],
 229                ),
 230            ],
 231            Some(input_file_content.into()),
 232            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
 233        ),
 234    );
 235}
 236
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Multi-step edit eval: the agent reads the relevant region of a large
    // file in three chunks, then rewrites `compile_parser_to_wasm` to use
    // wasi-sdk. An LLM judge scores the resulting diff.
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  0.96 (2025-06-14)
    //  claude-sonnet-4                |  0.11 (2025-06-14)
    //  gemini-2.5-pro-preview-latest  |  0.99 (2025-06-16)
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |
    //  claude-sonnet-4.5              |  0.25 (2025-11-10)

    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        // Presumably: sample count, minimum pass rate, and failure tolerance —
        // see `eval`'s definition for the exact semantics of each parameter.
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                // NOTE(review): the asset list in this prompt repeats both
                // linux archives — presumably a copy-paste slip, but the pass
                // rates above were measured against this exact prompt text,
                // so confirm before "fixing" it.
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                // Three consecutive paginated reads of lines 971-1150.
                // NOTE(review): adjacent ranges share a boundary (…1050 /
                // 1050…, …1100 / 1100…) — confirm whether `lines` treats the
                // range end as exclusive so no line is duplicated.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                // The edit under evaluation: the agent's `edit_file` response
                // is sampled and its diff judged below.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}
 363
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // The agent must comment out BlinkManager interactions after seeing only
    // scattered grep excerpts of a very large file. Any of four diffs passes.
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  0.59 (2025-07-14)
    //  claude-sonnet-4                |  0.81 (2025-07-14)
    //  gemini-2.5-pro                 |  0.95 (2025-07-14)
    //  gemini-2.5-flash-preview-04-17 |  0.78 (2025-07-14)
    //  gpt-4.1                        |  0.00 (2025-07-14) (follows edit_description too literally)
    //  claude-sonnet-4.5              |  0.20 (2025-11-10)

    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    let possible_diffs = vec![
        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
    ];
    eval(
        // Threshold is deliberately low (0.51): pass rates on this eval vary
        // widely across models, per the table above.
        100,
        0.51,
        0.05,
        EvalInput::from_conversation(
            vec![
                // NOTE(review): the prompt typos below ("how to cursor
                // blinking works", "comments everything") are part of the
                // measured fixture — leave them as-is.
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                // Simulated grep output: several excerpts of the fixture.
                // NOTE(review): `join` inserts the separator only between
                // chunks, so the first excerpt carries no "Match found:"
                // header — confirm that is the intended shape.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                // NOTE(review): tool id skips from "tool_1" to "tool_4" —
                // presumably ids only need to be unique within the
                // conversation; verify against the harness.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
 450
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_from_pixels_constructor() {
    // The agent must add a `from_pixels` constructor plus tests after a
    // full-file read and a few (mostly empty) grep probes for existing tests.
    //
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    //  Model                          | Date        | Pass rate
    // =========================================================
    //  claude-4.0-sonnet              | 2025-06-14  | 0.99
    //  claude-3.7-sonnet              | 2025-06-14  | 0.88
    //  gemini-2.5-pro-preview-06-05   | 2025-06-16  | 0.98
    //  gpt-4.1                        |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing for now, let's see if we can bring this down.
        0.25,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // Two grep probes for an existing test module come back empty…
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                // …then a broader probe for `#[test]` finds sibling examples.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                // Verbatim canned grep output, giving the agent test-module
                // style examples from elsewhere in the crate.
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                // The edit under evaluation.
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                    - The diff contains a new `from_pixels` constructor
                    - The diff contains new tests for the `from_pixels` constructor
                "}),
        ),
    );
}
 660
 661#[test]
 662#[cfg_attr(not(feature = "unit-eval"), ignore)]
 663fn eval_zode() {
 664    //  Model                          | Pass rate
 665    // ============================================
 666    //
 667    //  claude-3.7-sonnet              |  1.0 (2025-06-14)
 668    //  claude-sonnet-4                |  1.0 (2025-06-14)
 669    //  gemini-2.5-pro-preview-03-25   |  1.0 (2025-05-22)
 670    //  gemini-2.5-flash-preview-04-17 |  1.0 (2025-05-22)
 671    //  gpt-4.1                        |  1.0 (2025-05-22)
 672
 673    let input_file_path = "root/zode.py";
 674    let input_content = None;
 675    let edit_description = "Create the main Zode CLI script";
 676    eval(
 677        50,
 678        1.,
 679        0.05,
 680        EvalInput::from_conversation(
 681            vec![
 682                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
 683                message(
 684                    Assistant,
 685                    [
 686                        tool_use(
 687                            "tool_1",
 688                            "read_file",
 689                            ReadFileToolInput {
 690                                path: "root/eval/react.py".into(),
 691                                start_line: None,
 692                                end_line: None,
 693                            },
 694                        ),
 695                        tool_use(
 696                            "tool_2",
 697                            "read_file",
 698                            ReadFileToolInput {
 699                                path: "root/eval/react_test.py".into(),
 700                                start_line: None,
 701                                end_line: None,
 702                            },
 703                        ),
 704                    ],
 705                ),
 706                message(
 707                    User,
 708                    [
 709                        tool_result(
 710                            "tool_1",
 711                            "read_file",
 712                            include_str!("evals/fixtures/zode/react.py"),
 713                        ),
 714                        tool_result(
 715                            "tool_2",
 716                            "read_file",
 717                            include_str!("evals/fixtures/zode/react_test.py"),
 718                        ),
 719                    ],
 720                ),
 721                message(
 722                    Assistant,
 723                    [
 724                        text(
 725                            "Now that I understand what we need to build, I'll create the main Python script:",
 726                        ),
 727                        tool_use(
 728                            "tool_3",
 729                            "edit_file",
 730                            EditFileToolInput {
 731                                display_description: edit_description.into(),
 732                                path: input_file_path.into(),
 733                                mode: EditFileMode::Create,
 734                            },
 735                        ),
 736                    ],
 737                ),
 738            ],
 739            input_content,
 740            EvalAssertion::new(async move |sample, _, _cx| {
 741                let invalid_starts = [' ', '`', '\n'];
 742                let mut message = String::new();
 743                for start in invalid_starts {
 744                    if sample.text_after.starts_with(start) {
 745                        message.push_str(&format!("The sample starts with a {:?}\n", start));
 746                        break;
 747                    }
 748                }
 749                // Remove trailing newline.
 750                message.pop();
 751
 752                if message.is_empty() {
 753                    Ok(EvalAssertionOutcome {
 754                        score: 100,
 755                        message: None,
 756                    })
 757                } else {
 758                    Ok(EvalAssertionOutcome {
 759                        score: 0,
 760                        message: Some(message),
 761                    })
 762                }
 763            }),
 764        ),
 765    );
 766}
 767
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_add_overwrite_test() {
    // Measures whether the agent can append a brand-new test to an existing
    // test module without touching any of the neighboring tests.
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  0.65 (2025-06-14)
    //  claude-sonnet-4                |  0.07 (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  0.35 (2025-05-22)
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |
    //  claude-sonnet-4.5              |  0.34 (2025-11-10)

    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    // 200 iterations; require a 0.5 pass ratio and at most 5% mismatched tags.
    // The conversation below is a pre-recorded transcript replayed to the
    // model, ending with the `edit_file` tool use under evaluation.
    eval(
        200,
        0.5, // TODO: make this eval better
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                // The agent first reads the file outline (no line range).
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                             struct HunkStatus [L1538-1542]
                              range [L1539]
                              diff_status [L1540]
                              old_text [L1541]
                             fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                // Then it reads a few relevant regions of the fixture; the
                // `lines(...)` helper slices the fixture by line index.
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                // Finally, the assistant announces its plan and issues the
                // `edit_file` call whose output gets scored by the judge.
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
1000
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  1.00 (2025-06-14)
    //  claude-sonnet-4                |  1.00 (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  1.00 (2025-05-21)
    //  gemini-2.5-flash-preview-04-17 |  1.00 (2025-05-21)
    //  gpt-4.1                        |  1.00 (2025-05-21)
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"

    // Brand-new file: no prior contents, and a correct run leaves it empty.
    let input_file_content = None;
    let expected_output_content = String::new();
    // 100 iterations; require a 0.99 pass ratio and at most 5% mismatched tags.
    eval(
        100,
        0.99,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                        I'll help you create a second empty todo file.
                        First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                        I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                    "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
1079
1080fn message(
1081    role: Role,
1082    contents: impl IntoIterator<Item = MessageContent>,
1083) -> LanguageModelRequestMessage {
1084    LanguageModelRequestMessage {
1085        role,
1086        content: contents.into_iter().collect(),
1087        cache: false,
1088    }
1089}
1090
1091fn text(text: impl Into<String>) -> MessageContent {
1092    MessageContent::Text(text.into())
1093}
1094
/// Returns the lines of `input` whose zero-based indices fall within `range`,
/// joined with newlines. Indices past the end of `input` yield nothing.
fn lines(input: &str, range: Range<usize>) -> String {
    let selected: Vec<&str> = input
        .lines()
        .enumerate()
        .filter(|(ix, _)| range.contains(ix))
        .map(|(_, line)| line)
        .collect();
    selected.join("\n")
}
1103
1104fn tool_use(
1105    id: impl Into<Arc<str>>,
1106    name: impl Into<Arc<str>>,
1107    input: impl Serialize,
1108) -> MessageContent {
1109    MessageContent::ToolUse(LanguageModelToolUse {
1110        id: LanguageModelToolUseId::from(id.into()),
1111        name: name.into(),
1112        raw_input: serde_json::to_string_pretty(&input).unwrap(),
1113        input: serde_json::to_value(input).unwrap(),
1114        is_input_complete: true,
1115    })
1116}
1117
1118fn tool_result(
1119    id: impl Into<Arc<str>>,
1120    name: impl Into<Arc<str>>,
1121    result: impl Into<Arc<str>>,
1122) -> MessageContent {
1123    MessageContent::ToolResult(LanguageModelToolResult {
1124        tool_use_id: LanguageModelToolUseId::from(id.into()),
1125        tool_name: name.into(),
1126        is_error: false,
1127        content: LanguageModelToolResultContent::Text(result.into()),
1128        output: None,
1129    })
1130}
1131
/// A fully-scripted eval case: the conversation replayed to the model, the
/// `edit_file` call under test, the file's starting contents, and how to
/// score the result.
#[derive(Clone)]
struct EvalInput {
    /// Conversation replayed to the agent; must end with an assistant message
    /// containing an `edit_file` tool use.
    conversation: Vec<LanguageModelRequestMessage>,
    /// Parsed input of that final `edit_file` tool use.
    edit_file_input: EditFileToolInput,
    /// Initial contents of the target file (`None` when creating a new file).
    input_content: Option<String>,
    /// Assertion used to score each sampled edit.
    assertion: EvalAssertion,
}
1139
1140impl EvalInput {
1141    fn from_conversation(
1142        conversation: Vec<LanguageModelRequestMessage>,
1143        input_content: Option<String>,
1144        assertion: EvalAssertion,
1145    ) -> Self {
1146        let msg = conversation.last().expect("Conversation must not be empty");
1147        if msg.role != Role::Assistant {
1148            panic!("Conversation must end with an assistant message");
1149        }
1150        let tool_use = msg
1151            .content
1152            .iter()
1153            .flat_map(|content| match content {
1154                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1155                    Some(tool_use)
1156                }
1157                _ => None,
1158            })
1159            .next()
1160            .expect("Conversation must end with an edit_file tool use")
1161            .clone();
1162
1163        let edit_file_input: EditFileToolInput = serde_json::from_value(tool_use.input).unwrap();
1164
1165        EvalInput {
1166            conversation,
1167            edit_file_input,
1168            input_content,
1169            assertion,
1170        }
1171    }
1172}
1173
/// One sampled run of the edit agent against an `EvalInput`.
#[derive(Clone)]
struct EvalSample {
    /// File contents before the edit.
    text_before: String,
    /// File contents after the agent's edit was applied.
    text_after: String,
    /// The agent's raw output, including parser metrics.
    edit_output: EditAgentOutput,
    /// Rendered diff of the change (produced elsewhere in this file).
    diff: String,
}
1181
/// Object-safe assertion interface, allowing closures of different concrete
/// types to live behind `Arc<dyn AssertionFn>` inside `EvalAssertion`.
trait AssertionFn: 'static + Send + Sync {
    /// Scores `sample`, optionally consulting `judge_model` (used by the
    /// LLM-as-judge assertions). Returns a boxed local (non-`Send`) future.
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1190
/// Blanket impl: any matching async closure can be used as an assertion.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        // Invoke the closure and box the resulting future locally.
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1211
/// A cloneable, type-erased assertion used to score an eval sample.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1214
impl EvalAssertion {
    /// Wraps an async closure as an assertion.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Scores 100 when the sample's final text equals `expected`, ignoring
    /// blank-line differences; otherwise 0.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Scores 100 when applying any one of `expected_diffs` to the original
    /// text reproduces the sample's final text (modulo blank lines).
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// LLM-as-judge assertion: renders the sample's diff plus `assertions`
    /// into a prompt, streams the judge model's reply, and extracts a
    /// `<score>N</score>` tag. Errors if the reply contains no score tag.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                thinking_allowed: true,
                ..Default::default()
            };
            // Judge requests can hit provider rate limits; retry as needed.
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            // Accumulate the streamed chunks into a single string.
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output)
                && let Some(score_match) = captures.get(1)
            {
                // Unparseable digits (e.g. overflow) fall back to a 0 score.
                let score = score_match.as_str().parse().unwrap_or(0);
                return Ok(EvalAssertionOutcome {
                    score,
                    message: Some(output),
                });
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Runs this assertion against `input` and returns its outcome.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1315
1316fn eval(
1317    iterations: usize,
1318    expected_pass_ratio: f32,
1319    mismatched_tag_threshold: f32,
1320    mut eval: EvalInput,
1321) {
1322    let mut evaluated_count = 0;
1323    let mut failed_count = 0;
1324    report_progress(evaluated_count, failed_count, iterations);
1325
1326    let (tx, rx) = mpsc::channel();
1327
1328    // Cache the last message in the conversation, and run one instance of the eval so that
1329    // all the next ones are cached.
1330    eval.conversation.last_mut().unwrap().cache = true;
1331    run_eval(eval.clone(), tx.clone());
1332
1333    let executor = gpui::background_executor();
1334    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
1335    for _ in 1..iterations {
1336        let eval = eval.clone();
1337        let tx = tx.clone();
1338        let semaphore = semaphore.clone();
1339        executor
1340            .spawn(async move {
1341                let _guard = semaphore.acquire().await;
1342                run_eval(eval, tx)
1343            })
1344            .detach();
1345    }
1346    drop(tx);
1347
1348    let mut failed_evals = HashMap::default();
1349    let mut errored_evals = HashMap::default();
1350    let mut eval_outputs = Vec::new();
1351    let mut cumulative_parser_metrics = EditParserMetrics::default();
1352    while let Ok(output) = rx.recv() {
1353        match output {
1354            Ok(output) => {
1355                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
1356                eval_outputs.push(output.clone());
1357                if output.assertion.score < 80 {
1358                    failed_count += 1;
1359                    failed_evals
1360                        .entry(output.sample.text_after.clone())
1361                        .or_insert(Vec::new())
1362                        .push(output);
1363                }
1364            }
1365            Err(error) => {
1366                failed_count += 1;
1367                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
1368            }
1369        }
1370
1371        evaluated_count += 1;
1372        report_progress(evaluated_count, failed_count, iterations);
1373    }
1374
1375    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
1376    println!("Actual pass ratio: {}\n", actual_pass_ratio);
1377    if actual_pass_ratio < expected_pass_ratio {
1378        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
1379        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
1380        for (error, count) in errored_evals {
1381            println!("Eval errored {} times. Error: {}", count, error);
1382        }
1383
1384        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
1385        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
1386        for (_buffer_output, failed_evals) in failed_evals {
1387            let eval_output = failed_evals.first().unwrap();
1388            println!("Eval failed {} times", failed_evals.len());
1389            println!("{}", eval_output);
1390        }
1391
1392        panic!(
1393            "Actual pass ratio: {}\nExpected pass ratio: {}",
1394            actual_pass_ratio, expected_pass_ratio
1395        );
1396    }
1397
1398    let mismatched_tag_ratio =
1399        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
1400    if mismatched_tag_ratio > mismatched_tag_threshold {
1401        for eval_output in eval_outputs {
1402            println!("{}", eval_output);
1403        }
1404        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
1405    }
1406}
1407
1408fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
1409    let dispatcher = gpui::TestDispatcher::new(StdRng::from_os_rng());
1410    let mut cx = TestAppContext::build(dispatcher, None);
1411    let output = cx.executor().block_test(async {
1412        let test = EditAgentTest::new(&mut cx).await;
1413        test.eval(eval, &mut cx).await
1414    });
1415    tx.send(output).unwrap();
1416}
1417
/// Result of a single eval iteration: the sample plus its scored outcome.
#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}
1423
1424impl Display for EvalOutput {
1425    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1426        writeln!(f, "Score: {:?}", self.assertion.score)?;
1427        if let Some(message) = self.assertion.message.as_ref() {
1428            writeln!(f, "Message: {}", message)?;
1429        }
1430
1431        writeln!(f, "Diff:\n{}", self.sample.diff)?;
1432
1433        writeln!(
1434            f,
1435            "Parser Metrics:\n{:#?}",
1436            self.sample.edit_output.parser_metrics
1437        )?;
1438        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1439        Ok(())
1440    }
1441}
1442
/// Prints an in-place progress line (evaluated/total and running pass
/// percentage); `\r` plus the erase-line escape makes successive calls
/// overwrite the previous one.
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_count = evaluated_count - failed_count;
    // Guard against 0/0 before the first result arrives.
    let passed_ratio = match evaluated_count {
        0 => 0.0,
        n => passed_count as f64 / n as f64,
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}
1458
/// Harness bundling the edit agent under test, its project, and the judge
/// model used by LLM-based assertions.
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}
1464
1465impl EditAgentTest {
1466    async fn new(cx: &mut TestAppContext) -> Self {
1467        cx.executor().allow_parking();
1468
1469        let fs = FakeFs::new(cx.executor());
1470        cx.update(|cx| {
1471            settings::init(cx);
1472            gpui_tokio::init(cx);
1473            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1474            cx.set_http_client(http_client);
1475            let client = Client::production(cx);
1476            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1477            settings::init(cx);
1478            language_model::init(client.clone(), cx);
1479            language_models::init(user_store, client.clone(), cx);
1480        });
1481
1482        fs.insert_tree("/root", json!({})).await;
1483        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1484        let agent_model = SelectedModel::from_str(
1485            &std::env::var("ZED_AGENT_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
1486        )
1487        .unwrap();
1488        let judge_model = SelectedModel::from_str(
1489            &std::env::var("ZED_JUDGE_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
1490        )
1491        .unwrap();
1492
1493        let authenticate_provider_tasks = cx.update(|cx| {
1494            LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
1495                registry
1496                    .providers()
1497                    .iter()
1498                    .map(|p| p.authenticate(cx))
1499                    .collect::<Vec<_>>()
1500            })
1501        });
1502        let (agent_model, judge_model) = cx
1503            .update(|cx| {
1504                cx.spawn(async move |cx| {
1505                    futures::future::join_all(authenticate_provider_tasks).await;
1506                    let agent_model = Self::load_model(&agent_model, cx).await;
1507                    let judge_model = Self::load_model(&judge_model, cx).await;
1508                    (agent_model.unwrap(), judge_model.unwrap())
1509                })
1510            })
1511            .await;
1512        let action_log = cx.new(|_| ActionLog::new(project.clone()));
1513
1514        let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();
1515
1516        Self {
1517            agent: EditAgent::new(
1518                agent_model,
1519                project.clone(),
1520                action_log,
1521                Templates::new(),
1522                edit_format,
1523            ),
1524            project,
1525            judge_model,
1526        }
1527    }
1528
1529    async fn load_model(
1530        selected_model: &SelectedModel,
1531        cx: &mut AsyncApp,
1532    ) -> Result<Arc<dyn LanguageModel>> {
1533        cx.update(|cx| {
1534            let registry = LanguageModelRegistry::read_global(cx);
1535            let provider = registry
1536                .provider(&selected_model.provider)
1537                .expect("Provider not found");
1538            provider.authenticate(cx)
1539        })?
1540        .await?;
1541        cx.update(|cx| {
1542            let models = LanguageModelRegistry::read_global(cx);
1543            let model = models
1544                .available_models(cx)
1545                .find(|model| {
1546                    model.provider_id() == selected_model.provider
1547                        && model.id() == selected_model.model
1548                })
1549                .unwrap_or_else(|| panic!("Model {} not found", selected_model.model.0));
1550            model
1551        })
1552    }
1553
    /// Runs a single eval case end-to-end and returns its output.
    ///
    /// Opens the buffer named by the eval's edit-file input, builds the
    /// request conversation (prepending a rendered default system prompt
    /// when the eval doesn't supply one), invokes the edit agent in the
    /// requested mode, and finally asks the judge model to score the
    /// resulting diff via the eval's assertion.
    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        // Resolve the eval's file path inside the test project and open
        // the corresponding buffer.
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();

        let tools = crate::built_in_tools().collect::<Vec<_>>();

        // Render the default system prompt from a minimal project context:
        // a single worktree named "root" plus the built-in tool names.
        let system_prompt = {
            let worktrees = vec![WorktreeContext {
                root_name: "root".to_string(),
                abs_path: Path::new("/path/to/root").into(),
                rules_file: None,
            }];
            let project_context = ProjectContext::new(worktrees, Vec::default());
            let tool_names = tools
                .iter()
                .map(|tool| tool.name.clone().into())
                .collect::<Vec<_>>();
            let template = crate::SystemPromptTemplate {
                project: &project_context,
                available_tools: tool_names,
                model_name: None,
            };
            let templates = Templates::new();
            template.render(&templates).unwrap()
        };

        // Respect an eval-provided system prompt if the conversation already
        // starts with one; otherwise prepend the rendered default prompt.
        let has_system_prompt = eval
            .conversation
            .first()
            .is_some_and(|msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let conversation = LanguageModelRequest {
            messages,
            tools,
            thinking_allowed: true,
            ..Default::default()
        };

        // Run the agent. In `Edit` mode the buffer is first seeded with the
        // eval's input content (when provided); the overwrite path replaces
        // the buffer wholesale, so no seeding is done there. Both calls are
        // retried on rate limits and transient server errors.
        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            retry_on_rate_limit(async || {
                self.agent
                    .edit(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        } else {
            retry_on_rate_limit(async || {
                self.agent
                    .overwrite(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        };

        // Capture the edited text plus a unified diff of input vs. result
        // for the judge to inspect.
        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: buffer_text,
        };
        // Score the sample with the eval's assertion, using the judge model.
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
1661}
1662
1663async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
1664    const MAX_RETRIES: usize = 20;
1665    let mut attempt = 0;
1666
1667    loop {
1668        attempt += 1;
1669        let response = request().await;
1670
1671        if attempt >= MAX_RETRIES {
1672            return response;
1673        }
1674
1675        let retry_delay = match &response {
1676            Ok(_) => None,
1677            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
1678                Some(err) => match &err {
1679                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
1680                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
1681                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
1682                    }
1683                    LanguageModelCompletionError::UpstreamProviderError {
1684                        status,
1685                        retry_after,
1686                        ..
1687                    } => {
1688                        // Only retry for specific status codes
1689                        let should_retry = matches!(
1690                            *status,
1691                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
1692                        ) || status.as_u16() == 529;
1693
1694                        if should_retry {
1695                            // Use server-provided retry_after if available, otherwise use default
1696                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
1697                        } else {
1698                            None
1699                        }
1700                    }
1701                    LanguageModelCompletionError::ApiReadResponseError { .. }
1702                    | LanguageModelCompletionError::ApiInternalServerError { .. }
1703                    | LanguageModelCompletionError::HttpSend { .. } => {
1704                        // Exponential backoff for transient I/O and internal server errors
1705                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
1706                    }
1707                    _ => None,
1708                },
1709                _ => None,
1710            },
1711        };
1712
1713        if let Some(retry_after) = retry_delay {
1714            let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
1715            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
1716            Timer::after(retry_after + jitter).await;
1717        } else {
1718            return response;
1719        }
1720    }
1721}
1722
/// Outcome of running a single eval assertion: a numeric score plus an
/// optional explanatory message.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    /// Score awarded by the assertion; exact range depends on the
    /// assertion kind — TODO confirm against the assertion runner.
    score: usize,
    /// Optional human-readable explanation accompanying the score
    /// (presumably the judge model's reasoning — verify against callers).
    message: Option<String>,
}
1728
/// Data serialized into the diff-judge prompt template: the diff produced
/// by the agent and the assertion text the judge evaluates it against.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    /// Unified diff of the edit under evaluation.
    diff: String,
    /// Static assertion text describing what the diff should satisfy.
    assertions: &'static str,
}
1734
impl Template for DiffJudgeTemplate {
    // Handlebars template file used to render the judge prompt.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1738
/// Removes all blank (empty or whitespace-only) lines from `text` and
/// joins the remaining lines with `\n`. Any trailing newline in the
/// input is dropped by the line iteration.
fn strip_empty_lines(text: &str) -> String {
    let mut kept_lines = Vec::new();
    for line in text.lines() {
        if !line.trim().is_empty() {
            kept_lines.push(line);
        }
    }
    kept_lines.join("\n")
}