evals.rs

   1use super::*;
   2use crate::{
   3    EditFileMode, EditFileToolInput, GrepToolInput, ListDirectoryToolInput, ReadFileToolInput,
   4};
   5use Role::*;
   6use client::{Client, UserStore};
   7use collections::HashMap;
   8use fs::FakeFs;
   9use futures::{FutureExt, future::LocalBoxFuture};
  10use gpui::{AppContext, TestAppContext, Timer};
  11use http_client::StatusCode;
  12use indoc::{formatdoc, indoc};
  13use language_model::{
  14    LanguageModelRegistry, LanguageModelToolResult, LanguageModelToolResultContent,
  15    LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
  16};
  17use project::Project;
  18use prompt_store::{ProjectContext, WorktreeContext};
  19use rand::prelude::*;
  20use reqwest_client::ReqwestClient;
  21use serde_json::json;
  22use std::{
  23    cmp::Reverse,
  24    fmt::{self, Display},
  25    io::Write as _,
  26    path::Path,
  27    str::FromStr,
  28    sync::mpsc,
  29    time::Duration,
  30};
  31use util::path;
  32
  33#[test]
  34#[cfg_attr(not(feature = "unit-eval"), ignore)]
  35fn eval_extract_handle_command_output() {
  36    // Test how well agent generates multiple edit hunks.
  37    //
  38    // Model                       | Pass rate
  39    // ----------------------------|----------
  40    // claude-3.7-sonnet           |  0.99 (2025-06-14)
  41    // claude-sonnet-4             |  0.97 (2025-06-14)
  42    // gemini-2.5-pro-06-05        |  0.98 (2025-06-16)
  43    // gemini-2.5-flash            |  0.11 (2025-05-22)
  44    // gpt-4.1                     |  1.00 (2025-05-22)
  45
  46    let input_file_path = "root/blame.rs";
  47    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
  48    let possible_diffs = vec![
  49        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
  50        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
  51        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
  52        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
  53        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
  54        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
  55        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
  56    ];
  57    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
  58    eval(
  59        100,
  60        0.95,
  61        0.05,
  62        EvalInput::from_conversation(
  63            vec![
  64                message(
  65                    User,
  66                    [text(formatdoc! {"
  67                        Read the `{input_file_path}` file and extract a method in
  68                        the final stanza of `run_git_blame` to deal with command failures,
  69                        call it `handle_command_output` and take the std::process::Output as the only parameter.
  70                        Do not document the method and do not add any comments.
  71
  72                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
  73                    "})],
  74                ),
  75                message(
  76                    Assistant,
  77                    [tool_use(
  78                        "tool_1",
  79                        "read_file",
  80                        ReadFileToolInput {
  81                            path: input_file_path.into(),
  82                            start_line: None,
  83                            end_line: None,
  84                        },
  85                    )],
  86                ),
  87                message(
  88                    User,
  89                    [tool_result("tool_1", "read_file", input_file_content)],
  90                ),
  91                message(
  92                    Assistant,
  93                    [tool_use(
  94                        "tool_2",
  95                        "edit_file",
  96                        EditFileToolInput {
  97                            display_description: edit_description.into(),
  98                            path: input_file_path.into(),
  99                            mode: EditFileMode::Edit,
 100                        },
 101                    )],
 102                ),
 103            ],
 104            Some(input_file_content.into()),
 105            EvalAssertion::assert_diff_any(possible_diffs),
 106        ),
 107    );
 108}
 109
 110#[test]
 111#[cfg_attr(not(feature = "unit-eval"), ignore)]
 112fn eval_delete_run_git_blame() {
 113    // Model                       | Pass rate
 114    // ----------------------------|----------
 115    // claude-3.7-sonnet           | 1.0  (2025-06-14)
 116    // claude-sonnet-4             | 0.96 (2025-06-14)
 117    // gemini-2.5-pro-06-05        | 1.0  (2025-06-16)
 118    // gemini-2.5-flash            |
 119    // gpt-4.1                     |
 120
 121    let input_file_path = "root/blame.rs";
 122    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
 123    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
 124    let edit_description = "Delete the `run_git_blame` function.";
 125    eval(
 126        100,
 127        0.95,
 128        0.05,
 129        EvalInput::from_conversation(
 130            vec![
 131                message(
 132                    User,
 133                    [text(formatdoc! {"
 134                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
 135                        one function, not its usages.
 136                    "})],
 137                ),
 138                message(
 139                    Assistant,
 140                    [tool_use(
 141                        "tool_1",
 142                        "read_file",
 143                        ReadFileToolInput {
 144                            path: input_file_path.into(),
 145                            start_line: None,
 146                            end_line: None,
 147                        },
 148                    )],
 149                ),
 150                message(
 151                    User,
 152                    [tool_result("tool_1", "read_file", input_file_content)],
 153                ),
 154                message(
 155                    Assistant,
 156                    [tool_use(
 157                        "tool_2",
 158                        "edit_file",
 159                        EditFileToolInput {
 160                            display_description: edit_description.into(),
 161                            path: input_file_path.into(),
 162                            mode: EditFileMode::Edit,
 163                        },
 164                    )],
 165                ),
 166            ],
 167            Some(input_file_content.into()),
 168            EvalAssertion::assert_eq(output_file_content),
 169        ),
 170    );
 171}
 172
 173#[test]
 174#[cfg_attr(not(feature = "unit-eval"), ignore)]
 175fn eval_translate_doc_comments() {
 176    //  Model                          | Pass rate
 177    // ============================================
 178    //
 179    //  claude-3.7-sonnet              |  1.0  (2025-06-14)
 180    //  claude-sonnet-4                |  1.0  (2025-06-14)
 181    //  gemini-2.5-pro-preview-03-25   |  1.0  (2025-05-22)
 182    //  gemini-2.5-flash-preview-04-17 |
 183    //  gpt-4.1                        |
 184
 185    let input_file_path = "root/canvas.rs";
 186    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
 187    let edit_description = "Translate all doc comments to Italian";
 188    eval(
 189        200,
 190        1.,
 191        0.05,
 192        EvalInput::from_conversation(
 193            vec![
 194                message(
 195                    User,
 196                    [text(formatdoc! {"
 197                        Read the {input_file_path} file and edit it (without overwriting it),
 198                        translating all the doc comments to italian.
 199                    "})],
 200                ),
 201                message(
 202                    Assistant,
 203                    [tool_use(
 204                        "tool_1",
 205                        "read_file",
 206                        ReadFileToolInput {
 207                            path: input_file_path.into(),
 208                            start_line: None,
 209                            end_line: None,
 210                        },
 211                    )],
 212                ),
 213                message(
 214                    User,
 215                    [tool_result("tool_1", "read_file", input_file_content)],
 216                ),
 217                message(
 218                    Assistant,
 219                    [tool_use(
 220                        "tool_2",
 221                        "edit_file",
 222                        EditFileToolInput {
 223                            display_description: edit_description.into(),
 224                            path: input_file_path.into(),
 225                            mode: EditFileMode::Edit,
 226                        },
 227                    )],
 228                ),
 229            ],
 230            Some(input_file_content.into()),
 231            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
 232        ),
 233    );
 234}
 235
 236#[test]
 237#[cfg_attr(not(feature = "unit-eval"), ignore)]
 238fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
 239    //  Model                          | Pass rate
 240    // ============================================
 241    //
 242    //  claude-3.7-sonnet              |  0.96 (2025-06-14)
 243    //  claude-sonnet-4                |  0.11 (2025-06-14)
 244    //  gemini-2.5-pro-preview-latest  |  0.99 (2025-06-16)
 245    //  gemini-2.5-flash-preview-04-17 |
 246    //  gpt-4.1                        |
 247
 248    let input_file_path = "root/lib.rs";
 249    let input_file_content =
 250        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
 251    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
 252    eval(
 253        100,
 254        0.95,
 255        0.05,
 256        EvalInput::from_conversation(
 257            vec![
 258                message(
 259                    User,
 260                    [text(formatdoc! {"
 261                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
 262                        Use `ureq` to download the SDK for the current platform and architecture.
 263                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
 264                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
 265                        that's inside of the archive.
 266                        Don't re-download the SDK if that executable already exists.
 267
 268                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}
 269
 270                        Here are the available wasi-sdk assets:
 271                        - wasi-sdk-25.0-x86_64-macos.tar.gz
 272                        - wasi-sdk-25.0-arm64-macos.tar.gz
 273                        - wasi-sdk-25.0-x86_64-linux.tar.gz
 274                        - wasi-sdk-25.0-arm64-linux.tar.gz
 275                        - wasi-sdk-25.0-x86_64-linux.tar.gz
 276                        - wasi-sdk-25.0-arm64-linux.tar.gz
 277                        - wasi-sdk-25.0-x86_64-windows.tar.gz
 278                    "})],
 279                ),
 280                message(
 281                    Assistant,
 282                    [tool_use(
 283                        "tool_1",
 284                        "read_file",
 285                        ReadFileToolInput {
 286                            path: input_file_path.into(),
 287                            start_line: Some(971),
 288                            end_line: Some(1050),
 289                        },
 290                    )],
 291                ),
 292                message(
 293                    User,
 294                    [tool_result(
 295                        "tool_1",
 296                        "read_file",
 297                        lines(input_file_content, 971..1050),
 298                    )],
 299                ),
 300                message(
 301                    Assistant,
 302                    [tool_use(
 303                        "tool_2",
 304                        "read_file",
 305                        ReadFileToolInput {
 306                            path: input_file_path.into(),
 307                            start_line: Some(1050),
 308                            end_line: Some(1100),
 309                        },
 310                    )],
 311                ),
 312                message(
 313                    User,
 314                    [tool_result(
 315                        "tool_2",
 316                        "read_file",
 317                        lines(input_file_content, 1050..1100),
 318                    )],
 319                ),
 320                message(
 321                    Assistant,
 322                    [tool_use(
 323                        "tool_3",
 324                        "read_file",
 325                        ReadFileToolInput {
 326                            path: input_file_path.into(),
 327                            start_line: Some(1100),
 328                            end_line: Some(1150),
 329                        },
 330                    )],
 331                ),
 332                message(
 333                    User,
 334                    [tool_result(
 335                        "tool_3",
 336                        "read_file",
 337                        lines(input_file_content, 1100..1150),
 338                    )],
 339                ),
 340                message(
 341                    Assistant,
 342                    [tool_use(
 343                        "tool_4",
 344                        "edit_file",
 345                        EditFileToolInput {
 346                            display_description: edit_description.into(),
 347                            path: input_file_path.into(),
 348                            mode: EditFileMode::Edit,
 349                        },
 350                    )],
 351                ),
 352            ],
 353            Some(input_file_content.into()),
 354            EvalAssertion::judge_diff(indoc! {"
 355                - The compile_parser_to_wasm method has been changed to use wasi-sdk
 356                - ureq is used to download the SDK for current platform and architecture
 357            "}),
 358        ),
 359    );
 360}
 361
 362#[test]
 363#[cfg_attr(not(feature = "unit-eval"), ignore)]
 364fn eval_disable_cursor_blinking() {
 365    //  Model                          | Pass rate
 366    // ============================================
 367    //
 368    //  claude-3.7-sonnet              |  0.59 (2025-07-14)
 369    //  claude-sonnet-4                |  0.81 (2025-07-14)
 370    //  gemini-2.5-pro                 |  0.95 (2025-07-14)
 371    //  gemini-2.5-flash-preview-04-17 |  0.78 (2025-07-14)
 372    //  gpt-4.1                        |  0.00 (2025-07-14) (follows edit_description too literally)
 373
 374    let input_file_path = "root/editor.rs";
 375    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
 376    let edit_description = "Comment out the call to `BlinkManager::enable`";
 377    let possible_diffs = vec![
 378        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
 379        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
 380        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
 381        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
 382    ];
 383    eval(
 384        100,
 385        0.51,
 386        0.05,
 387        EvalInput::from_conversation(
 388            vec![
 389                message(User, [text("Let's research how to cursor blinking works.")]),
 390                message(
 391                    Assistant,
 392                    [tool_use(
 393                        "tool_1",
 394                        "grep",
 395                        GrepToolInput {
 396                            regex: "blink".into(),
 397                            include_pattern: None,
 398                            offset: 0,
 399                            case_sensitive: false,
 400                        },
 401                    )],
 402                ),
 403                message(
 404                    User,
 405                    [tool_result(
 406                        "tool_1",
 407                        "grep",
 408                        [
 409                            lines(input_file_content, 100..400),
 410                            lines(input_file_content, 800..1300),
 411                            lines(input_file_content, 1600..2000),
 412                            lines(input_file_content, 5000..5500),
 413                            lines(input_file_content, 8000..9000),
 414                            lines(input_file_content, 18455..18470),
 415                            lines(input_file_content, 20000..20500),
 416                            lines(input_file_content, 21000..21300),
 417                        ]
 418                        .join("Match found:\n\n"),
 419                    )],
 420                ),
 421                message(
 422                    User,
 423                    [text(indoc! {"
 424                        Comment out the lines that interact with the BlinkManager.
 425                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
 426                        Don't add additional comments.
 427                    "})],
 428                ),
 429                message(
 430                    Assistant,
 431                    [tool_use(
 432                        "tool_4",
 433                        "edit_file",
 434                        EditFileToolInput {
 435                            display_description: edit_description.into(),
 436                            path: input_file_path.into(),
 437                            mode: EditFileMode::Edit,
 438                        },
 439                    )],
 440                ),
 441            ],
 442            Some(input_file_content.into()),
 443            EvalAssertion::assert_diff_any(possible_diffs),
 444        ),
 445    );
 446}
 447
 448#[test]
 449#[cfg_attr(not(feature = "unit-eval"), ignore)]
 450fn eval_from_pixels_constructor() {
 451    // Results for 2025-06-13
 452    //
 453    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
 454    // value. Higher values improve the pass rate but may sometimes cause
 455    // edits to be misapplied. In the context of this eval, this means
 456    // the agent might add from_pixels tests in incorrect locations
 457    // (e.g., at the beginning of the file), yet the evaluation may still
 458    // rate it highly.
 459    //
 460    //  Model                          | Date        | Pass rate
 461    // =========================================================
 462    //  claude-4.0-sonnet              | 2025-06-14  | 0.99
 463    //  claude-3.7-sonnet              | 2025-06-14  | 0.88
 464    //  gemini-2.5-pro-preview-06-05   | 2025-06-16  | 0.98
 465    //  gpt-4.1                        |
 466
 467    let input_file_path = "root/canvas.rs";
 468    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
 469    let edit_description = "Implement from_pixels constructor and add tests.";
 470    eval(
 471        100,
 472        0.95,
 473        // For whatever reason, this eval produces more mismatched tags.
 474        // Increasing for now, let's see if we can bring this down.
 475        0.25,
 476        EvalInput::from_conversation(
 477            vec![
 478                message(
 479                    User,
 480                    [text(indoc! {"
 481                        Introduce a new `from_pixels` constructor in Canvas and
 482                        also add tests for it in the same file.
 483                    "})],
 484                ),
 485                message(
 486                    Assistant,
 487                    [tool_use(
 488                        "tool_1",
 489                        "read_file",
 490                        ReadFileToolInput {
 491                            path: input_file_path.into(),
 492                            start_line: None,
 493                            end_line: None,
 494                        },
 495                    )],
 496                ),
 497                message(
 498                    User,
 499                    [tool_result("tool_1", "read_file", input_file_content)],
 500                ),
 501                message(
 502                    Assistant,
 503                    [tool_use(
 504                        "tool_2",
 505                        "grep",
 506                        GrepToolInput {
 507                            regex: "mod\\s+tests".into(),
 508                            include_pattern: Some("font-kit/src/canvas.rs".into()),
 509                            offset: 0,
 510                            case_sensitive: false,
 511                        },
 512                    )],
 513                ),
 514                message(User, [tool_result("tool_2", "grep", "No matches found")]),
 515                message(
 516                    Assistant,
 517                    [tool_use(
 518                        "tool_3",
 519                        "grep",
 520                        GrepToolInput {
 521                            regex: "mod\\s+tests".into(),
 522                            include_pattern: Some("font-kit/src/**/*.rs".into()),
 523                            offset: 0,
 524                            case_sensitive: false,
 525                        },
 526                    )],
 527                ),
 528                message(User, [tool_result("tool_3", "grep", "No matches found")]),
 529                message(
 530                    Assistant,
 531                    [tool_use(
 532                        "tool_4",
 533                        "grep",
 534                        GrepToolInput {
 535                            regex: "#\\[test\\]".into(),
 536                            include_pattern: Some("font-kit/src/**/*.rs".into()),
 537                            offset: 0,
 538                            case_sensitive: false,
 539                        },
 540                    )],
 541                ),
 542                message(
 543                    User,
 544                    [tool_result(
 545                        "tool_4",
 546                        "grep",
 547                        indoc! {"
 548                            Found 6 matches:
 549
 550                            ## Matches in font-kit/src/loaders/core_text.rs
 551
 552                            ### mod test › L926-936
 553                            ```
 554                            mod test {
 555                                use super::Font;
 556                                use crate::properties::{Stretch, Weight};
 557
 558                                #[cfg(feature = \"source\")]
 559                                use crate::source::SystemSource;
 560
 561                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";
 562
 563                                #[cfg(feature = \"source\")]
 564                                #[test]
 565                            ```
 566
 567                            55 lines remaining in ancestor node. Read the file to see all.
 568
 569                            ### mod test › L947-951
 570                            ```
 571                                }
 572
 573                                #[test]
 574                                fn test_core_text_to_css_font_weight() {
 575                                    // Exact matches
 576                            ```
 577
 578                            ### mod test › L959-963
 579                            ```
 580                                }
 581
 582                                #[test]
 583                                fn test_core_text_to_css_font_stretch() {
 584                                    // Exact matches
 585                            ```
 586
 587                            ## Matches in font-kit/src/loaders/freetype.rs
 588
 589                            ### mod test › L1238-1248
 590                            ```
 591                            mod test {
 592                                use crate::loaders::freetype::Font;
 593
 594                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
 595                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";
 596
 597                                #[test]
 598                                fn get_pcf_postscript_name() {
 599                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
 600                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
 601                                }
 602                            ```
 603
 604                            1 lines remaining in ancestor node. Read the file to see all.
 605
 606                            ## Matches in font-kit/src/sources/core_text.rs
 607
 608                            ### mod test › L265-275
 609                            ```
 610                            mod test {
 611                                use crate::properties::{Stretch, Weight};
 612
 613                                #[test]
 614                                fn test_css_to_core_text_font_weight() {
 615                                    // Exact matches
 616                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
 617                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
 618                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
 619                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);
 620
 621                            ```
 622
 623                            27 lines remaining in ancestor node. Read the file to see all.
 624
 625                            ### mod test › L278-282
 626                            ```
 627                                }
 628
 629                                #[test]
 630                                fn test_css_to_core_text_font_stretch() {
 631                                    // Exact matches
 632                            ```
 633                        "},
 634                    )],
 635                ),
 636                message(
 637                    Assistant,
 638                    [tool_use(
 639                        "tool_5",
 640                        "edit_file",
 641                        EditFileToolInput {
 642                            display_description: edit_description.into(),
 643                            path: input_file_path.into(),
 644                            mode: EditFileMode::Edit,
 645                        },
 646                    )],
 647                ),
 648            ],
 649            Some(input_file_content.into()),
 650            EvalAssertion::judge_diff(indoc! {"
 651                    - The diff contains a new `from_pixels` constructor
 652                    - The diff contains new tests for the `from_pixels` constructor
 653                "}),
 654        ),
 655    );
 656}
 657
 658#[test]
 659#[cfg_attr(not(feature = "unit-eval"), ignore)]
 660fn eval_zode() {
 661    //  Model                          | Pass rate
 662    // ============================================
 663    //
 664    //  claude-3.7-sonnet              |  1.0 (2025-06-14)
 665    //  claude-sonnet-4                |  1.0 (2025-06-14)
 666    //  gemini-2.5-pro-preview-03-25   |  1.0 (2025-05-22)
 667    //  gemini-2.5-flash-preview-04-17 |  1.0 (2025-05-22)
 668    //  gpt-4.1                        |  1.0 (2025-05-22)
 669
 670    let input_file_path = "root/zode.py";
 671    let input_content = None;
 672    let edit_description = "Create the main Zode CLI script";
 673    eval(
 674        50,
 675        1.,
 676        0.05,
 677        EvalInput::from_conversation(
 678            vec![
 679                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
 680                message(
 681                    Assistant,
 682                    [
 683                        tool_use(
 684                            "tool_1",
 685                            "read_file",
 686                            ReadFileToolInput {
 687                                path: "root/eval/react.py".into(),
 688                                start_line: None,
 689                                end_line: None,
 690                            },
 691                        ),
 692                        tool_use(
 693                            "tool_2",
 694                            "read_file",
 695                            ReadFileToolInput {
 696                                path: "root/eval/react_test.py".into(),
 697                                start_line: None,
 698                                end_line: None,
 699                            },
 700                        ),
 701                    ],
 702                ),
 703                message(
 704                    User,
 705                    [
 706                        tool_result(
 707                            "tool_1",
 708                            "read_file",
 709                            include_str!("evals/fixtures/zode/react.py"),
 710                        ),
 711                        tool_result(
 712                            "tool_2",
 713                            "read_file",
 714                            include_str!("evals/fixtures/zode/react_test.py"),
 715                        ),
 716                    ],
 717                ),
 718                message(
 719                    Assistant,
 720                    [
 721                        text(
 722                            "Now that I understand what we need to build, I'll create the main Python script:",
 723                        ),
 724                        tool_use(
 725                            "tool_3",
 726                            "edit_file",
 727                            EditFileToolInput {
 728                                display_description: edit_description.into(),
 729                                path: input_file_path.into(),
 730                                mode: EditFileMode::Create,
 731                            },
 732                        ),
 733                    ],
 734                ),
 735            ],
 736            input_content,
 737            EvalAssertion::new(async move |sample, _, _cx| {
 738                let invalid_starts = [' ', '`', '\n'];
 739                let mut message = String::new();
 740                for start in invalid_starts {
 741                    if sample.text_after.starts_with(start) {
 742                        message.push_str(&format!("The sample starts with a {:?}\n", start));
 743                        break;
 744                    }
 745                }
 746                // Remove trailing newline.
 747                message.pop();
 748
 749                if message.is_empty() {
 750                    Ok(EvalAssertionOutcome {
 751                        score: 100,
 752                        message: None,
 753                    })
 754                } else {
 755                    Ok(EvalAssertionOutcome {
 756                        score: 0,
 757                        message: Some(message),
 758                    })
 759                }
 760            }),
 761        ),
 762    );
 763}
 764
 765#[test]
 766#[cfg_attr(not(feature = "unit-eval"), ignore)]
 767fn eval_add_overwrite_test() {
 768    //  Model                          | Pass rate
 769    // ============================================
 770    //
 771    //  claude-3.7-sonnet              |  0.65 (2025-06-14)
 772    //  claude-sonnet-4                |  0.07 (2025-06-14)
 773    //  gemini-2.5-pro-preview-03-25   |  0.35 (2025-05-22)
 774    //  gemini-2.5-flash-preview-04-17 |
 775    //  gpt-4.1                        |
 776
 777    let input_file_path = "root/action_log.rs";
 778    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
 779    let edit_description = "Add a new test for overwriting a file in action_log.rs";
 780    eval(
 781        200,
 782        0.5, // TODO: make this eval better
 783        0.05,
 784        EvalInput::from_conversation(
 785            vec![
 786                message(
 787                    User,
 788                    [text(indoc! {"
 789                        Introduce a new test in `action_log.rs` to test overwriting a file.
 790                        That is, a file already exists, but we call `buffer_created` as if the file were new.
 791                        Take inspiration from all the other tests in the file.
 792                    "})],
 793                ),
 794                message(
 795                    Assistant,
 796                    [tool_use(
 797                        "tool_1",
 798                        "read_file",
 799                        ReadFileToolInput {
 800                            path: input_file_path.into(),
 801                            start_line: None,
 802                            end_line: None,
 803                        },
 804                    )],
 805                ),
 806                message(
 807                    User,
 808                    [tool_result(
 809                        "tool_1",
 810                        "read_file",
 811                        indoc! {"
 812                            pub struct ActionLog [L13-20]
 813                             tracked_buffers [L15]
 814                             edited_since_project_diagnostics_check [L17]
 815                             project [L19]
 816                            impl ActionLog [L22-498]
 817                             pub fn new [L24-30]
 818                             pub fn project [L32-34]
 819                             pub fn checked_project_diagnostics [L37-39]
 820                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
 821                             fn track_buffer_internal [L46-101]
 822                             fn handle_buffer_event [L103-116]
 823                             fn handle_buffer_edited [L118-123]
 824                             fn handle_buffer_file_changed [L125-158]
 825                             async fn maintain_diff [L160-264]
 826                             pub fn buffer_read [L267-269]
 827                             pub fn buffer_created [L272-276]
 828                             pub fn buffer_edited [L279-287]
 829                             pub fn will_delete_buffer [L289-304]
 830                             pub fn keep_edits_in_range [L306-364]
 831                             pub fn reject_edits_in_ranges [L366-459]
 832                             pub fn keep_all_edits [L461-473]
 833                             pub fn changed_buffers [L476-482]
 834                             pub fn stale_buffers [L485-497]
 835                            fn apply_non_conflicting_edits [L500-561]
 836                            fn diff_snapshots [L563-585]
 837                            fn point_to_row_edit [L587-614]
 838                            enum ChangeAuthor [L617-620]
 839                             User [L618]
 840                             Agent [L619]
 841                            enum TrackedBufferStatus [L623-627]
 842                             Created [L624]
 843                             Modified [L625]
 844                             Deleted [L626]
 845                            struct TrackedBuffer [L629-641]
 846                             buffer [L630]
 847                             base_text [L631]
 848                             unreviewed_changes [L632]
 849                             status [L633]
 850                             version [L634]
 851                             diff [L635]
 852                             snapshot [L636]
 853                             diff_update [L637]
 854                             _open_lsp_handle [L638]
 855                             _maintain_diff [L639]
 856                             _subscription [L640]
 857                            impl TrackedBuffer [L643-657]
 858                             fn has_changes [L644-650]
 859                             fn schedule_diff_update [L652-656]
 860                            pub struct ChangedBuffer [L659-661]
 861                             pub diff [L660]
 862                            mod tests [L664-1574]
 863                             fn init_logger [L678-682]
 864                             fn init_test [L684-691]
 865                             async fn test_keep_edits [L694-769]
 866                             async fn test_deletions [L772-854]
 867                             async fn test_overlapping_user_edits [L857-951]
 868                             async fn test_creating_files [L954-1010]
 869                             async fn test_deleting_files [L1013-1120]
 870                             async fn test_reject_edits [L1123-1255]
 871                             async fn test_reject_multiple_edits [L1258-1331]
 872                             async fn test_reject_deleted_file [L1334-1388]
 873                             async fn test_reject_created_file [L1391-1443]
 874                             async fn test_random_diffs [L1446-1535]
 875                              fn quiesce [L1510-1534]
 876                             struct HunkStatus [L1538-1542]
 877                              range [L1539]
 878                              diff_status [L1540]
 879                              old_text [L1541]
 880                             fn unreviewed_hunks [L1544-1573]
 881
 882                            Showing symbols 1-69 (total symbols: 69)
 883
 884                            Using the line numbers in this outline, you can call this tool again while specifying
 885                            the start_line and end_line fields to see the implementations of symbols in the outline.
 886                        "},
 887                    )],
 888                ),
 889                message(
 890                    Assistant,
 891                    [
 892                        text(
 893                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
 894                        ),
 895                        tool_use(
 896                            "tool_2",
 897                            "read_file",
 898                            ReadFileToolInput {
 899                                path: input_file_path.into(),
 900                                start_line: Some(953),
 901                                end_line: Some(1010),
 902                            },
 903                        ),
 904                    ],
 905                ),
 906                message(
 907                    User,
 908                    [tool_result(
 909                        "tool_2",
 910                        "read_file",
 911                        lines(input_file_content, 953..1010),
 912                    )],
 913                ),
 914                message(
 915                    Assistant,
 916                    [
 917                        text(
 918                            "Now let's look at another relevant test to understand how they're structured:",
 919                        ),
 920                        tool_use(
 921                            "tool_3",
 922                            "read_file",
 923                            ReadFileToolInput {
 924                                path: input_file_path.into(),
 925                                start_line: Some(1012),
 926                                end_line: Some(1120),
 927                            },
 928                        ),
 929                    ],
 930                ),
 931                message(
 932                    User,
 933                    [tool_result(
 934                        "tool_3",
 935                        "read_file",
 936                        lines(input_file_content, 1012..1120),
 937                    )],
 938                ),
 939                message(
 940                    Assistant,
 941                    [
 942                        text("Now let's look at how `buffer_created` is implemented:"),
 943                        tool_use(
 944                            "tool_4",
 945                            "read_file",
 946                            ReadFileToolInput {
 947                                path: input_file_path.into(),
 948                                start_line: Some(271),
 949                                end_line: Some(276),
 950                            },
 951                        ),
 952                    ],
 953                ),
 954                message(
 955                    User,
 956                    [tool_result(
 957                        "tool_4",
 958                        "read_file",
 959                        lines(input_file_content, 271..276),
 960                    )],
 961                ),
 962                message(
 963                    Assistant,
 964                    [
 965                        text(indoc! {"
 966                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:
 967
 968                            1. Create a file
 969                            2. Open a buffer for it
 970                            3. Track it in the action log
 971                            4. Modify the buffer
 972                            5. Call `buffer_created` again as if the file were new
 973                            6. Check that the changes are properly tracked
 974
 975                            Let's write the test:
 976                        "}),
 977                        tool_use(
 978                            "tool_5",
 979                            "edit_file",
 980                            EditFileToolInput {
 981                                display_description: edit_description.into(),
 982                                path: input_file_path.into(),
 983                                mode: EditFileMode::Edit,
 984                            },
 985                        ),
 986                    ],
 987                ),
 988            ],
 989            Some(input_file_content.into()),
 990            EvalAssertion::judge_diff(
 991                "A new test for overwritten files was created, without changing any previous test",
 992            ),
 993        ),
 994    );
 995}
 996
 997#[test]
 998#[cfg_attr(not(feature = "unit-eval"), ignore)]
 999fn eval_create_empty_file() {
1000    // Check that Edit Agent can create a file without writing its
1001    // thoughts into it. This issue is not specific to empty files, but
1002    // it's easier to reproduce with them.
1003    //
1004    //  Model                          | Pass rate
1005    // ============================================
1006    //
1007    //  claude-3.7-sonnet              |  1.00 (2025-06-14)
1008    //  claude-sonnet-4                |  1.00 (2025-06-14)
1009    //  gemini-2.5-pro-preview-03-25   |  1.00 (2025-05-21)
1010    //  gemini-2.5-flash-preview-04-17 |  1.00 (2025-05-21)
1011    //  gpt-4.1                        |  1.00 (2025-05-21)
1012    //
1013    //
1014    // TODO: gpt-4.1-mini errored 38 times:
1015    // "data did not match any variant of untagged enum ResponseStreamResult"
1016
1017    let input_file_content = None;
1018    let expected_output_content = String::new();
1019    eval(
1020        100,
1021        0.99,
1022        0.05,
1023        EvalInput::from_conversation(
1024            vec![
1025                message(User, [text("Create a second empty todo file ")]),
1026                message(
1027                    Assistant,
1028                    [
1029                        text(formatdoc! {"
1030                        I'll help you create a second empty todo file.
1031                        First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
1032                        "}),
1033                        tool_use(
1034                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
1035                            "list_directory",
1036                            ListDirectoryToolInput {
1037                                path: "root".to_string(),
1038                            },
1039                        ),
1040                    ],
1041                ),
1042                message(
1043                    User,
1044                    [tool_result(
1045                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
1046                        "list_directory",
1047                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
1048                    )],
1049                ),
1050                message(
1051                    Assistant,
1052                    [
1053                        text(formatdoc! {"
1054                        I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
1055                    "}),
1056                        tool_use(
1057                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
1058                            "edit_file",
1059                            EditFileToolInput {
1060                                display_description: "Create empty TODO3 file".to_string(),
1061                                mode: EditFileMode::Create,
1062                                path: "root/TODO3".into(),
1063                            },
1064                        ),
1065                    ],
1066                ),
1067            ],
1068            input_file_content,
1069            // Bad behavior is to write something like
1070            // "I'll create an empty TODO3 file as requested."
1071            EvalAssertion::assert_eq(expected_output_content),
1072        ),
1073    );
1074}
1075
1076fn message(
1077    role: Role,
1078    contents: impl IntoIterator<Item = MessageContent>,
1079) -> LanguageModelRequestMessage {
1080    LanguageModelRequestMessage {
1081        role,
1082        content: contents.into_iter().collect(),
1083        cache: false,
1084    }
1085}
1086
1087fn text(text: impl Into<String>) -> MessageContent {
1088    MessageContent::Text(text.into())
1089}
1090
1091fn lines(input: &str, range: Range<usize>) -> String {
1092    input
1093        .lines()
1094        .skip(range.start)
1095        .take(range.len())
1096        .collect::<Vec<_>>()
1097        .join("\n")
1098}
1099
1100fn tool_use(
1101    id: impl Into<Arc<str>>,
1102    name: impl Into<Arc<str>>,
1103    input: impl Serialize,
1104) -> MessageContent {
1105    MessageContent::ToolUse(LanguageModelToolUse {
1106        id: LanguageModelToolUseId::from(id.into()),
1107        name: name.into(),
1108        raw_input: serde_json::to_string_pretty(&input).unwrap(),
1109        input: serde_json::to_value(input).unwrap(),
1110        is_input_complete: true,
1111        thought_signature: None,
1112    })
1113}
1114
1115fn tool_result(
1116    id: impl Into<Arc<str>>,
1117    name: impl Into<Arc<str>>,
1118    result: impl Into<Arc<str>>,
1119) -> MessageContent {
1120    MessageContent::ToolResult(LanguageModelToolResult {
1121        tool_use_id: LanguageModelToolUseId::from(id.into()),
1122        tool_name: name.into(),
1123        is_error: false,
1124        content: LanguageModelToolResultContent::Text(result.into()),
1125        output: None,
1126    })
1127}
1128
1129#[derive(Clone)]
1130struct EvalInput {
1131    conversation: Vec<LanguageModelRequestMessage>,
1132    edit_file_input: EditFileToolInput,
1133    input_content: Option<String>,
1134    assertion: EvalAssertion,
1135}
1136
1137impl EvalInput {
1138    fn from_conversation(
1139        conversation: Vec<LanguageModelRequestMessage>,
1140        input_content: Option<String>,
1141        assertion: EvalAssertion,
1142    ) -> Self {
1143        let msg = conversation.last().expect("Conversation must not be empty");
1144        if msg.role != Role::Assistant {
1145            panic!("Conversation must end with an assistant message");
1146        }
1147        let tool_use = msg
1148            .content
1149            .iter()
1150            .flat_map(|content| match content {
1151                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1152                    Some(tool_use)
1153                }
1154                _ => None,
1155            })
1156            .next()
1157            .expect("Conversation must end with an edit_file tool use")
1158            .clone();
1159
1160        let edit_file_input: EditFileToolInput = serde_json::from_value(tool_use.input).unwrap();
1161
1162        EvalInput {
1163            conversation,
1164            edit_file_input,
1165            input_content,
1166            assertion,
1167        }
1168    }
1169}
1170
1171#[derive(Clone)]
1172struct EvalSample {
1173    text_before: String,
1174    text_after: String,
1175    edit_output: EditAgentOutput,
1176    diff: String,
1177}
1178
1179trait AssertionFn: 'static + Send + Sync {
1180    fn assert<'a>(
1181        &'a self,
1182        sample: &'a EvalSample,
1183        judge_model: Arc<dyn LanguageModel>,
1184        cx: &'a mut TestAppContext,
1185    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
1186}
1187
1188impl<F> AssertionFn for F
1189where
1190    F: 'static
1191        + Send
1192        + Sync
1193        + AsyncFn(
1194            &EvalSample,
1195            Arc<dyn LanguageModel>,
1196            &mut TestAppContext,
1197        ) -> Result<EvalAssertionOutcome>,
1198{
1199    fn assert<'a>(
1200        &'a self,
1201        sample: &'a EvalSample,
1202        judge_model: Arc<dyn LanguageModel>,
1203        cx: &'a mut TestAppContext,
1204    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
1205        (self)(sample, judge_model, cx).boxed_local()
1206    }
1207}
1208
1209#[derive(Clone)]
1210struct EvalAssertion(Arc<dyn AssertionFn>);
1211
1212impl EvalAssertion {
1213    fn new<F>(f: F) -> Self
1214    where
1215        F: 'static
1216            + Send
1217            + Sync
1218            + AsyncFn(
1219                &EvalSample,
1220                Arc<dyn LanguageModel>,
1221                &mut TestAppContext,
1222            ) -> Result<EvalAssertionOutcome>,
1223    {
1224        EvalAssertion(Arc::new(f))
1225    }
1226
1227    fn assert_eq(expected: impl Into<String>) -> Self {
1228        let expected = expected.into();
1229        Self::new(async move |sample, _judge, _cx| {
1230            Ok(EvalAssertionOutcome {
1231                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
1232                    100
1233                } else {
1234                    0
1235                },
1236                message: None,
1237            })
1238        })
1239    }
1240
1241    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
1242        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
1243        Self::new(async move |sample, _judge, _cx| {
1244            let matches = expected_diffs.iter().any(|possible_diff| {
1245                let expected =
1246                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
1247                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
1248            });
1249
1250            Ok(EvalAssertionOutcome {
1251                score: if matches { 100 } else { 0 },
1252                message: None,
1253            })
1254        })
1255    }
1256
1257    fn judge_diff(assertions: &'static str) -> Self {
1258        Self::new(async move |sample, judge, cx| {
1259            let prompt = DiffJudgeTemplate {
1260                diff: sample.diff.clone(),
1261                assertions,
1262            }
1263            .render(&Templates::new())
1264            .unwrap();
1265
1266            let request = LanguageModelRequest {
1267                messages: vec![LanguageModelRequestMessage {
1268                    role: Role::User,
1269                    content: vec![prompt.into()],
1270                    cache: false,
1271                }],
1272                thinking_allowed: true,
1273                ..Default::default()
1274            };
1275            let mut response = retry_on_rate_limit(async || {
1276                Ok(judge
1277                    .stream_completion_text(request.clone(), &cx.to_async())
1278                    .await?)
1279            })
1280            .await?;
1281            let mut output = String::new();
1282            while let Some(chunk) = response.stream.next().await {
1283                let chunk = chunk?;
1284                output.push_str(&chunk);
1285            }
1286
1287            // Parse the score from the response
1288            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
1289            if let Some(captures) = re.captures(&output)
1290                && let Some(score_match) = captures.get(1)
1291            {
1292                let score = score_match.as_str().parse().unwrap_or(0);
1293                return Ok(EvalAssertionOutcome {
1294                    score,
1295                    message: Some(output),
1296                });
1297            }
1298
1299            anyhow::bail!("No score found in response. Raw output: {output}");
1300        })
1301    }
1302
1303    async fn run(
1304        &self,
1305        input: &EvalSample,
1306        judge_model: Arc<dyn LanguageModel>,
1307        cx: &mut TestAppContext,
1308    ) -> Result<EvalAssertionOutcome> {
1309        self.0.assert(input, judge_model, cx).await
1310    }
1311}
1312
fn eval(
    iterations: usize,
    expected_pass_ratio: f32,
    mismatched_tag_threshold: f32,
    mut eval: EvalInput,
) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one eval up front so
    // that subsequent runs reuse the cached prefix.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    let executor = gpui::background_executor();
    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        let semaphore = semaphore.clone();
        executor
            .spawn(async move {
                let _guard = semaphore.acquire().await;
                run_eval(eval, tx)
            })
            .detach();
    }
    drop(tx);

    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text_after.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > mismatched_tag_threshold {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}

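/// Runs a single eval iteration on its own `TestAppContext`, seeded with a
/// fresh random test dispatcher, and reports the result (or error) back over
/// `tx`. Called once per iteration by `eval` above, both inline and from the
/// background executor.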
fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
    let dispatcher = gpui::TestDispatcher::new(StdRng::from_os_rng());
    let mut cx = TestAppContext::build(dispatcher, None);
    let output = cx.executor().block_test(async {
        let test = EditAgentTest::new(&mut cx).await;
        test.eval(eval, &mut cx).await
    });
    tx.send(output).unwrap();
}

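/// The outcome of one eval iteration: the generated sample (diff, buffer text,
/// and parser metrics) together with the assertion result produced for it.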
#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}

impl Display for EvalOutput {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Score: {:?}", self.assertion.score)?;
        if let Some(message) = self.assertion.message.as_ref() {
            writeln!(f, "Message: {}", message)?;
        }

        writeln!(f, "Diff:\n{}", self.sample.diff)?;

        writeln!(
            f,
            "Parser Metrics:\n{:#?}",
            self.sample.edit_output.parser_metrics
        )?;
        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
        Ok(())
    }
}

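/// Prints a one-line progress indicator. The `\r` returns the cursor to the
/// start of the line and `\x1b[K` clears it, so each call overwrites the
/// previous status in place instead of scrolling the terminal.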
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_count = evaluated_count - failed_count;
    let passed_ratio = if evaluated_count == 0 {
        0.0
    } else {
        passed_count as f64 / evaluated_count as f64
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}

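/// Harness that wires an `EditAgent` up to a test project and holds the judge
/// model used to grade the agent's edits.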
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}

impl EditAgentTest {
    async fn new(cx: &mut TestAppContext) -> Self {
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
            language_model::init(client.clone(), cx);
            language_models::init(user_store, client.clone(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
        )
        .unwrap();

        let authenticate_provider_tasks = cx.update(|cx| {
            LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
                registry
                    .providers()
                    .iter()
                    .map(|p| p.authenticate(cx))
                    .collect::<Vec<_>>()
            })
        });
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    futures::future::join_all(authenticate_provider_tasks).await;
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();

        Self {
            agent: EditAgent::new(
                agent_model,
                project.clone(),
                action_log,
                Templates::new(),
                edit_format,
            ),
            project,
            judge_model,
        }
    }

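    /// Resolves a `SelectedModel` (provider id plus model id) against the
    /// global `LanguageModelRegistry`, authenticating the provider first.
    /// Panics if the provider or the model is not available.
    ///
    /// A sketch of how a selection is typically built before being passed in
    /// (the model string is illustrative):
    ///
    /// ```ignore
    /// let selected = SelectedModel::from_str("anthropic/claude-sonnet-4-latest").unwrap();
    /// let model = Self::load_model(&selected, cx).await?;
    /// ```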
    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        cx.update(|cx| {
            let registry = LanguageModelRegistry::read_global(cx);
            let provider = registry
                .provider(&selected_model.provider)
                .expect("Provider not found");
            provider.authenticate(cx)
        })?
        .await?;
        cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .unwrap_or_else(|| panic!("Model {} not found", selected_model.model.0))
        })
    }

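    /// Runs one eval case end to end: opens the target buffer, prepends a
    /// rendered system prompt unless the conversation already starts with one,
    /// asks the agent to either edit or overwrite the file, and finally grades
    /// the resulting diff with the eval's assertion.
    ///
    /// A hedged sketch of how a case is driven (mirrors `run_eval` above, with
    /// the score threshold used by `eval`):
    ///
    /// ```ignore
    /// let test = EditAgentTest::new(&mut cx).await;
    /// let output = test.eval(eval_input, &mut cx).await?;
    /// assert!(output.assertion.score >= 80);
    /// ```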
    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();

        let tools = crate::built_in_tools().collect::<Vec<_>>();

        let system_prompt = {
            let worktrees = vec![WorktreeContext {
                root_name: "root".to_string(),
                abs_path: Path::new("/path/to/root").into(),
                rules_file: None,
            }];
            let project_context = ProjectContext::new(worktrees, Vec::default());
            let tool_names = tools
                .iter()
                .map(|tool| tool.name.clone().into())
                .collect::<Vec<_>>();
            let template = crate::SystemPromptTemplate {
                project: &project_context,
                available_tools: tool_names,
                model_name: None,
            };
            let templates = Templates::new();
            template.render(&templates).unwrap()
        };

        let has_system_prompt = eval
            .conversation
            .first()
            .is_some_and(|msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let conversation = LanguageModelRequest {
            messages,
            tools,
            thinking_allowed: true,
            ..Default::default()
        };

        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            retry_on_rate_limit(async || {
                self.agent
                    .edit(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        } else {
            retry_on_rate_limit(async || {
                self.agent
                    .overwrite(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        };

        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
}

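/// Retries `request` when the failure looks transient. Rate-limit and
/// server-overloaded errors wait for the server-provided `retry_after`
/// (defaulting to 5s), upstream 429/503/529 responses do the same, and
/// transient I/O or internal server errors back off exponentially
/// (1s, 2s, 4s, ... capped at 30s). A random jitter of up to one extra delay
/// is added before each retry, and the last result is returned after at most
/// 20 attempts.
///
/// A usage sketch mirroring how the helper is called elsewhere in this file:
///
/// ```ignore
/// let response = retry_on_rate_limit(async || {
///     Ok(model
///         .stream_completion_text(request.clone(), &cx.to_async())
///         .await?)
/// })
/// .await?;
/// ```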
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
    const MAX_RETRIES: usize = 20;
    let mut attempt = 0;

    loop {
        attempt += 1;
        let response = request().await;

        if attempt >= MAX_RETRIES {
            return response;
        }

        let retry_delay = match &response {
            Ok(_) => None,
            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
                Some(err) => match &err {
                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
                    }
                    LanguageModelCompletionError::UpstreamProviderError {
                        status,
                        retry_after,
                        ..
                    } => {
                        // Only retry for specific status codes
                        let should_retry = matches!(
                            *status,
                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
                        ) || status.as_u16() == 529;

                        if should_retry {
                            // Use server-provided retry_after if available, otherwise use default
                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
                        } else {
                            None
                        }
                    }
                    LanguageModelCompletionError::ApiReadResponseError { .. }
                    | LanguageModelCompletionError::ApiInternalServerError { .. }
                    | LanguageModelCompletionError::HttpSend { .. } => {
                        // Exponential backoff for transient I/O and internal server errors
                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
                    }
                    _ => None,
                },
                _ => None,
            },
        };

        if let Some(retry_after) = retry_delay {
            let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
            Timer::after(retry_after + jitter).await;
        } else {
            return response;
        }
    }
}

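/// Result of running a single assertion: a score from 0 to 100 (anything below
/// 80 is treated as a failure by `eval`) and an optional message, typically
/// the judge model's raw output.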
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}

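/// Context for the `diff_judge.hbs` template: the sample's unified diff plus
/// the assertions the judge model should grade it against.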
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}

impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}

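/// Drops blank and whitespace-only lines from `text`, joining the remaining
/// lines with `\n`.
///
/// ```ignore
/// assert_eq!(strip_empty_lines("a\n\n   \nb"), "a\nb");
/// ```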
fn strip_empty_lines(text: &str) -> String {
    text.lines()
        .filter(|line| !line.trim().is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}