use super::*;
use crate::{
    EditFileMode, EditFileToolInput, GrepToolInput, ListDirectoryToolInput, ReadFileToolInput,
};
use Role::*;
use client::{Client, UserStore};
use collections::HashMap;
use fs::FakeFs;
use futures::{FutureExt, future::LocalBoxFuture};
use gpui::{AppContext, TestAppContext, Timer};
use http_client::StatusCode;
use indoc::{formatdoc, indoc};
use language_model::{
    LanguageModelRegistry, LanguageModelToolResult, LanguageModelToolResultContent,
    LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
};
use project::Project;
use prompt_store::{ProjectContext, WorktreeContext};
use rand::prelude::*;
use reqwest_client::ReqwestClient;
use serde_json::json;
use std::{
    cmp::Reverse,
    fmt::{self, Display},
    io::Write as _,
    path::Path,
    str::FromStr,
    sync::mpsc,
    time::Duration,
};
use util::path;

#[test]
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well the agent generates multiple edit hunks.
    //
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           |  0.99 (2025-06-14)
    // claude-sonnet-4             |  0.97 (2025-06-14)
    // gemini-2.5-pro-06-05        |  0.98 (2025-06-16)
    // gemini-2.5-flash            |  0.11 (2025-05-22)
    // gpt-4.1                     |  1.00 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 1.0  (2025-06-14)
    // claude-sonnet-4             | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05        | 1.0  (2025-06-16)
    // gemini-2.5-flash            |
    // gpt-4.1                     |

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
fn eval_translate_doc_comments() {
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  1.0  (2025-06-14)
    //  claude-sonnet-4                |  1.0  (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  1.0  (2025-05-22)
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  0.96 (2025-06-14)
    //  claude-sonnet-4                |  0.11 (2025-06-14)
    //  gemini-2.5-pro-preview-latest  |  0.99 (2025-06-16)
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |

    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
fn eval_disable_cursor_blinking() {
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  0.59 (2025-07-14)
    //  claude-sonnet-4                |  0.81 (2025-07-14)
    //  gemini-2.5-pro                 |  0.95 (2025-07-14)
    //  gemini-2.5-flash-preview-04-17 |  0.78 (2025-07-14)
    //  gpt-4.1                        |  0.00 (2025-07-14) (follows edit_description too literally)

    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    let possible_diffs = vec![
        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
    ];
    eval(
        100,
        0.51,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comment everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    //  Model                          | Date        | Pass rate
    // =========================================================
    //  claude-4.0-sonnet              | 2025-06-14  | 0.99
    //  claude-3.7-sonnet              | 2025-06-14  | 0.88
    //  gemini-2.5-pro-preview-06-05   | 2025-06-16  | 0.98
    //  gpt-4.1                        |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing for now, let's see if we can bring this down.
        0.25,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                    - The diff contains a new `from_pixels` constructor
                    - The diff contains new tests for the `from_pixels` constructor
                "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
fn eval_zode() {
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  1.0 (2025-06-14)
    //  claude-sonnet-4                |  1.0 (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  1.0 (2025-05-22)
    //  gemini-2.5-flash-preview-04-17 |  1.0 (2025-05-22)
    //  gpt-4.1                        |  1.0 (2025-05-22)

    let input_file_path = "root/zode.py";
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        50,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
fn eval_add_overwrite_test() {
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  0.65 (2025-06-14)
    //  claude-sonnet-4                |  0.07 (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  0.35 (2025-05-22)
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |

    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                             struct HunkStatus [L1538-1542]
                              range [L1539]
                              diff_status [L1540]
                              old_text [L1541]
                             fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "edit-agent-eval"), ignore)]
fn eval_create_empty_file() {
    // Check that the Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  1.00 (2025-06-14)
    //  claude-sonnet-4                |  1.00 (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  1.00 (2025-05-21)
    //  gemini-2.5-flash-preview-04-17 |  1.00 (2025-05-21)
    //  gpt-4.1                        |  1.00 (2025-05-21)
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"

    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        100,
        0.99,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                        I'll help you create a second empty todo file.
                        First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                        I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                    "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}

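/// Builds a `LanguageModelRequestMessage` with the given role and contents, with caching disabled.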
fn message(
    role: Role,
    contents: impl IntoIterator<Item = MessageContent>,
) -> LanguageModelRequestMessage {
    LanguageModelRequestMessage {
        role,
        content: contents.into_iter().collect(),
        cache: false,
    }
}

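/// Wraps plain text in a `MessageContent::Text`.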
fn text(text: impl Into<String>) -> MessageContent {
    MessageContent::Text(text.into())
}

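/// Extracts the half-open, zero-based `range` of lines from `input`, joined with newlines.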
fn lines(input: &str, range: Range<usize>) -> String {
    input
        .lines()
        .skip(range.start)
        .take(range.len())
        .collect::<Vec<_>>()
        .join("\n")
}

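/// Builds a `MessageContent::ToolUse`, serializing `input` both as pretty-printed JSON text and as a JSON value.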
fn tool_use(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    input: impl Serialize,
) -> MessageContent {
    MessageContent::ToolUse(LanguageModelToolUse {
        id: LanguageModelToolUseId::from(id.into()),
        name: name.into(),
        raw_input: serde_json::to_string_pretty(&input).unwrap(),
        input: serde_json::to_value(input).unwrap(),
        is_input_complete: true,
    })
}

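/// Builds a successful `MessageContent::ToolResult` carrying plain-text output.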
fn tool_result(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    result: impl Into<Arc<str>>,
) -> MessageContent {
    MessageContent::ToolResult(LanguageModelToolResult {
        tool_use_id: LanguageModelToolUseId::from(id.into()),
        tool_name: name.into(),
        is_error: false,
        content: LanguageModelToolResultContent::Text(result.into()),
        output: None,
    })
}

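/// A single eval case: a recorded conversation ending in an `edit_file` tool call, the parsed
/// tool input, the initial file contents (if any), and the assertion used to grade the result.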
#[derive(Clone)]
struct EvalInput {
    conversation: Vec<LanguageModelRequestMessage>,
    edit_file_input: EditFileToolInput,
    input_content: Option<String>,
    assertion: EvalAssertion,
}

impl EvalInput {
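    /// Builds an `EvalInput` by extracting the trailing `edit_file` tool call from `conversation`.
    /// Panics if the conversation doesn't end with an assistant message containing one.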
    fn from_conversation(
        conversation: Vec<LanguageModelRequestMessage>,
        input_content: Option<String>,
        assertion: EvalAssertion,
    ) -> Self {
        let msg = conversation.last().expect("Conversation must not be empty");
        if msg.role != Role::Assistant {
            panic!("Conversation must end with an assistant message");
        }
        let tool_use = msg
            .content
            .iter()
            .flat_map(|content| match content {
                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
                    Some(tool_use)
                }
                _ => None,
            })
            .next()
            .expect("Conversation must end with an edit_file tool use")
            .clone();

        let edit_file_input: EditFileToolInput = serde_json::from_value(tool_use.input).unwrap();

        EvalInput {
            conversation,
            edit_file_input,
            input_content,
            assertion,
        }
    }
}

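/// A single sampled agent run: the buffer text before and after the edit, the agent's raw edit
/// output, and a diff of the change.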
#[derive(Clone)]
struct EvalSample {
    text_before: String,
    text_after: String,
    edit_output: EditAgentOutput,
    diff: String,
}

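/// Object-safe adapter that lets `EvalAssertion` store async assertion closures behind an
/// `Arc<dyn AssertionFn>`.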
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}

impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}

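/// A grading strategy for an `EvalSample`, ranging from exact text comparison to LLM-judged diffs.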
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);

impl EvalAssertion {
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

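    /// Passes only when the edited text equals `expected`, ignoring empty lines.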
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

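    /// Passes when applying any one of `expected_diffs` to the original text yields the edited
    /// text, ignoring empty lines.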
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

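    /// Asks the judge model to grade the sample's diff against `assertions`, parsing a
    /// `<score>N</score>` tag from its response.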
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                thinking_allowed: true,
                ..Default::default()
            };
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output)
                && let Some(score_match) = captures.get(1)
            {
                let score = score_match.as_str().parse().unwrap_or(0);
                return Ok(EvalAssertionOutcome {
                    score,
                    message: Some(output),
                });
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}

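/// Runs the given eval `iterations` times (up to 32 concurrently), then panics if the pass ratio
/// falls below `expected_pass_ratio` or if the ratio of mismatched edit-parser tags exceeds
/// `mismatched_tag_threshold`.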
fn eval(
    iterations: usize,
    expected_pass_ratio: f32,
    mismatched_tag_threshold: f32,
    mut eval: EvalInput,
) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval so that
    // all subsequent runs hit the cache.
1326    eval.conversation.last_mut().unwrap().cache = true;
1327    run_eval(eval.clone(), tx.clone());
1328
1329    let executor = gpui::background_executor();
1330    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
1331    for _ in 1..iterations {
1332        let eval = eval.clone();
1333        let tx = tx.clone();
1334        let semaphore = semaphore.clone();
1335        executor
1336            .spawn(async move {
1337                let _guard = semaphore.acquire().await;
1338                run_eval(eval, tx)
1339            })
1340            .detach();
1341    }
1342    drop(tx);
1343
1344    let mut failed_evals = HashMap::default();
1345    let mut errored_evals = HashMap::default();
1346    let mut eval_outputs = Vec::new();
1347    let mut cumulative_parser_metrics = EditParserMetrics::default();
1348    while let Ok(output) = rx.recv() {
1349        match output {
1350            Ok(output) => {
1351                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
1352                eval_outputs.push(output.clone());
1353                if output.assertion.score < 80 {
1354                    failed_count += 1;
1355                    failed_evals
1356                        .entry(output.sample.text_after.clone())
1357                        .or_insert(Vec::new())
1358                        .push(output);
1359                }
1360            }
1361            Err(error) => {
1362                failed_count += 1;
1363                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
1364            }
1365        }
1366
1367        evaluated_count += 1;
1368        report_progress(evaluated_count, failed_count, iterations);
1369    }
1370
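        // If the run misses the expected pass ratio, dump the errors and failing outputs
        // (most frequent first) before panicking.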
1371    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
1372    println!("Actual pass ratio: {}\n", actual_pass_ratio);
1373    if actual_pass_ratio < expected_pass_ratio {
1374        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
1375        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
1376        for (error, count) in errored_evals {
1377            println!("Eval errored {} times. Error: {}", count, error);
1378        }
1379
1380        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
1381        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
1382        for (_buffer_output, failed_evals) in failed_evals {
1383            let eval_output = failed_evals.first().unwrap();
1384            println!("Eval failed {} times", failed_evals.len());
1385            println!("{}", eval_output);
1386        }
1387
1388        panic!(
1389            "Actual pass ratio: {}\nExpected pass ratio: {}",
1390            actual_pass_ratio, expected_pass_ratio
1391        );
1392    }
1393
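        // Even when the pass ratio is acceptable, fail if the streamed edits contained too
        // high a proportion of mismatched tags.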
1394    let mismatched_tag_ratio =
1395        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
1396    if mismatched_tag_ratio > mismatched_tag_threshold {
1397        for eval_output in eval_outputs {
1398            println!("{}", eval_output);
1399        }
1400        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
1401    }
1402}
1403
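    /// Runs a single eval iteration on its own `TestAppContext`, seeded from OS
    /// randomness, and sends the result back over `tx`.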
1404fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
1405    let dispatcher = gpui::TestDispatcher::new(StdRng::from_os_rng());
1406    let mut cx = TestAppContext::build(dispatcher, None);
1407    let output = cx.executor().block_test(async {
1408        let test = EditAgentTest::new(&mut cx).await;
1409        test.eval(eval, &mut cx).await
1410    });
1411    tx.send(output).unwrap();
1412}
1413
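    /// The edited sample together with the judge's verdict for one eval iteration.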
1414#[derive(Clone)]
1415struct EvalOutput {
1416    sample: EvalSample,
1417    assertion: EvalAssertionOutcome,
1418}
1419
1420impl Display for EvalOutput {
1421    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1422        writeln!(f, "Score: {:?}", self.assertion.score)?;
1423        if let Some(message) = self.assertion.message.as_ref() {
1424            writeln!(f, "Message: {}", message)?;
1425        }
1426
1427        writeln!(f, "Diff:\n{}", self.sample.diff)?;
1428
1429        writeln!(
1430            f,
1431            "Parser Metrics:\n{:#?}",
1432            self.sample.edit_output.parser_metrics
1433        )?;
1434        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1435        Ok(())
1436    }
1437}
1438
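    /// Redraws a single progress line in place (carriage return plus the ANSI
    /// erase-line sequence) with the running pass percentage.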
1439fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
1440    let passed_count = evaluated_count - failed_count;
1441    let passed_ratio = if evaluated_count == 0 {
1442        0.0
1443    } else {
1444        passed_count as f64 / evaluated_count as f64
1445    };
1446    print!(
1447        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
1448        evaluated_count,
1449        iterations,
1450        passed_ratio * 100.0
1451    );
1452    std::io::stdout().flush().unwrap();
1453}
1454
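    /// Test harness holding a real `EditAgent` wired to a test project, plus the model
    /// used to judge its output.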
1455struct EditAgentTest {
1456    agent: EditAgent,
1457    project: Entity<Project>,
1458    judge_model: Arc<dyn LanguageModel>,
1459}
1460
1461impl EditAgentTest {
1462    async fn new(cx: &mut TestAppContext) -> Self {
1463        cx.executor().allow_parking();
1464
1465        let fs = FakeFs::new(cx.executor());
1466        cx.update(|cx| {
1467            settings::init(cx);
1468            gpui_tokio::init(cx);
1469            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1470            cx.set_http_client(http_client);
1471
1472            client::init_settings(cx);
1473            let client = Client::production(cx);
1474            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1475
1476            settings::init(cx);
1477            Project::init_settings(cx);
1478            language::init(cx);
1479            language_model::init(client.clone(), cx);
1480            language_models::init(user_store, client.clone(), cx);
1481        });
1482
1483        fs.insert_tree("/root", json!({})).await;
1484        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
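            // Models default to Claude Sonnet 4 but can be overridden with the
            // ZED_AGENT_MODEL / ZED_JUDGE_MODEL environment variables ("provider/model").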
1485        let agent_model = SelectedModel::from_str(
1486            &std::env::var("ZED_AGENT_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
1487        )
1488        .unwrap();
1489        let judge_model = SelectedModel::from_str(
1490            &std::env::var("ZED_JUDGE_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
1491        )
1492        .unwrap();
1493
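            // Authenticate every registered provider first, then resolve the agent and
            // judge models once authentication has completed.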
1494        let authenticate_provider_tasks = cx.update(|cx| {
1495            LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
1496                registry
1497                    .providers()
1498                    .iter()
1499                    .map(|p| p.authenticate(cx))
1500                    .collect::<Vec<_>>()
1501            })
1502        });
1503        let (agent_model, judge_model) = cx
1504            .update(|cx| {
1505                cx.spawn(async move |cx| {
1506                    futures::future::join_all(authenticate_provider_tasks).await;
1507                    let agent_model = Self::load_model(&agent_model, cx).await;
1508                    let judge_model = Self::load_model(&judge_model, cx).await;
1509                    (agent_model.unwrap(), judge_model.unwrap())
1510                })
1511            })
1512            .await;
1513        let action_log = cx.new(|_| ActionLog::new(project.clone()));
1514
1515        let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();
1516
1517        Self {
1518            agent: EditAgent::new(
1519                agent_model,
1520                project.clone(),
1521                action_log,
1522                Templates::new(),
1523                edit_format,
1524            ),
1525            project,
1526            judge_model,
1527        }
1528    }
1529
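        /// Resolves a `SelectedModel` to a concrete model, authenticating its provider
        /// first and panicking if the provider or model cannot be found.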
1530    async fn load_model(
1531        selected_model: &SelectedModel,
1532        cx: &mut AsyncApp,
1533    ) -> Result<Arc<dyn LanguageModel>> {
1534        cx.update(|cx| {
1535            let registry = LanguageModelRegistry::read_global(cx);
1536            let provider = registry
1537                .provider(&selected_model.provider)
1538                .expect("Provider not found");
1539            provider.authenticate(cx)
1540        })?
1541        .await?;
1542        cx.update(|cx| {
1543            let models = LanguageModelRegistry::read_global(cx);
1544            let model = models
1545                .available_models(cx)
1546                .find(|model| {
1547                    model.provider_id() == selected_model.provider
1548                        && model.id() == selected_model.model
1549                })
1550                .unwrap_or_else(|| panic!("Model {} not found", selected_model.model.0));
1551            model
1552        })
1553    }
1554
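        /// Runs a single edit against the fixture buffer and asks the eval's assertion to
        /// judge the resulting diff.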
1555    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
1556        let path = self
1557            .project
1558            .read_with(cx, |project, cx| {
1559                project.find_project_path(eval.edit_file_input.path, cx)
1560            })
1561            .unwrap();
1562        let buffer = self
1563            .project
1564            .update(cx, |project, cx| project.open_buffer(path, cx))
1565            .await
1566            .unwrap();
1567
1568        let tools = crate::built_in_tools().collect::<Vec<_>>();
1569
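            // Render the crate's system prompt template with a minimal single-worktree
            // project context.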
1570        let system_prompt = {
1571            let worktrees = vec![WorktreeContext {
1572                root_name: "root".to_string(),
1573                abs_path: Path::new("/path/to/root").into(),
1574                rules_file: None,
1575            }];
1576            let project_context = ProjectContext::new(worktrees, Vec::default());
1577            let tool_names = tools
1578                .iter()
1579                .map(|tool| tool.name.clone().into())
1580                .collect::<Vec<_>>();
1581            let template = crate::SystemPromptTemplate {
1582                project: &project_context,
1583                available_tools: tool_names,
1584            };
1585            let templates = Templates::new();
1586            template.render(&templates).unwrap()
1587        };
1588
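            // Prepend the rendered system prompt unless the recorded conversation already
            // starts with a system message.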
1589        let has_system_prompt = eval
1590            .conversation
1591            .first()
1592            .is_some_and(|msg| msg.role == Role::System);
1593        let messages = if has_system_prompt {
1594            eval.conversation
1595        } else {
1596            [LanguageModelRequestMessage {
1597                role: Role::System,
1598                content: vec![MessageContent::Text(system_prompt)],
1599                cache: true,
1600            }]
1601            .into_iter()
1602            .chain(eval.conversation)
1603            .collect::<Vec<_>>()
1604        };
1605
1606        let conversation = LanguageModelRequest {
1607            messages,
1608            tools,
1609            thinking_allowed: true,
1610            ..Default::default()
1611        };
1612
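            // `Edit` mode streams edits into the buffer's existing contents; any other mode
            // has the agent overwrite the file from scratch. Both paths retry on rate limits.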
1613        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1614            if let Some(input_content) = eval.input_content.as_deref() {
1615                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1616            }
1617            retry_on_rate_limit(async || {
1618                self.agent
1619                    .edit(
1620                        buffer.clone(),
1621                        eval.edit_file_input.display_description.clone(),
1622                        &conversation,
1623                        &mut cx.to_async(),
1624                    )
1625                    .0
1626                    .await
1627            })
1628            .await?
1629        } else {
1630            retry_on_rate_limit(async || {
1631                self.agent
1632                    .overwrite(
1633                        buffer.clone(),
1634                        eval.edit_file_input.display_description.clone(),
1635                        &conversation,
1636                        &mut cx.to_async(),
1637                    )
1638                    .0
1639                    .await
1640            })
1641            .await?
1642        };
1643
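            // Judge the unified diff between the original input and the edited buffer.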
1644        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1645        let sample = EvalSample {
1646            edit_output,
1647            diff: language::unified_diff(
1648                eval.input_content.as_deref().unwrap_or_default(),
1649                &buffer_text,
1650            ),
1651            text_before: eval.input_content.unwrap_or_default(),
1652            text_after: buffer_text,
1653        };
1654        let assertion = eval
1655            .assertion
1656            .run(&sample, self.judge_model.clone(), cx)
1657            .await?;
1658
1659        Ok(EvalOutput { assertion, sample })
1660    }
1661}
1662
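    /// Calls `request` until it succeeds or `MAX_RETRIES` attempts have been made,
    /// retrying on rate limiting, server overload, and transient upstream/IO errors,
    /// honoring any server-provided `retry_after` and adding random jitter.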
1663async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
1664    const MAX_RETRIES: usize = 20;
1665    let mut attempt = 0;
1666
1667    loop {
1668        attempt += 1;
1669        let response = request().await;
1670
1671        if attempt >= MAX_RETRIES {
1672            return response;
1673        }
1674
1675        let retry_delay = match &response {
1676            Ok(_) => None,
1677            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
1678                Some(err) => match &err {
1679                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
1680                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
1681                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
1682                    }
1683                    LanguageModelCompletionError::UpstreamProviderError {
1684                        status,
1685                        retry_after,
1686                        ..
1687                    } => {
1688                        // Retry only on 429 (Too Many Requests), 503 (Service Unavailable),
1688                        // or the non-standard 529 (overloaded) status
1689                        let should_retry = matches!(
1690                            *status,
1691                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
1692                        ) || status.as_u16() == 529;
1693
1694                        if should_retry {
1695                            // Use server-provided retry_after if available, otherwise use default
1696                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
1697                        } else {
1698                            None
1699                        }
1700                    }
1701                    LanguageModelCompletionError::ApiReadResponseError { .. }
1702                    | LanguageModelCompletionError::ApiInternalServerError { .. }
1703                    | LanguageModelCompletionError::HttpSend { .. } => {
1704                        // Exponential backoff (1s, 2s, 4s, ..., capped at 30s) for
1704                        // transient I/O and internal server errors
1705                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
1706                    }
1707                    _ => None,
1708                },
1709                _ => None,
1710            },
1711        };
1712
1713        if let Some(retry_after) = retry_delay {
1714            let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
1715            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
1716            Timer::after(retry_after + jitter).await;
1717        } else {
1718            return response;
1719        }
1720    }
1721}
1722
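    /// A judge verdict; `eval` counts scores below 80 as failures.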
1723#[derive(Clone, Debug, Eq, PartialEq, Hash)]
1724struct EvalAssertionOutcome {
1725    score: usize,
1726    message: Option<String>,
1727}
1728
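    /// Context for the `diff_judge.hbs` prompt that asks the judge to score a diff
    /// against a set of assertions.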
1729#[derive(Serialize)]
1730pub struct DiffJudgeTemplate {
1731    diff: String,
1732    assertions: &'static str,
1733}
1734
1735impl Template for DiffJudgeTemplate {
1736    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
1737}
1738
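    /// Drops blank (whitespace-only) lines and rejoins the remainder with `\n`.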
1739fn strip_empty_lines(text: &str) -> String {
1740    text.lines()
1741        .filter(|line| !line.trim().is_empty())
1742        .collect::<Vec<_>>()
1743        .join("\n")
1744}