evals.rs

   1use super::*;
   2use crate::{
   3    ReadFileToolInput,
   4    edit_file_tool::{EditFileMode, EditFileToolInput},
   5    grep_tool::GrepToolInput,
   6    list_directory_tool::ListDirectoryToolInput,
   7};
   8use Role::*;
   9use assistant_tool::ToolRegistry;
  10use client::{Client, UserStore};
  11use collections::HashMap;
  12use fs::FakeFs;
  13use futures::{FutureExt, future::LocalBoxFuture};
  14use gpui::{AppContext, TestAppContext, Timer};
  15use http_client::StatusCode;
  16use indoc::{formatdoc, indoc};
  17use language_model::{
  18    LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
  19    LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
  20};
  21use project::Project;
  22use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
  23use rand::prelude::*;
  24use reqwest_client::ReqwestClient;
  25use serde_json::json;
  26use std::{
  27    cmp::Reverse,
  28    fmt::{self, Display},
  29    io::Write as _,
  30    path::Path,
  31    str::FromStr,
  32    sync::mpsc,
  33    time::Duration,
  34};
  35use util::path;
  36
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well the agent generates multiple edit hunks when extracting a
    // method: the conversation below replays a fixed tool-use history, and only
    // the final `edit_file` response is produced live and judged.
    //
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           |  0.99 (2025-06-14)
    // claude-sonnet-4             |  0.97 (2025-06-14)
    // gemini-2.5-pro-06-05        |  0.98 (2025-06-16)
    // gemini-2.5-flash            |  0.11 (2025-05-22)
    // gpt-4.1                     |  1.00 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // Several extraction sites/orderings are valid; matching any one of these
    // reference diffs counts as a pass.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    // 100 iterations requiring a 0.95 pass rate; the third argument is
    // presumably the tolerated rate of mismatched edit tags (cf. the comment
    // on the 0.25 value in eval_from_pixels_constructor) — confirm.
    eval(
        100,
        0.95,
        0.05,
        // Replayed conversation: user request -> agent reads the file ->
        // tool result carrying the file contents -> the edit_file call whose
        // output is evaluated.
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
 113
 114#[test]
 115#[cfg_attr(not(feature = "eval"), ignore)]
 116fn eval_delete_run_git_blame() {
 117    // Model                       | Pass rate
 118    // ----------------------------|----------
 119    // claude-3.7-sonnet           | 1.0  (2025-06-14)
 120    // claude-sonnet-4             | 0.96 (2025-06-14)
 121    // gemini-2.5-pro-06-05        | 1.0  (2025-06-16)
 122    // gemini-2.5-flash            |
 123    // gpt-4.1                     |
 124    let input_file_path = "root/blame.rs";
 125    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
 126    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
 127    let edit_description = "Delete the `run_git_blame` function.";
 128    eval(
 129        100,
 130        0.95,
 131        0.05,
 132        EvalInput::from_conversation(
 133            vec![
 134                message(
 135                    User,
 136                    [text(formatdoc! {"
 137                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
 138                        one function, not its usages.
 139                    "})],
 140                ),
 141                message(
 142                    Assistant,
 143                    [tool_use(
 144                        "tool_1",
 145                        "read_file",
 146                        ReadFileToolInput {
 147                            path: input_file_path.into(),
 148                            start_line: None,
 149                            end_line: None,
 150                        },
 151                    )],
 152                ),
 153                message(
 154                    User,
 155                    [tool_result("tool_1", "read_file", input_file_content)],
 156                ),
 157                message(
 158                    Assistant,
 159                    [tool_use(
 160                        "tool_2",
 161                        "edit_file",
 162                        EditFileToolInput {
 163                            display_description: edit_description.into(),
 164                            path: input_file_path.into(),
 165                            mode: EditFileMode::Edit,
 166                        },
 167                    )],
 168                ),
 169            ],
 170            Some(input_file_content.into()),
 171            EvalAssertion::assert_eq(output_file_content),
 172        ),
 173    );
 174}
 175
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    // Translate every doc comment in a file to Italian without rewriting the
    // whole file. Graded by an LLM judge on the resulting diff rather than an
    // exact-match fixture.
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  1.0  (2025-06-14)
    //  claude-sonnet-4                |  1.0  (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  1.0  (2025-05-22)
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    // 200 iterations with a required 1.0 pass rate.
    eval(
        200,
        1.,
        0.05,
        // Replayed conversation: user request -> agent reads the file ->
        // tool result with the contents -> the edit_file call under test.
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            // Judge rubric: the diff only needs to satisfy this statement.
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}
 237
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Larger refactor eval: rework `compile_parser_to_wasm` to download and
    // use wasi-sdk instead of emscripten. The replayed agent reads the
    // relevant region of a big file in consecutive line-range chunks rather
    // than all at once.
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  0.96 (2025-06-14)
    //  claude-sonnet-4                |  0.11 (2025-06-14)
    //  gemini-2.5-pro-preview-latest  |  0.99 (2025-06-16)
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                // NOTE(review): the x86_64-linux and arm64-linux assets appear
                // twice in the list below — possibly a copy-paste slip, but the
                // recorded pass rates were measured against this exact prompt,
                // so confirm before changing the string.
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                // First chunked read: lines 971-1050.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                // Second chunk: lines 1050-1100.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                // Third chunk: lines 1100-1150.
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                // The edit_file call whose output is evaluated.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            // Judge rubric applied to the produced diff.
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}
 362
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Comment-out-code eval over a very large file, with the relevant context
    // delivered as scattered grep matches rather than a full read.
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  0.59 (2025-07-14)
    //  claude-sonnet-4                |  0.81 (2025-07-14)
    //  gemini-2.5-pro                 |  0.95 (2025-07-14)
    //  gemini-2.5-flash-preview-04-17 |  0.78 (2025-07-14)
    //  gpt-4.1                        |  0.00 (2025-07-14) (follows edit_description too literally)
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    // Any of these reference diffs counts as a pass.
    let possible_diffs = vec![
        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
    ];
    // Pass-rate bar is only 0.51 here — see the per-model rates above.
    eval(
        100,
        0.51,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                // Synthesized grep output: selected line ranges of the fixture
                // concatenated together. NOTE(review): `join` only inserts the
                // separator *between* elements, so the first chunk carries no
                // "Match found:" header — confirm that's intended.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                // NOTE(review): the tool id jumps from "tool_1" to "tool_4" —
                // presumably mirroring a longer real transcript; verify the
                // harness does not require contiguous ids.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
 447
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    //  Model                          | Date        | Pass rate
    // =========================================================
    //  claude-4.0-sonnet              | 2025-06-14  | 0.99
    //  claude-3.7-sonnet              | 2025-06-14  | 0.88
    //  gemini-2.5-pro-preview-06-05   | 2025-06-16  | 0.98
    //  gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing for now, let's see if we can bring this down.
        0.25,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // The replayed agent probes for an existing test module before
                // editing. NOTE(review): these include patterns reference
                // `font-kit/src/...` while the worktree path is `root/...` —
                // presumably intentional, matching the scripted "No matches
                // found" results; confirm.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                // Broader probe for any #[test] functions; the canned reply
                // below is a verbatim grep transcript.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                // The edit_file call whose output is evaluated.
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            // Judge rubric applied to the produced diff.
            EvalAssertion::judge_diff(indoc! {"
                    - The diff contains a new `from_pixels` constructor
                    - The diff contains new tests for the `from_pixels` constructor
                "}),
        ),
    );
}
 656
 657#[test]
 658#[cfg_attr(not(feature = "eval"), ignore)]
 659fn eval_zode() {
 660    //  Model                          | Pass rate
 661    // ============================================
 662    //
 663    //  claude-3.7-sonnet              |  1.0 (2025-06-14)
 664    //  claude-sonnet-4                |  1.0 (2025-06-14)
 665    //  gemini-2.5-pro-preview-03-25   |  1.0 (2025-05-22)
 666    //  gemini-2.5-flash-preview-04-17 |  1.0 (2025-05-22)
 667    //  gpt-4.1                        |  1.0 (2025-05-22)
 668    let input_file_path = "root/zode.py";
 669    let input_content = None;
 670    let edit_description = "Create the main Zode CLI script";
 671    eval(
 672        50,
 673        1.,
 674        0.05,
 675        EvalInput::from_conversation(
 676            vec![
 677                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
 678                message(
 679                    Assistant,
 680                    [
 681                        tool_use(
 682                            "tool_1",
 683                            "read_file",
 684                            ReadFileToolInput {
 685                                path: "root/eval/react.py".into(),
 686                                start_line: None,
 687                                end_line: None,
 688                            },
 689                        ),
 690                        tool_use(
 691                            "tool_2",
 692                            "read_file",
 693                            ReadFileToolInput {
 694                                path: "root/eval/react_test.py".into(),
 695                                start_line: None,
 696                                end_line: None,
 697                            },
 698                        ),
 699                    ],
 700                ),
 701                message(
 702                    User,
 703                    [
 704                        tool_result(
 705                            "tool_1",
 706                            "read_file",
 707                            include_str!("evals/fixtures/zode/react.py"),
 708                        ),
 709                        tool_result(
 710                            "tool_2",
 711                            "read_file",
 712                            include_str!("evals/fixtures/zode/react_test.py"),
 713                        ),
 714                    ],
 715                ),
 716                message(
 717                    Assistant,
 718                    [
 719                        text(
 720                            "Now that I understand what we need to build, I'll create the main Python script:",
 721                        ),
 722                        tool_use(
 723                            "tool_3",
 724                            "edit_file",
 725                            EditFileToolInput {
 726                                display_description: edit_description.into(),
 727                                path: input_file_path.into(),
 728                                mode: EditFileMode::Create,
 729                            },
 730                        ),
 731                    ],
 732                ),
 733            ],
 734            input_content,
 735            EvalAssertion::new(async move |sample, _, _cx| {
 736                let invalid_starts = [' ', '`', '\n'];
 737                let mut message = String::new();
 738                for start in invalid_starts {
 739                    if sample.text_after.starts_with(start) {
 740                        message.push_str(&format!("The sample starts with a {:?}\n", start));
 741                        break;
 742                    }
 743                }
 744                // Remove trailing newline.
 745                message.pop();
 746
 747                if message.is_empty() {
 748                    Ok(EvalAssertionOutcome {
 749                        score: 100,
 750                        message: None,
 751                    })
 752                } else {
 753                    Ok(EvalAssertionOutcome {
 754                        score: 0,
 755                        message: Some(message),
 756                    })
 757                }
 758            }),
 759        ),
 760    );
 761}
 762
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  0.65 (2025-06-14)
    //  claude-sonnet-4                |  0.07 (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  0.35 (2025-05-22)
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |
    //
    // Replays a recorded conversation in which the agent explored
    // `action_log.rs` and then issued an `edit_file` call; the edit agent's
    // resulting diff is graded by a judge model against the assertion below.
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200, // iterations
        0.5, // TODO: make this eval better
        0.05, // maximum tolerated ratio of mismatched edit-parser tags
        EvalInput::from_conversation(
            vec![
                // Turn 1: the user asks for a new overwrite test.
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                // Turn 2: the assistant requests the file's outline.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                // Turn 3: canned outline of `action_log.rs`, as the read_file
                // tool would summarize it.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                             struct HunkStatus [L1538-1542]
                              range [L1539]
                              diff_status [L1540]
                              old_text [L1541]
                             fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                // Turn 4: the assistant reads test_creating_files as a pattern.
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                // Turn 5: the assistant reads test_deleting_files.
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                // Turn 6: the assistant inspects `buffer_created` itself.
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                // Final turn: the assistant announces its plan and calls
                // edit_file — this is the tool use the eval replays.
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
 993
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  1.00 (2025-06-14)
    //  claude-sonnet-4                |  1.00 (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  1.00 (2025-05-21)
    //  gemini-2.5-flash-preview-04-17 |  1.00 (2025-05-21)
    //  gpt-4.1                        |  1.00 (2025-05-21)
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    // The file does not exist yet, and the expected output is empty.
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        100, // iterations
        0.99, // expected pass ratio
        0.05, // maximum tolerated ratio of mismatched edit-parser tags
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                // The assistant lists the directory to choose a file name...
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                        I'll help you create a second empty todo file.
                        First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                // ...then creates `TODO3` via the edit_file tool; this final
                // tool use is what the eval replays.
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                        I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                    "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
1072
1073fn message(
1074    role: Role,
1075    contents: impl IntoIterator<Item = MessageContent>,
1076) -> LanguageModelRequestMessage {
1077    LanguageModelRequestMessage {
1078        role,
1079        content: contents.into_iter().collect(),
1080        cache: false,
1081    }
1082}
1083
1084fn text(text: impl Into<String>) -> MessageContent {
1085    MessageContent::Text(text.into())
1086}
1087
/// Returns the 0-indexed, half-open `range` of lines from `input`, joined
/// with newlines (no trailing newline). Out-of-bounds ranges yield "".
fn lines(input: &str, range: Range<usize>) -> String {
    input
        .lines()
        .enumerate()
        .filter_map(|(ix, line)| range.contains(&ix).then_some(line))
        .collect::<Vec<_>>()
        .join("\n")
}
1096
1097fn tool_use(
1098    id: impl Into<Arc<str>>,
1099    name: impl Into<Arc<str>>,
1100    input: impl Serialize,
1101) -> MessageContent {
1102    MessageContent::ToolUse(LanguageModelToolUse {
1103        id: LanguageModelToolUseId::from(id.into()),
1104        name: name.into(),
1105        raw_input: serde_json::to_string_pretty(&input).unwrap(),
1106        input: serde_json::to_value(input).unwrap(),
1107        is_input_complete: true,
1108    })
1109}
1110
1111fn tool_result(
1112    id: impl Into<Arc<str>>,
1113    name: impl Into<Arc<str>>,
1114    result: impl Into<Arc<str>>,
1115) -> MessageContent {
1116    MessageContent::ToolResult(LanguageModelToolResult {
1117        tool_use_id: LanguageModelToolUseId::from(id.into()),
1118        tool_name: name.into(),
1119        is_error: false,
1120        content: LanguageModelToolResultContent::Text(result.into()),
1121        output: None,
1122    })
1123}
1124
/// A single eval case: a recorded conversation ending in an `edit_file` tool
/// use, the initial file contents (if any), and how to grade the result.
#[derive(Clone)]
struct EvalInput {
    // Full request history replayed to the agent model.
    conversation: Vec<LanguageModelRequestMessage>,
    // Parsed input of the final `edit_file` tool use in `conversation`.
    edit_file_input: EditFileToolInput,
    // Contents of the target file before the edit; `None` for new files.
    input_content: Option<String>,
    // Grading strategy applied to the produced edit.
    assertion: EvalAssertion,
}
1132
1133impl EvalInput {
1134    fn from_conversation(
1135        conversation: Vec<LanguageModelRequestMessage>,
1136        input_content: Option<String>,
1137        assertion: EvalAssertion,
1138    ) -> Self {
1139        let msg = conversation.last().expect("Conversation must not be empty");
1140        if msg.role != Role::Assistant {
1141            panic!("Conversation must end with an assistant message");
1142        }
1143        let tool_use = msg
1144            .content
1145            .iter()
1146            .flat_map(|content| match content {
1147                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1148                    Some(tool_use)
1149                }
1150                _ => None,
1151            })
1152            .next()
1153            .expect("Conversation must end with an edit_file tool use")
1154            .clone();
1155
1156        let edit_file_input: EditFileToolInput =
1157            serde_json::from_value(tool_use.input.clone()).unwrap();
1158
1159        EvalInput {
1160            conversation,
1161            edit_file_input,
1162            input_content,
1163            assertion,
1164        }
1165    }
1166}
1167
/// The outcome of one edit-agent run, captured for grading.
#[derive(Clone)]
struct EvalSample {
    // Buffer text before the agent's edit.
    text_before: String,
    // Buffer text after the agent's edit.
    text_after: String,
    // The agent's output, including raw streamed edits and parser metrics.
    edit_output: EditAgentOutput,
    // Diff between `text_before` and `text_after`, shown on failure.
    diff: String,
}
1175
/// Object-safe adapter for async grading callbacks; see [`EvalAssertion`].
trait AssertionFn: 'static + Send + Sync {
    /// Grades `sample`, optionally consulting `judge_model`.
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1184
// Blanket impl: any suitable async closure is an `AssertionFn`. The returned
// future is boxed (`LocalBoxFuture`) so the trait stays object-safe.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1205
/// Cloneable handle to a grading function shared across eval iterations.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1208
impl EvalAssertion {
    /// Wraps an async grading closure in an `EvalAssertion`.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Scores 100 iff the edited text equals `expected`, comparing with
    /// empty lines stripped from both sides; 0 otherwise.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Scores 100 iff applying ANY of `expected_diffs` to the original text
    /// reproduces the edited text (again ignoring empty lines).
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// Asks the judge model to grade the produced diff against free-form
    /// `assertions`, expecting a `<score>N</score>` tag in its reply.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                thinking_allowed: true,
                ..Default::default()
            };
            // Many evals run concurrently and can hit provider rate limits,
            // so retry the judge completion as needed.
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            // Accumulate the streamed completion into a single string.
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output)
                && let Some(score_match) = captures.get(1)
            {
                // Unparseable digits fall back to 0 rather than erroring.
                let score = score_match.as_str().parse().unwrap_or(0);
                return Ok(EvalAssertionOutcome {
                    score,
                    message: Some(output),
                });
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Runs the wrapped grading function against `input`.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1309
/// Runs `eval` for `iterations` iterations (up to 32 concurrently) and panics
/// if the pass ratio falls below `expected_pass_ratio`, or if the edit parser
/// produced more than `mismatched_tag_threshold` mismatched tags overall.
fn eval(
    iterations: usize,
    expected_pass_ratio: f32,
    mismatched_tag_threshold: f32,
    mut eval: EvalInput,
) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval so that
    // all the next ones are cached.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    let executor = gpui::background_executor();
    // Cap concurrency so we don't hammer the model provider.
    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        let semaphore = semaphore.clone();
        executor
            .spawn(async move {
                let _guard = semaphore.acquire().await;
                run_eval(eval, tx)
            })
            .detach();
    }
    // Drop our sender so `rx` disconnects once every spawned eval finishes.
    drop(tx);

    // Failures are grouped by the resulting buffer text; errors by message.
    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                // Any score below 80 counts as a failure.
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text_after.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        // Report the most frequent errors and failures first.
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    // Even when assertions pass, fail the eval if the streamed edits had too
    // many mismatched tags relative to the total tags seen.
    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > mismatched_tag_threshold {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}
1401
/// Runs a single eval iteration on a fresh, randomly-seeded test app context
/// and reports the result over `tx`.
fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
    let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
    let mut cx = TestAppContext::build(dispatcher, None);
    let output = cx.executor().block_test(async {
        let test = EditAgentTest::new(&mut cx).await;
        test.eval(eval, &mut cx).await
    });
    tx.send(output).unwrap();
}
1411
/// The sample produced by one eval run, together with its grade.
#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}
1417
1418impl Display for EvalOutput {
1419    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1420        writeln!(f, "Score: {:?}", self.assertion.score)?;
1421        if let Some(message) = self.assertion.message.as_ref() {
1422            writeln!(f, "Message: {}", message)?;
1423        }
1424
1425        writeln!(f, "Diff:\n{}", self.sample.diff)?;
1426
1427        writeln!(
1428            f,
1429            "Parser Metrics:\n{:#?}",
1430            self.sample.edit_output.parser_metrics
1431        )?;
1432        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1433        Ok(())
1434    }
1435}
1436
/// Rewrites the current terminal line (`\r` + ANSI clear-to-end) with a
/// running pass tally, guarding against division by zero before any eval
/// has completed.
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_count = evaluated_count - failed_count;
    let passed_ratio = match evaluated_count {
        0 => 0.0,
        n => passed_count as f64 / n as f64,
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    // `print!` doesn't flush, so force the partial line out immediately.
    std::io::stdout().flush().unwrap();
}
1452
/// Harness bundling the edit agent under test, its fake project, and the
/// judge model used by LLM-graded assertions.
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}
1458
1459impl EditAgentTest {
    /// Builds the eval harness: a fake project rooted at `/root`, plus the
    /// agent and judge models selected via the `ZED_AGENT_MODEL` /
    /// `ZED_JUDGE_MODEL` env vars (both default to
    /// `anthropic/claude-3-7-sonnet-latest`).
    async fn new(cx: &mut TestAppContext) -> Self {
        // Evals perform real network I/O, so let the test executor park.
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor().clone());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);

            client::init_settings(cx);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

            // NOTE(review): `settings::init` was already called above —
            // presumably harmless, but confirm the second call is intended.
            settings::init(cx);
            Project::init_settings(cx);
            language::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store.clone(), client.clone(), cx);
            crate::init(client.http_client(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        // Resolve and authenticate both models before building the agent.
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();

        Self {
            agent: EditAgent::new(
                agent_model,
                project.clone(),
                action_log,
                Templates::new(),
                edit_format,
            ),
            project,
            judge_model,
        }
    }
1519
    /// Looks up `selected_model` in the global language-model registry and
    /// authenticates its provider before returning the model handle.
    ///
    /// # Panics
    ///
    /// Panics if the model or its provider is not registered.
    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        let (provider, model) = cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .expect("Model not found");
            let provider = models.provider(&model.provider_id()).unwrap();
            (provider, model)
        })?;
        cx.update(|cx| provider.authenticate(cx))?.await?;
        Ok(model)
    }
1539
    /// Runs one eval case end-to-end and returns the judged result.
    ///
    /// Steps: open the target buffer in the test project, collect the
    /// registered tools and a generated system prompt, send the eval's
    /// conversation to the agent (either `edit` or `overwrite` depending on
    /// the requested mode), then diff the buffer and let the eval's
    /// assertion score the sample using the judge model.
    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        // Resolve the eval's file path within the test project…
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        // …and open it as a buffer for the agent to edit.
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();
        // Advertise every registered tool whose input schema can be rendered
        // in the agent model's tool input format; incompatible tools are
        // silently dropped by the `ok()?`.
        let tools = cx.update(|cx| {
            ToolRegistry::default_global(cx)
                .tools()
                .into_iter()
                .filter_map(|tool| {
                    let input_schema = tool
                        .input_schema(self.agent.model.tool_input_format())
                        .ok()?;
                    Some(LanguageModelRequestTool {
                        name: tool.name(),
                        description: tool.description(),
                        input_schema,
                    })
                })
                .collect::<Vec<_>>()
        });
        let tool_names = tools
            .iter()
            .map(|tool| tool.name.clone())
            .collect::<Vec<_>>();
        // Minimal, synthetic worktree context for the system prompt — evals
        // run against a fake filesystem, so the absolute path is arbitrary.
        let worktrees = vec![WorktreeContext {
            root_name: "root".to_string(),
            abs_path: Path::new("/path/to/root").into(),
            rules_file: None,
        }];
        let prompt_builder = PromptBuilder::new(None)?;
        let project_context = ProjectContext::new(worktrees, Vec::default());
        let system_prompt = prompt_builder.generate_assistant_system_prompt(
            &project_context,
            &ModelContext {
                available_tools: tool_names,
            },
        )?;

        // Only prepend the generated system prompt when the eval's recorded
        // conversation doesn't already start with one.
        let has_system_prompt = eval
            .conversation
            .first()
            .is_some_and(|msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let conversation = LanguageModelRequest {
            messages,
            tools,
            thinking_allowed: true,
            ..Default::default()
        };

        // Drive the agent, retrying transparently on rate limits. In `Edit`
        // mode the buffer is first seeded with the eval's input content (when
        // provided); otherwise the agent rewrites the buffer wholesale.
        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            retry_on_rate_limit(async || {
                self.agent
                    .edit(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        } else {
            retry_on_rate_limit(async || {
                self.agent
                    .overwrite(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        };

        // Capture the edited text and a unified diff against the original
        // content so the assertion (and judge model) can inspect the change.
        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
1658}
1659
1660async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
1661    const MAX_RETRIES: usize = 20;
1662    let mut attempt = 0;
1663
1664    loop {
1665        attempt += 1;
1666        let response = request().await;
1667
1668        if attempt >= MAX_RETRIES {
1669            return response;
1670        }
1671
1672        let retry_delay = match &response {
1673            Ok(_) => None,
1674            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
1675                Some(err) => match &err {
1676                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
1677                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
1678                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
1679                    }
1680                    LanguageModelCompletionError::UpstreamProviderError {
1681                        status,
1682                        retry_after,
1683                        ..
1684                    } => {
1685                        // Only retry for specific status codes
1686                        let should_retry = matches!(
1687                            *status,
1688                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
1689                        ) || status.as_u16() == 529;
1690
1691                        if should_retry {
1692                            // Use server-provided retry_after if available, otherwise use default
1693                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
1694                        } else {
1695                            None
1696                        }
1697                    }
1698                    LanguageModelCompletionError::ApiReadResponseError { .. }
1699                    | LanguageModelCompletionError::ApiInternalServerError { .. }
1700                    | LanguageModelCompletionError::HttpSend { .. } => {
1701                        // Exponential backoff for transient I/O and internal server errors
1702                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
1703                    }
1704                    _ => None,
1705                },
1706                _ => None,
1707            },
1708        };
1709
1710        if let Some(retry_after) = retry_delay {
1711            let jitter = retry_after.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
1712            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
1713            Timer::after(retry_after + jitter).await;
1714        } else {
1715            return response;
1716        }
1717    }
1718}
1719
/// Outcome of running an eval's assertion against a produced sample.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    // Numeric score assigned by the assertion. NOTE(review): the scale isn't
    // visible in this file section — confirm against the assertion producer.
    score: usize,
    // Optional human-readable explanation accompanying the score.
    message: Option<String>,
}
1725
/// Serializable context for the `diff_judge.hbs` prompt used to ask the
/// judge model to evaluate an agent-produced diff against assertions.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    // Unified diff of the change under evaluation.
    diff: String,
    // Assertion text the judge checks the diff against.
    assertions: &'static str,
}
1731
impl Template for DiffJudgeTemplate {
    // Name of the template file this context is rendered with.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1735
/// Drops every line that is empty or whitespace-only, rejoining the
/// remaining lines with `\n` (no trailing newline).
fn strip_empty_lines(text: &str) -> String {
    let mut kept = Vec::new();
    for line in text.lines() {
        // Keep the line only if it contains at least one non-whitespace char.
        if line.chars().any(|c| !c.is_whitespace()) {
            kept.push(line);
        }
    }
    kept.join("\n")
}