// evals.rs — evaluation tests for the agent's `edit_file` tool.

   1use super::*;
   2use crate::{
   3    ReadFileToolInput,
   4    edit_file_tool::{EditFileMode, EditFileToolInput},
   5    grep_tool::GrepToolInput,
   6    list_directory_tool::ListDirectoryToolInput,
   7};
   8use Role::*;
   9use assistant_tool::ToolRegistry;
  10use client::{Client, UserStore};
  11use collections::HashMap;
  12use fs::FakeFs;
  13use futures::{FutureExt, future::LocalBoxFuture};
  14use gpui::{AppContext, TestAppContext, Timer};
  15use indoc::{formatdoc, indoc};
  16use language_model::{
  17    LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
  18    LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
  19};
  20use project::Project;
  21use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
  22use rand::prelude::*;
  23use reqwest_client::ReqwestClient;
  24use serde_json::json;
  25use std::{
  26    cmp::Reverse,
  27    fmt::{self, Display},
  28    io::Write as _,
  29    path::Path,
  30    str::FromStr,
  31    sync::mpsc,
  32    time::Duration,
  33};
  34use util::path;
  35
  36#[test]
  37#[cfg_attr(not(feature = "eval"), ignore)]
  38fn eval_extract_handle_command_output() {
  39    // Test how well agent generates multiple edit hunks.
  40    //
  41    // Model                       | Pass rate
  42    // ----------------------------|----------
  43    // claude-3.7-sonnet           |  0.99 (2025-06-14)
  44    // claude-sonnet-4             |  0.97 (2025-06-14)
  45    // gemini-2.5-pro-06-05        |  0.98 (2025-06-16)
  46    // gemini-2.5-flash            |  0.11 (2025-05-22)
  47    // gpt-4.1                     |  1.00 (2025-05-22)
  48
  49    let input_file_path = "root/blame.rs";
  50    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
  51    let possible_diffs = vec![
  52        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
  53        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
  54        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
  55        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
  56        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
  57        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
  58        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
  59    ];
  60    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
  61    eval(
  62        100,
  63        0.95,
  64        0.05,
  65        EvalInput::from_conversation(
  66            vec![
  67                message(
  68                    User,
  69                    [text(formatdoc! {"
  70                        Read the `{input_file_path}` file and extract a method in
  71                        the final stanza of `run_git_blame` to deal with command failures,
  72                        call it `handle_command_output` and take the std::process::Output as the only parameter.
  73                        Do not document the method and do not add any comments.
  74
  75                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
  76                    "})],
  77                ),
  78                message(
  79                    Assistant,
  80                    [tool_use(
  81                        "tool_1",
  82                        "read_file",
  83                        ReadFileToolInput {
  84                            path: input_file_path.into(),
  85                            start_line: None,
  86                            end_line: None,
  87                        },
  88                    )],
  89                ),
  90                message(
  91                    User,
  92                    [tool_result("tool_1", "read_file", input_file_content)],
  93                ),
  94                message(
  95                    Assistant,
  96                    [tool_use(
  97                        "tool_2",
  98                        "edit_file",
  99                        EditFileToolInput {
 100                            display_description: edit_description.into(),
 101                            path: input_file_path.into(),
 102                            mode: EditFileMode::Edit,
 103                        },
 104                    )],
 105                ),
 106            ],
 107            Some(input_file_content.into()),
 108            EvalAssertion::assert_diff_any(possible_diffs),
 109        ),
 110    );
 111}
 112
 113#[test]
 114#[cfg_attr(not(feature = "eval"), ignore)]
 115fn eval_delete_run_git_blame() {
 116    // Model                       | Pass rate
 117    // ----------------------------|----------
 118    // claude-3.7-sonnet           | 1.0  (2025-06-14)
 119    // claude-sonnet-4             | 0.96 (2025-06-14)
 120    // gemini-2.5-pro-06-05        | 1.0  (2025-06-16)
 121    // gemini-2.5-flash            |
 122    // gpt-4.1                     |
 123    let input_file_path = "root/blame.rs";
 124    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
 125    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
 126    let edit_description = "Delete the `run_git_blame` function.";
 127    eval(
 128        100,
 129        0.95,
 130        0.05,
 131        EvalInput::from_conversation(
 132            vec![
 133                message(
 134                    User,
 135                    [text(formatdoc! {"
 136                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
 137                        one function, not its usages.
 138                    "})],
 139                ),
 140                message(
 141                    Assistant,
 142                    [tool_use(
 143                        "tool_1",
 144                        "read_file",
 145                        ReadFileToolInput {
 146                            path: input_file_path.into(),
 147                            start_line: None,
 148                            end_line: None,
 149                        },
 150                    )],
 151                ),
 152                message(
 153                    User,
 154                    [tool_result("tool_1", "read_file", input_file_content)],
 155                ),
 156                message(
 157                    Assistant,
 158                    [tool_use(
 159                        "tool_2",
 160                        "edit_file",
 161                        EditFileToolInput {
 162                            display_description: edit_description.into(),
 163                            path: input_file_path.into(),
 164                            mode: EditFileMode::Edit,
 165                        },
 166                    )],
 167                ),
 168            ],
 169            Some(input_file_content.into()),
 170            EvalAssertion::assert_eq(output_file_content),
 171        ),
 172    );
 173}
 174
 175#[test]
 176#[cfg_attr(not(feature = "eval"), ignore)]
 177fn eval_translate_doc_comments() {
 178    //  Model                          | Pass rate
 179    // ============================================
 180    //
 181    //  claude-3.7-sonnet              |  1.0  (2025-06-14)
 182    //  claude-sonnet-4                |  1.0  (2025-06-14)
 183    //  gemini-2.5-pro-preview-03-25   |  1.0  (2025-05-22)
 184    //  gemini-2.5-flash-preview-04-17 |
 185    //  gpt-4.1                        |
 186    let input_file_path = "root/canvas.rs";
 187    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
 188    let edit_description = "Translate all doc comments to Italian";
 189    eval(
 190        200,
 191        1.,
 192        0.05,
 193        EvalInput::from_conversation(
 194            vec![
 195                message(
 196                    User,
 197                    [text(formatdoc! {"
 198                        Read the {input_file_path} file and edit it (without overwriting it),
 199                        translating all the doc comments to italian.
 200                    "})],
 201                ),
 202                message(
 203                    Assistant,
 204                    [tool_use(
 205                        "tool_1",
 206                        "read_file",
 207                        ReadFileToolInput {
 208                            path: input_file_path.into(),
 209                            start_line: None,
 210                            end_line: None,
 211                        },
 212                    )],
 213                ),
 214                message(
 215                    User,
 216                    [tool_result("tool_1", "read_file", input_file_content)],
 217                ),
 218                message(
 219                    Assistant,
 220                    [tool_use(
 221                        "tool_2",
 222                        "edit_file",
 223                        EditFileToolInput {
 224                            display_description: edit_description.into(),
 225                            path: input_file_path.into(),
 226                            mode: EditFileMode::Edit,
 227                        },
 228                    )],
 229                ),
 230            ],
 231            Some(input_file_content.into()),
 232            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
 233        ),
 234    );
 235}
 236
 237#[test]
 238#[cfg_attr(not(feature = "eval"), ignore)]
 239fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
 240    //  Model                          | Pass rate
 241    // ============================================
 242    //
 243    //  claude-3.7-sonnet              |  0.96 (2025-06-14)
 244    //  claude-sonnet-4                |  0.11 (2025-06-14)
 245    //  gemini-2.5-pro-preview-latest  |  0.99 (2025-06-16)
 246    //  gemini-2.5-flash-preview-04-17 |
 247    //  gpt-4.1                        |
 248    let input_file_path = "root/lib.rs";
 249    let input_file_content =
 250        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
 251    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
 252    eval(
 253        100,
 254        0.95,
 255        0.05,
 256        EvalInput::from_conversation(
 257            vec![
 258                message(
 259                    User,
 260                    [text(formatdoc! {"
 261                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
 262                        Use `ureq` to download the SDK for the current platform and architecture.
 263                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
 264                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
 265                        that's inside of the archive.
 266                        Don't re-download the SDK if that executable already exists.
 267
 268                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}
 269
 270                        Here are the available wasi-sdk assets:
 271                        - wasi-sdk-25.0-x86_64-macos.tar.gz
 272                        - wasi-sdk-25.0-arm64-macos.tar.gz
 273                        - wasi-sdk-25.0-x86_64-linux.tar.gz
 274                        - wasi-sdk-25.0-arm64-linux.tar.gz
 275                        - wasi-sdk-25.0-x86_64-linux.tar.gz
 276                        - wasi-sdk-25.0-arm64-linux.tar.gz
 277                        - wasi-sdk-25.0-x86_64-windows.tar.gz
 278                    "})],
 279                ),
 280                message(
 281                    Assistant,
 282                    [tool_use(
 283                        "tool_1",
 284                        "read_file",
 285                        ReadFileToolInput {
 286                            path: input_file_path.into(),
 287                            start_line: Some(971),
 288                            end_line: Some(1050),
 289                        },
 290                    )],
 291                ),
 292                message(
 293                    User,
 294                    [tool_result(
 295                        "tool_1",
 296                        "read_file",
 297                        lines(input_file_content, 971..1050),
 298                    )],
 299                ),
 300                message(
 301                    Assistant,
 302                    [tool_use(
 303                        "tool_2",
 304                        "read_file",
 305                        ReadFileToolInput {
 306                            path: input_file_path.into(),
 307                            start_line: Some(1050),
 308                            end_line: Some(1100),
 309                        },
 310                    )],
 311                ),
 312                message(
 313                    User,
 314                    [tool_result(
 315                        "tool_2",
 316                        "read_file",
 317                        lines(input_file_content, 1050..1100),
 318                    )],
 319                ),
 320                message(
 321                    Assistant,
 322                    [tool_use(
 323                        "tool_3",
 324                        "read_file",
 325                        ReadFileToolInput {
 326                            path: input_file_path.into(),
 327                            start_line: Some(1100),
 328                            end_line: Some(1150),
 329                        },
 330                    )],
 331                ),
 332                message(
 333                    User,
 334                    [tool_result(
 335                        "tool_3",
 336                        "read_file",
 337                        lines(input_file_content, 1100..1150),
 338                    )],
 339                ),
 340                message(
 341                    Assistant,
 342                    [tool_use(
 343                        "tool_4",
 344                        "edit_file",
 345                        EditFileToolInput {
 346                            display_description: edit_description.into(),
 347                            path: input_file_path.into(),
 348                            mode: EditFileMode::Edit,
 349                        },
 350                    )],
 351                ),
 352            ],
 353            Some(input_file_content.into()),
 354            EvalAssertion::judge_diff(indoc! {"
 355                - The compile_parser_to_wasm method has been changed to use wasi-sdk
 356                - ureq is used to download the SDK for current platform and architecture
 357            "}),
 358        ),
 359    );
 360}
 361
 362#[test]
 363#[cfg_attr(not(feature = "eval"), ignore)]
 364fn eval_disable_cursor_blinking() {
 365    //  Model                          | Pass rate
 366    // ============================================
 367    //
 368    //  claude-3.7-sonnet              |  0.59 (2025-07-14)
 369    //  claude-sonnet-4                |  0.81 (2025-07-14)
 370    //  gemini-2.5-pro                 |  0.95 (2025-07-14)
 371    //  gemini-2.5-flash-preview-04-17 |  0.78 (2025-07-14)
 372    //  gpt-4.1                        |  0.00 (2025-07-14) (follows edit_description too literally)
 373    let input_file_path = "root/editor.rs";
 374    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
 375    let edit_description = "Comment out the call to `BlinkManager::enable`";
 376    let possible_diffs = vec![
 377        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
 378        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
 379        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
 380        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
 381    ];
 382    eval(
 383        100,
 384        0.51,
 385        0.05,
 386        EvalInput::from_conversation(
 387            vec![
 388                message(User, [text("Let's research how to cursor blinking works.")]),
 389                message(
 390                    Assistant,
 391                    [tool_use(
 392                        "tool_1",
 393                        "grep",
 394                        GrepToolInput {
 395                            regex: "blink".into(),
 396                            include_pattern: None,
 397                            offset: 0,
 398                            case_sensitive: false,
 399                        },
 400                    )],
 401                ),
 402                message(
 403                    User,
 404                    [tool_result(
 405                        "tool_1",
 406                        "grep",
 407                        [
 408                            lines(input_file_content, 100..400),
 409                            lines(input_file_content, 800..1300),
 410                            lines(input_file_content, 1600..2000),
 411                            lines(input_file_content, 5000..5500),
 412                            lines(input_file_content, 8000..9000),
 413                            lines(input_file_content, 18455..18470),
 414                            lines(input_file_content, 20000..20500),
 415                            lines(input_file_content, 21000..21300),
 416                        ]
 417                        .join("Match found:\n\n"),
 418                    )],
 419                ),
 420                message(
 421                    User,
 422                    [text(indoc! {"
 423                        Comment out the lines that interact with the BlinkManager.
 424                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
 425                        Don't add additional comments.
 426                    "})],
 427                ),
 428                message(
 429                    Assistant,
 430                    [tool_use(
 431                        "tool_4",
 432                        "edit_file",
 433                        EditFileToolInput {
 434                            display_description: edit_description.into(),
 435                            path: input_file_path.into(),
 436                            mode: EditFileMode::Edit,
 437                        },
 438                    )],
 439                ),
 440            ],
 441            Some(input_file_content.into()),
 442            EvalAssertion::assert_diff_any(possible_diffs),
 443        ),
 444    );
 445}
 446
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    //  Model                          | Date        | Pass rate
    // =========================================================
    //  claude-4.0-sonnet              | 2025-06-14  | 0.99
    //  claude-3.7-sonnet              | 2025-06-14  | 0.88
    //  gemini-2.5-pro-preview-06-05   | 2025-06-16  | 0.98
    //  gpt-4.1                        |
    // Eval: ask the agent to add a `from_pixels` constructor plus tests to
    // canvas.rs, after a research phase that greps the repo for existing tests.
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing for now, let's see if we can bring this down.
        0.25,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                // The agent first reads the whole target file.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // canvas.rs has no tests module yet, so the first two greps
                // come back empty and the agent widens its search.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                // Verbatim grep-tool transcript: existing test modules elsewhere
                // in the repo that the agent can imitate when writing new tests.
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                    - The diff contains a new `from_pixels` constructor
                    - The diff contains new tests for the `from_pixels` constructor
                "}),
        ),
    );
}
 655
 656#[test]
 657#[cfg_attr(not(feature = "eval"), ignore)]
 658fn eval_zode() {
 659    //  Model                          | Pass rate
 660    // ============================================
 661    //
 662    //  claude-3.7-sonnet              |  1.0 (2025-06-14)
 663    //  claude-sonnet-4                |  1.0 (2025-06-14)
 664    //  gemini-2.5-pro-preview-03-25   |  1.0 (2025-05-22)
 665    //  gemini-2.5-flash-preview-04-17 |  1.0 (2025-05-22)
 666    //  gpt-4.1                        |  1.0 (2025-05-22)
 667    let input_file_path = "root/zode.py";
 668    let input_content = None;
 669    let edit_description = "Create the main Zode CLI script";
 670    eval(
 671        50,
 672        1.,
 673        0.05,
 674        EvalInput::from_conversation(
 675            vec![
 676                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
 677                message(
 678                    Assistant,
 679                    [
 680                        tool_use(
 681                            "tool_1",
 682                            "read_file",
 683                            ReadFileToolInput {
 684                                path: "root/eval/react.py".into(),
 685                                start_line: None,
 686                                end_line: None,
 687                            },
 688                        ),
 689                        tool_use(
 690                            "tool_2",
 691                            "read_file",
 692                            ReadFileToolInput {
 693                                path: "root/eval/react_test.py".into(),
 694                                start_line: None,
 695                                end_line: None,
 696                            },
 697                        ),
 698                    ],
 699                ),
 700                message(
 701                    User,
 702                    [
 703                        tool_result(
 704                            "tool_1",
 705                            "read_file",
 706                            include_str!("evals/fixtures/zode/react.py"),
 707                        ),
 708                        tool_result(
 709                            "tool_2",
 710                            "read_file",
 711                            include_str!("evals/fixtures/zode/react_test.py"),
 712                        ),
 713                    ],
 714                ),
 715                message(
 716                    Assistant,
 717                    [
 718                        text(
 719                            "Now that I understand what we need to build, I'll create the main Python script:",
 720                        ),
 721                        tool_use(
 722                            "tool_3",
 723                            "edit_file",
 724                            EditFileToolInput {
 725                                display_description: edit_description.into(),
 726                                path: input_file_path.into(),
 727                                mode: EditFileMode::Create,
 728                            },
 729                        ),
 730                    ],
 731                ),
 732            ],
 733            input_content,
 734            EvalAssertion::new(async move |sample, _, _cx| {
 735                let invalid_starts = [' ', '`', '\n'];
 736                let mut message = String::new();
 737                for start in invalid_starts {
 738                    if sample.text_after.starts_with(start) {
 739                        message.push_str(&format!("The sample starts with a {:?}\n", start));
 740                        break;
 741                    }
 742                }
 743                // Remove trailing newline.
 744                message.pop();
 745
 746                if message.is_empty() {
 747                    Ok(EvalAssertionOutcome {
 748                        score: 100,
 749                        message: None,
 750                    })
 751                } else {
 752                    Ok(EvalAssertionOutcome {
 753                        score: 0,
 754                        message: Some(message),
 755                    })
 756                }
 757            }),
 758        ),
 759    );
 760}
 761
// Eval: ask the agent to add a brand-new test (overwriting an existing file
// via `buffer_created`) to `action_log.rs`, mimicking the file's existing
// tests. Scored by an LLM judge on the produced diff.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  0.65 (2025-06-14)
    //  claude-sonnet-4                |  0.07 (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  0.35 (2025-05-22)
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    // 200 iterations; at least 50% must pass, and at most 5% of edit-parser
    // tags may be mismatched.
    eval(
        200,
        0.5, // TODO: make this eval better
        0.05,
        EvalInput::from_conversation(
            vec![
                // The user asks for the new test.
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                // Scripted assistant turn: read the whole file (outline view).
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                // Canned tool result: a symbol outline of `action_log.rs`,
                // giving the agent line ranges to drill into.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                             struct HunkStatus [L1538-1542]
                              range [L1539]
                              diff_status [L1540]
                              old_text [L1541]
                             fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                // Scripted drill-down: read `test_creating_files`.
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                // Scripted drill-down: read `test_deleting_files`.
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                // Scripted drill-down: read the `buffer_created` implementation.
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                // Final assistant turn: the `edit_file` call whose streamed
                // edits are what this eval actually measures.
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
 992
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  1.00 (2025-06-14)
    //  claude-sonnet-4                |  1.00 (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  1.00 (2025-05-21)
    //  gemini-2.5-flash-preview-04-17 |  1.00 (2025-05-21)
    //  gpt-4.1                        |  1.00 (2025-05-21)
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    // No pre-existing file content: the target is created from scratch, and
    // the resulting file must be completely empty.
    let input_file_content = None;
    let expected_output_content = String::new();
    // 100 iterations; 99% must pass, at most 5% mismatched edit-parser tags.
    eval(
        100,
        0.99,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                // Scripted assistant turn: inspect the directory first.
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                        I'll help you create a second empty todo file.
                        First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                // Canned directory listing showing existing TODO files.
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                // Final assistant turn: the `edit_file` creation under test.
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                        I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                    "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
1071
1072fn message(
1073    role: Role,
1074    contents: impl IntoIterator<Item = MessageContent>,
1075) -> LanguageModelRequestMessage {
1076    LanguageModelRequestMessage {
1077        role,
1078        content: contents.into_iter().collect(),
1079        cache: false,
1080    }
1081}
1082
1083fn text(text: impl Into<String>) -> MessageContent {
1084    MessageContent::Text(text.into())
1085}
1086
/// Returns the lines of `input` whose zero-based indices fall within `range`,
/// joined with `\n` (no trailing newline).
fn lines(input: &str, range: Range<usize>) -> String {
    let selected: Vec<&str> = input
        .lines()
        .enumerate()
        .filter(|(ix, _)| range.contains(ix))
        .map(|(_, line)| line)
        .collect();
    selected.join("\n")
}
1095
1096fn tool_use(
1097    id: impl Into<Arc<str>>,
1098    name: impl Into<Arc<str>>,
1099    input: impl Serialize,
1100) -> MessageContent {
1101    MessageContent::ToolUse(LanguageModelToolUse {
1102        id: LanguageModelToolUseId::from(id.into()),
1103        name: name.into(),
1104        raw_input: serde_json::to_string_pretty(&input).unwrap(),
1105        input: serde_json::to_value(input).unwrap(),
1106        is_input_complete: true,
1107    })
1108}
1109
1110fn tool_result(
1111    id: impl Into<Arc<str>>,
1112    name: impl Into<Arc<str>>,
1113    result: impl Into<Arc<str>>,
1114) -> MessageContent {
1115    MessageContent::ToolResult(LanguageModelToolResult {
1116        tool_use_id: LanguageModelToolUseId::from(id.into()),
1117        tool_name: name.into(),
1118        is_error: false,
1119        content: LanguageModelToolResultContent::Text(result.into()),
1120        output: None,
1121    })
1122}
1123
/// A single eval scenario: a recorded conversation ending in an `edit_file`
/// tool use, the target file's initial content, and the assertion used to
/// score the agent's edit.
#[derive(Clone)]
struct EvalInput {
    // Full request conversation; the last message is the assistant's
    // `edit_file` call.
    conversation: Vec<LanguageModelRequestMessage>,
    // Parsed input of that final `edit_file` tool use.
    edit_file_input: EditFileToolInput,
    // Initial content of the target file; `None` when creating a new file.
    input_content: Option<String>,
    // Scores the resulting buffer text (0-100).
    assertion: EvalAssertion,
}
1131
1132impl EvalInput {
1133    fn from_conversation(
1134        conversation: Vec<LanguageModelRequestMessage>,
1135        input_content: Option<String>,
1136        assertion: EvalAssertion,
1137    ) -> Self {
1138        let msg = conversation.last().expect("Conversation must not be empty");
1139        if msg.role != Role::Assistant {
1140            panic!("Conversation must end with an assistant message");
1141        }
1142        let tool_use = msg
1143            .content
1144            .iter()
1145            .flat_map(|content| match content {
1146                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1147                    Some(tool_use)
1148                }
1149                _ => None,
1150            })
1151            .next()
1152            .expect("Conversation must end with an edit_file tool use")
1153            .clone();
1154
1155        let edit_file_input: EditFileToolInput =
1156            serde_json::from_value(tool_use.input.clone()).unwrap();
1157
1158        EvalInput {
1159            conversation,
1160            edit_file_input,
1161            input_content,
1162            assertion,
1163        }
1164    }
1165}
1166
/// The result of running the edit agent once on an `EvalInput`.
#[derive(Clone)]
struct EvalSample {
    // Buffer text before the agent's edit.
    text_before: String,
    // Buffer text after the agent's edit.
    text_after: String,
    // The agent's raw output, including parser metrics and raw edits.
    edit_output: EditAgentOutput,
    // Diff between `text_before` and `text_after`.
    diff: String,
}
1174
/// Object-safe wrapper around an async assertion function, so assertions can
/// be stored behind `Arc<dyn AssertionFn>` inside `EvalAssertion`.
trait AssertionFn: 'static + Send + Sync {
    /// Scores `sample`, optionally consulting `judge_model` for LLM judging.
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1183
/// Blanket implementation: any suitable async closure is an `AssertionFn`,
/// boxed locally so it can be returned as a trait-object future.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1204
/// A cloneable, type-erased assertion that scores an `EvalSample` (0-100).
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1207
impl EvalAssertion {
    /// Wraps an arbitrary async assertion closure.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Scores 100 iff the resulting text equals `expected`, comparing with
    /// empty lines stripped; otherwise 0.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Scores 100 iff applying any one of `expected_diffs` to the original
    /// text reproduces the resulting text (empty lines stripped); otherwise 0.
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// Asks the judge model to score the sample's diff against the given
    /// natural-language assertions; the judge must reply with a
    /// `<score>N</score>` tag that is parsed out of its streamed response.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                thinking_allowed: true,
                ..Default::default()
            };
            // Retry the whole completion if the provider rate-limits us.
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            // Accumulate the full streamed response before parsing.
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output) {
                if let Some(score_match) = captures.get(1) {
                    let score = score_match.as_str().parse().unwrap_or(0);
                    return Ok(EvalAssertionOutcome {
                        score,
                        message: Some(output),
                    });
                }
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Runs the wrapped assertion on `input`.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1308
/// Runs `eval` `iterations` times (up to 32 concurrently) and panics if the
/// pass ratio falls below `expected_pass_ratio` or the ratio of mismatched
/// edit-parser tags exceeds `mismatched_tag_threshold`. Failures below a
/// score of 80 count as failed iterations.
fn eval(
    iterations: usize,
    expected_pass_ratio: f32,
    mismatched_tag_threshold: f32,
    mut eval: EvalInput,
) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    // Workers send each iteration's result (or error) over this channel.
    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval so that
    // all the next ones are cached.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    // Spawn the remaining iterations, throttled by a 32-permit semaphore.
    let executor = gpui::background_executor();
    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        let semaphore = semaphore.clone();
        executor
            .spawn(async move {
                let _guard = semaphore.acquire().await;
                run_eval(eval, tx)
            })
            .detach();
    }
    // Drop our sender so `rx` disconnects once all workers finish.
    drop(tx);

    // Collect results: group low-scoring samples by their output text and
    // errors by their message, and accumulate parser metrics.
    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text_after.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        // Report most frequent errors first.
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        // Report the most common failure modes, one representative sample each.
        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    // Even when passing, too many mismatched parser tags indicates the model
    // is emitting malformed edits; fail loudly with full outputs.
    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > mismatched_tag_threshold {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}
1400
/// Runs one eval iteration on its own test app context and sends the outcome
/// (sample plus score, or an error) down `tx`.
fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
    // A freshly entropy-seeded dispatcher keeps iterations independent.
    let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
    let mut cx = TestAppContext::build(dispatcher, None);
    let output = cx.executor().block_test(async {
        let test = EditAgentTest::new(&mut cx).await;
        test.eval(eval, &mut cx).await
    });
    tx.send(output).unwrap();
}
1410
/// One finished eval iteration: the sample the agent produced together with
/// the assertion's score and optional message.
#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}
1416
1417impl Display for EvalOutput {
1418    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1419        writeln!(f, "Score: {:?}", self.assertion.score)?;
1420        if let Some(message) = self.assertion.message.as_ref() {
1421            writeln!(f, "Message: {}", message)?;
1422        }
1423
1424        writeln!(f, "Diff:\n{}", self.sample.diff)?;
1425
1426        writeln!(
1427            f,
1428            "Parser Metrics:\n{:#?}",
1429            self.sample.edit_output.parser_metrics
1430        )?;
1431        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1432        Ok(())
1433    }
1434}
1435
/// Overwrites the current terminal line with a progress summary of the form
/// `Evaluated X/Y (Z% passed)` and flushes stdout immediately.
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_count = evaluated_count - failed_count;
    // Guard against 0/0 before any iteration has completed.
    let passed_ratio = match evaluated_count {
        0 => 0.0,
        total => passed_count as f64 / total as f64,
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}
1451
/// Test harness for one eval run: the edit agent under test, the project it
/// edits, and the model used to judge the result.
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}
1457
1458impl EditAgentTest {
1459    async fn new(cx: &mut TestAppContext) -> Self {
1460        cx.executor().allow_parking();
1461
1462        let fs = FakeFs::new(cx.executor().clone());
1463        cx.update(|cx| {
1464            settings::init(cx);
1465            gpui_tokio::init(cx);
1466            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1467            cx.set_http_client(http_client);
1468
1469            client::init_settings(cx);
1470            let client = Client::production(cx);
1471            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1472
1473            settings::init(cx);
1474            Project::init_settings(cx);
1475            language::init(cx);
1476            language_model::init(client.clone(), cx);
1477            language_models::init(user_store.clone(), client.clone(), cx);
1478            crate::init(client.http_client(), cx);
1479        });
1480
1481        fs.insert_tree("/root", json!({})).await;
1482        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1483        let agent_model = SelectedModel::from_str(
1484            &std::env::var("ZED_AGENT_MODEL")
1485                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1486        )
1487        .unwrap();
1488        let judge_model = SelectedModel::from_str(
1489            &std::env::var("ZED_JUDGE_MODEL")
1490                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1491        )
1492        .unwrap();
1493        let (agent_model, judge_model) = cx
1494            .update(|cx| {
1495                cx.spawn(async move |cx| {
1496                    let agent_model = Self::load_model(&agent_model, cx).await;
1497                    let judge_model = Self::load_model(&judge_model, cx).await;
1498                    (agent_model.unwrap(), judge_model.unwrap())
1499                })
1500            })
1501            .await;
1502        let action_log = cx.new(|_| ActionLog::new(project.clone()));
1503
1504        let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();
1505
1506        Self {
1507            agent: EditAgent::new(
1508                agent_model,
1509                project.clone(),
1510                action_log,
1511                Templates::new(),
1512                edit_format,
1513            ),
1514            project,
1515            judge_model,
1516        }
1517    }
1518
1519    async fn load_model(
1520        selected_model: &SelectedModel,
1521        cx: &mut AsyncApp,
1522    ) -> Result<Arc<dyn LanguageModel>> {
1523        let (provider, model) = cx.update(|cx| {
1524            let models = LanguageModelRegistry::read_global(cx);
1525            let model = models
1526                .available_models(cx)
1527                .find(|model| {
1528                    model.provider_id() == selected_model.provider
1529                        && model.id() == selected_model.model
1530                })
1531                .expect("Model not found");
1532            let provider = models.provider(&model.provider_id()).unwrap();
1533            (provider, model)
1534        })?;
1535        cx.update(|cx| provider.authenticate(cx))?.await?;
1536        Ok(model)
1537    }
1538
1539    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
1540        let path = self
1541            .project
1542            .read_with(cx, |project, cx| {
1543                project.find_project_path(eval.edit_file_input.path, cx)
1544            })
1545            .unwrap();
1546        let buffer = self
1547            .project
1548            .update(cx, |project, cx| project.open_buffer(path, cx))
1549            .await
1550            .unwrap();
1551        let tools = cx.update(|cx| {
1552            ToolRegistry::default_global(cx)
1553                .tools()
1554                .into_iter()
1555                .filter_map(|tool| {
1556                    let input_schema = tool
1557                        .input_schema(self.agent.model.tool_input_format())
1558                        .ok()?;
1559                    Some(LanguageModelRequestTool {
1560                        name: tool.name(),
1561                        description: tool.description(),
1562                        input_schema,
1563                    })
1564                })
1565                .collect::<Vec<_>>()
1566        });
1567        let tool_names = tools
1568            .iter()
1569            .map(|tool| tool.name.clone())
1570            .collect::<Vec<_>>();
1571        let worktrees = vec![WorktreeContext {
1572            root_name: "root".to_string(),
1573            abs_path: Path::new("/path/to/root").into(),
1574            rules_file: None,
1575        }];
1576        let prompt_builder = PromptBuilder::new(None)?;
1577        let project_context = ProjectContext::new(worktrees, Vec::default());
1578        let system_prompt = prompt_builder.generate_assistant_system_prompt(
1579            &project_context,
1580            &ModelContext {
1581                available_tools: tool_names,
1582            },
1583        )?;
1584
1585        let has_system_prompt = eval
1586            .conversation
1587            .first()
1588            .map_or(false, |msg| msg.role == Role::System);
1589        let messages = if has_system_prompt {
1590            eval.conversation
1591        } else {
1592            [LanguageModelRequestMessage {
1593                role: Role::System,
1594                content: vec![MessageContent::Text(system_prompt)],
1595                cache: true,
1596            }]
1597            .into_iter()
1598            .chain(eval.conversation)
1599            .collect::<Vec<_>>()
1600        };
1601
1602        let conversation = LanguageModelRequest {
1603            messages,
1604            tools,
1605            thinking_allowed: true,
1606            ..Default::default()
1607        };
1608
1609        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1610            if let Some(input_content) = eval.input_content.as_deref() {
1611                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1612            }
1613            retry_on_rate_limit(async || {
1614                self.agent
1615                    .edit(
1616                        buffer.clone(),
1617                        eval.edit_file_input.display_description.clone(),
1618                        &conversation,
1619                        &mut cx.to_async(),
1620                    )
1621                    .0
1622                    .await
1623            })
1624            .await?
1625        } else {
1626            retry_on_rate_limit(async || {
1627                self.agent
1628                    .overwrite(
1629                        buffer.clone(),
1630                        eval.edit_file_input.display_description.clone(),
1631                        &conversation,
1632                        &mut cx.to_async(),
1633                    )
1634                    .0
1635                    .await
1636            })
1637            .await?
1638        };
1639
1640        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1641        let sample = EvalSample {
1642            edit_output,
1643            diff: language::unified_diff(
1644                eval.input_content.as_deref().unwrap_or_default(),
1645                &buffer_text,
1646            ),
1647            text_before: eval.input_content.unwrap_or_default(),
1648            text_after: buffer_text,
1649        };
1650        let assertion = eval
1651            .assertion
1652            .run(&sample, self.judge_model.clone(), cx)
1653            .await?;
1654
1655        Ok(EvalOutput { assertion, sample })
1656    }
1657}
1658
1659async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
1660    let mut attempt = 0;
1661    loop {
1662        attempt += 1;
1663        match request().await {
1664            Ok(result) => return Ok(result),
1665            Err(err) => match err.downcast::<LanguageModelCompletionError>() {
1666                Ok(err) => match &err {
1667                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
1668                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
1669                        let retry_after = retry_after.unwrap_or(Duration::from_secs(5));
1670                        // Wait for the duration supplied, with some jitter to avoid all requests being made at the same time.
1671                        let jitter = retry_after.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
1672                        eprintln!(
1673                            "Attempt #{attempt}: {err}. Retry after {retry_after:?} + jitter of {jitter:?}"
1674                        );
1675                        Timer::after(retry_after + jitter).await;
1676                        continue;
1677                    }
1678                    _ => return Err(err.into()),
1679                },
1680                Err(err) => return Err(err),
1681            },
1682        }
1683    }
1684}
1685
/// The judge's verdict for a single eval sample.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    // Numeric score assigned by the assertion. NOTE(review): the valid range
    // isn't visible in this chunk — confirm against the judge output parser.
    score: usize,
    // Optional explanatory message accompanying the score.
    message: Option<String>,
}
1691
/// Data serialized into the diff-judge prompt template, which asks a judge
/// model to evaluate a diff against a fixed set of assertions.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    // The unified diff presented to the judge.
    diff: String,
    // Static assertion text the judge checks the diff against.
    assertions: &'static str,
}
1697
impl Template for DiffJudgeTemplate {
    // Handlebars template file this struct is rendered into.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1701
/// Returns `text` with every blank (or whitespace-only) line removed,
/// joining the remaining lines with single `\n` separators.
fn strip_empty_lines(text: &str) -> String {
    let mut stripped = String::with_capacity(text.len());
    for line in text.lines() {
        if line.trim().is_empty() {
            continue;
        }
        if !stripped.is_empty() {
            stripped.push('\n');
        }
        stripped.push_str(line);
    }
    stripped
}