1use super::*;
2use crate::{
3 ReadFileToolInput,
4 edit_file_tool::{EditFileMode, EditFileToolInput},
5 grep_tool::GrepToolInput,
6 list_directory_tool::ListDirectoryToolInput,
7};
8use Role::*;
9use assistant_tool::ToolRegistry;
10use client::{Client, UserStore};
11use collections::HashMap;
12use fs::FakeFs;
13use futures::{FutureExt, future::LocalBoxFuture};
14use gpui::{AppContext, TestAppContext, Timer};
15use indoc::{formatdoc, indoc};
16use language_model::{
17 LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
18 LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
19};
20use project::Project;
21use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
22use rand::prelude::*;
23use reqwest_client::ReqwestClient;
24use serde_json::json;
25use std::{
26 cmp::Reverse,
27 fmt::{self, Display},
28 io::Write as _,
29 path::Path,
30 str::FromStr,
31 sync::mpsc,
32};
33use util::path;
34
/// Eval: extract a `handle_command_output` method out of `run_git_blame`,
/// exercising the agent's ability to produce multiple related edit hunks.
/// Passes when the result matches any of several acceptable diffs.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 0.99 (2025-06-14)
    // claude-sonnet-4             | 0.97 (2025-06-14)
    // gemini-2.5-pro-06-05        | 0.98 (2025-06-16)
    // gemini-2.5-flash            | 0.11 (2025-05-22)
    // gpt-4.1                     | 1.00 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // The extraction can be formatted in several equivalent ways, so the
    // assertion accepts any one of these diffs.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        0.05, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
111
/// Eval: delete the `run_git_blame` function (and only that function),
/// asserting the resulting file is exactly equal to the expected fixture.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 1.0  (2025-06-14)
    // claude-sonnet-4             | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05        | 1.0  (2025-06-16)
    // gemini-2.5-flash            |
    // gpt-4.1                     |
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        0.05, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}
173
/// Eval: translate every doc comment in the fixture to Italian, scored by an
/// LLM judge on the resulting diff rather than an exact-match comparison.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200,  // iterations
        1.,   // expected pass ratio
        0.05, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}
235
/// Eval: rewrite `compile_parser_to_wasm` to use wasi-sdk instead of
/// emscripten. The scripted conversation reads the relevant region of the
/// file in three chunked `read_file` calls before the final edit, and the
/// result is scored by an LLM judge on the diff.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.96 (2025-06-14)
    // claude-sonnet-4                | 0.11 (2025-06-14)
    // gemini-2.5-pro-preview-latest  | 0.99 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        0.05, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                // NOTE(review): the asset list below repeats the two linux
                // entries; presumably unintentional, but it is part of the
                // prompt the pass rates were measured against — confirm
                // before changing.
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}
360
/// Eval: comment out BlinkManager interactions. The scripted grep result
/// stitches together several regions of the fixture to simulate a large,
/// fragmented search result before the edit request arrives.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.99 (2025-06-14)
    // claude-sonnet-4                | 0.85 (2025-06-14)
    // gemini-2.5-pro-preview-latest  | 0.97 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        0.05, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        // Join disjoint slices of the fixture so the result
                        // looks like multiple separate grep matches.
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
        ),
    );
}
443
/// Eval: add a `from_pixels` constructor to `Canvas` plus tests for it, after
/// a scripted research phase (two failed greps for a tests module, then a
/// successful grep for `#[test]` across the crate).
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    // Model                        | Date       | Pass rate
    // =========================================================
    // claude-4.0-sonnet            | 2025-06-14 | 0.99
    // claude-3.7-sonnet            | 2025-06-14 | 0.88
    // gemini-2.5-pro-preview-06-05 | 2025-06-16 | 0.98
    // gpt-4.1                      |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing for now, let's see if we can bring this down.
        0.25, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}
652
/// Eval: create a new Python CLI script from scratch (Create mode). The
/// custom assertion only checks that the created file does not start with
/// whitespace, a backtick, or a newline — i.e. the agent didn't leak
/// markdown fences or leading chatter into the file.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 | 1.0 (2025-05-22)
    // gpt-4.1                        | 1.0 (2025-05-22)
    let input_file_path = "root/zode.py";
    // No pre-existing content: the file is being created.
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        50,   // iterations
        1.,   // expected pass ratio
        0.05, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                // The created file must not begin with any of these: they
                // indicate leaked markdown fences or stray leading chatter.
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}
758
/// Eval: add a new test for overwriting a file in `action_log.rs`. The first
/// `read_file` result simulates an outline view (symbols with line ranges),
/// which the scripted assistant then drills into with ranged reads before
/// making the edit. Scored by an LLM judge on the diff.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.65 (2025-06-14)
    // claude-sonnet-4                | 0.07 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 0.35 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200, // iterations
        0.5, // expected pass ratio — TODO: make this eval better
        0.05, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                             struct HunkStatus [L1538-1542]
                              range [L1539]
                              diff_status [L1540]
                              old_text [L1541]
                             fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
989
/// Eval: create an empty file without the agent writing its "thoughts" into
/// it. Passes only if the created file content is exactly empty.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.00 (2025-06-14)
    // claude-sonnet-4                | 1.00 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.00 (2025-05-21)
    // gemini-2.5-flash-preview-04-17 | 1.00 (2025-05-21)
    // gpt-4.1                        | 1.00 (2025-05-21)
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        100,  // iterations
        0.99, // expected pass ratio
        0.05, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
1068
1069fn message(
1070 role: Role,
1071 contents: impl IntoIterator<Item = MessageContent>,
1072) -> LanguageModelRequestMessage {
1073 LanguageModelRequestMessage {
1074 role,
1075 content: contents.into_iter().collect(),
1076 cache: false,
1077 }
1078}
1079
1080fn text(text: impl Into<String>) -> MessageContent {
1081 MessageContent::Text(text.into())
1082}
1083
/// Returns the zero-indexed lines of `input` in `range` (end-exclusive),
/// joined with newlines. Ranges extending past the end of the input are
/// silently truncated.
fn lines(input: &str, range: Range<usize>) -> String {
    let mut selected = Vec::new();
    for (index, line) in input.lines().enumerate() {
        if index >= range.end {
            break;
        }
        if index >= range.start {
            selected.push(line);
        }
    }
    selected.join("\n")
}
1092
1093fn tool_use(
1094 id: impl Into<Arc<str>>,
1095 name: impl Into<Arc<str>>,
1096 input: impl Serialize,
1097) -> MessageContent {
1098 MessageContent::ToolUse(LanguageModelToolUse {
1099 id: LanguageModelToolUseId::from(id.into()),
1100 name: name.into(),
1101 raw_input: serde_json::to_string_pretty(&input).unwrap(),
1102 input: serde_json::to_value(input).unwrap(),
1103 is_input_complete: true,
1104 })
1105}
1106
1107fn tool_result(
1108 id: impl Into<Arc<str>>,
1109 name: impl Into<Arc<str>>,
1110 result: impl Into<Arc<str>>,
1111) -> MessageContent {
1112 MessageContent::ToolResult(LanguageModelToolResult {
1113 tool_use_id: LanguageModelToolUseId::from(id.into()),
1114 tool_name: name.into(),
1115 is_error: false,
1116 content: LanguageModelToolResultContent::Text(result.into()),
1117 output: None,
1118 })
1119}
1120
/// A single eval scenario: a scripted conversation ending in an `edit_file`
/// tool use, the content being edited, and the assertion used to score the
/// agent's output.
#[derive(Clone)]
struct EvalInput {
    // Full request history replayed to the model; must end with the
    // assistant's `edit_file` tool use.
    conversation: Vec<LanguageModelRequestMessage>,
    // Parsed input of that final `edit_file` tool use.
    edit_file_input: EditFileToolInput,
    // File content before editing; `None` when the file is being created.
    input_content: Option<String>,
    // How the resulting edit is scored.
    assertion: EvalAssertion,
}
1128
1129impl EvalInput {
1130 fn from_conversation(
1131 conversation: Vec<LanguageModelRequestMessage>,
1132 input_content: Option<String>,
1133 assertion: EvalAssertion,
1134 ) -> Self {
1135 let msg = conversation.last().expect("Conversation must not be empty");
1136 if msg.role != Role::Assistant {
1137 panic!("Conversation must end with an assistant message");
1138 }
1139 let tool_use = msg
1140 .content
1141 .iter()
1142 .flat_map(|content| match content {
1143 MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1144 Some(tool_use)
1145 }
1146 _ => None,
1147 })
1148 .next()
1149 .expect("Conversation must end with an edit_file tool use")
1150 .clone();
1151
1152 let edit_file_input: EditFileToolInput =
1153 serde_json::from_value(tool_use.input.clone()).unwrap();
1154
1155 EvalInput {
1156 conversation,
1157 edit_file_input,
1158 input_content,
1159 assertion,
1160 }
1161 }
1162}
1163
/// The outcome of one eval iteration: the file before and after the agent's
/// edit, plus the raw agent output and the resulting diff.
#[derive(Clone)]
struct EvalSample {
    // File content before the agent's edit.
    text_before: String,
    // File content after the agent's edit.
    text_after: String,
    // Raw output reported by the edit agent for this run.
    edit_output: EditAgentOutput,
    // Diff between `text_before` and `text_after`.
    diff: String,
}
1171
/// Object-safe wrapper around an async assertion function, so that
/// `EvalAssertion` can hold arbitrary async closures behind an `Arc<dyn _>`.
trait AssertionFn: 'static + Send + Sync {
    // Scores `sample`, optionally consulting `judge_model` (e.g. for
    // LLM-judged diffs). Returns a boxed non-Send future borrowed from self.
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1180
// Blanket impl: any suitable async closure is an `AssertionFn`, so callers
// can pass plain closures to `EvalAssertion::new` without boilerplate.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1201
/// A cloneable, shareable assertion used to score an `EvalSample`.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1204
impl EvalAssertion {
    /// Wraps an async closure as an assertion.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Assertion that scores 100 iff the edited text equals `expected`,
    /// after stripping empty lines from both sides.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Assertion that scores 100 iff applying any one of `expected_diffs`
    /// to the original text yields the edited text (modulo empty lines).
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// Assertion that asks the judge model to score the sample's diff
    /// against a natural-language list of expectations. The judge's reply
    /// must contain a `<score>N</score>` tag.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                ..Default::default()
            };
            // Judges can be rate-limited under eval load; retry the whole
            // completion rather than failing the iteration.
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output) {
                if let Some(score_match) = captures.get(1) {
                    // Unparseable digits degrade to a score of 0 rather
                    // than an error.
                    let score = score_match.as_str().parse().unwrap_or(0);
                    return Ok(EvalAssertionOutcome {
                        score,
                        message: Some(output),
                    });
                }
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Runs this assertion against `input`, forwarding the judge model and
    /// test context to the wrapped closure.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1304
1305fn eval(
1306 iterations: usize,
1307 expected_pass_ratio: f32,
1308 mismatched_tag_threshold: f32,
1309 mut eval: EvalInput,
1310) {
1311 let mut evaluated_count = 0;
1312 let mut failed_count = 0;
1313 report_progress(evaluated_count, failed_count, iterations);
1314
1315 let (tx, rx) = mpsc::channel();
1316
1317 // Cache the last message in the conversation, and run one instance of the eval so that
1318 // all the next ones are cached.
1319 eval.conversation.last_mut().unwrap().cache = true;
1320 run_eval(eval.clone(), tx.clone());
1321
1322 let executor = gpui::background_executor();
1323 let semaphore = Arc::new(smol::lock::Semaphore::new(32));
1324 for _ in 1..iterations {
1325 let eval = eval.clone();
1326 let tx = tx.clone();
1327 let semaphore = semaphore.clone();
1328 executor
1329 .spawn(async move {
1330 let _guard = semaphore.acquire().await;
1331 run_eval(eval, tx)
1332 })
1333 .detach();
1334 }
1335 drop(tx);
1336
1337 let mut failed_evals = HashMap::default();
1338 let mut errored_evals = HashMap::default();
1339 let mut eval_outputs = Vec::new();
1340 let mut cumulative_parser_metrics = EditParserMetrics::default();
1341 while let Ok(output) = rx.recv() {
1342 match output {
1343 Ok(output) => {
1344 cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
1345 eval_outputs.push(output.clone());
1346 if output.assertion.score < 80 {
1347 failed_count += 1;
1348 failed_evals
1349 .entry(output.sample.text_after.clone())
1350 .or_insert(Vec::new())
1351 .push(output);
1352 }
1353 }
1354 Err(error) => {
1355 failed_count += 1;
1356 *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
1357 }
1358 }
1359
1360 evaluated_count += 1;
1361 report_progress(evaluated_count, failed_count, iterations);
1362 }
1363
1364 let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
1365 println!("Actual pass ratio: {}\n", actual_pass_ratio);
1366 if actual_pass_ratio < expected_pass_ratio {
1367 let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
1368 errored_evals.sort_by_key(|(_, count)| Reverse(*count));
1369 for (error, count) in errored_evals {
1370 println!("Eval errored {} times. Error: {}", count, error);
1371 }
1372
1373 let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
1374 failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
1375 for (_buffer_output, failed_evals) in failed_evals {
1376 let eval_output = failed_evals.first().unwrap();
1377 println!("Eval failed {} times", failed_evals.len());
1378 println!("{}", eval_output);
1379 }
1380
1381 panic!(
1382 "Actual pass ratio: {}\nExpected pass ratio: {}",
1383 actual_pass_ratio, expected_pass_ratio
1384 );
1385 }
1386
1387 let mismatched_tag_ratio =
1388 cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
1389 if mismatched_tag_ratio > mismatched_tag_threshold {
1390 for eval_output in eval_outputs {
1391 println!("{}", eval_output);
1392 }
1393 panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
1394 }
1395}
1396
1397fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
1398 let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
1399 let mut cx = TestAppContext::build(dispatcher, None);
1400 let output = cx.executor().block_test(async {
1401 let test = EditAgentTest::new(&mut cx).await;
1402 test.eval(eval, &mut cx).await
1403 });
1404 tx.send(output).unwrap();
1405}
1406
/// One eval run's sample paired with the outcome of its assertion.
#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}
1412
1413impl Display for EvalOutput {
1414 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1415 writeln!(f, "Score: {:?}", self.assertion.score)?;
1416 if let Some(message) = self.assertion.message.as_ref() {
1417 writeln!(f, "Message: {}", message)?;
1418 }
1419
1420 writeln!(f, "Diff:\n{}", self.sample.diff)?;
1421
1422 writeln!(
1423 f,
1424 "Parser Metrics:\n{:#?}",
1425 self.sample.edit_output.parser_metrics
1426 )?;
1427 writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1428 Ok(())
1429 }
1430}
1431
/// Overwrites the current terminal line with a progress counter of the form
/// `Evaluated N/M (P% passed)`.
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    // Fraction of completed evals that passed; guard the 0/0 case before any
    // result has arrived.
    let passed_ratio = if evaluated_count == 0 {
        0.0
    } else {
        (evaluated_count - failed_count) as f64 / evaluated_count as f64
    };
    // "\r\x1b[K" returns to column zero and clears the line, so the counter
    // updates in place instead of scrolling.
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}
1447
/// Test harness bundling the edit agent under test, the fake project it
/// edits, and the model used to judge results.
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}
1453
1454impl EditAgentTest {
1455 async fn new(cx: &mut TestAppContext) -> Self {
1456 cx.executor().allow_parking();
1457
1458 let fs = FakeFs::new(cx.executor().clone());
1459 cx.update(|cx| {
1460 settings::init(cx);
1461 gpui_tokio::init(cx);
1462 let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1463 cx.set_http_client(http_client);
1464
1465 client::init_settings(cx);
1466 let client = Client::production(cx);
1467 let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1468
1469 settings::init(cx);
1470 Project::init_settings(cx);
1471 language::init(cx);
1472 language_model::init(client.clone(), cx);
1473 language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
1474 crate::init(client.http_client(), cx);
1475 });
1476
1477 fs.insert_tree("/root", json!({})).await;
1478 let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1479 let agent_model = SelectedModel::from_str(
1480 &std::env::var("ZED_AGENT_MODEL")
1481 .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1482 )
1483 .unwrap();
1484 let judge_model = SelectedModel::from_str(
1485 &std::env::var("ZED_JUDGE_MODEL")
1486 .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1487 )
1488 .unwrap();
1489 let (agent_model, judge_model) = cx
1490 .update(|cx| {
1491 cx.spawn(async move |cx| {
1492 let agent_model = Self::load_model(&agent_model, cx).await;
1493 let judge_model = Self::load_model(&judge_model, cx).await;
1494 (agent_model.unwrap(), judge_model.unwrap())
1495 })
1496 })
1497 .await;
1498 let action_log = cx.new(|_| ActionLog::new(project.clone()));
1499
1500 let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();
1501
1502 Self {
1503 agent: EditAgent::new(
1504 agent_model,
1505 project.clone(),
1506 action_log,
1507 Templates::new(),
1508 edit_format,
1509 ),
1510 project,
1511 judge_model,
1512 }
1513 }
1514
1515 async fn load_model(
1516 selected_model: &SelectedModel,
1517 cx: &mut AsyncApp,
1518 ) -> Result<Arc<dyn LanguageModel>> {
1519 let (provider, model) = cx.update(|cx| {
1520 let models = LanguageModelRegistry::read_global(cx);
1521 let model = models
1522 .available_models(cx)
1523 .find(|model| {
1524 model.provider_id() == selected_model.provider
1525 && model.id() == selected_model.model
1526 })
1527 .expect("Model not found");
1528 let provider = models.provider(&model.provider_id()).unwrap();
1529 (provider, model)
1530 })?;
1531 cx.update(|cx| provider.authenticate(cx))?.await?;
1532 Ok(model)
1533 }
1534
1535 async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
1536 let path = self
1537 .project
1538 .read_with(cx, |project, cx| {
1539 project.find_project_path(eval.edit_file_input.path, cx)
1540 })
1541 .unwrap();
1542 let buffer = self
1543 .project
1544 .update(cx, |project, cx| project.open_buffer(path, cx))
1545 .await
1546 .unwrap();
1547 let tools = cx.update(|cx| {
1548 ToolRegistry::default_global(cx)
1549 .tools()
1550 .into_iter()
1551 .filter_map(|tool| {
1552 let input_schema = tool
1553 .input_schema(self.agent.model.tool_input_format())
1554 .ok()?;
1555 Some(LanguageModelRequestTool {
1556 name: tool.name(),
1557 description: tool.description(),
1558 input_schema,
1559 })
1560 })
1561 .collect::<Vec<_>>()
1562 });
1563 let tool_names = tools
1564 .iter()
1565 .map(|tool| tool.name.clone())
1566 .collect::<Vec<_>>();
1567 let worktrees = vec![WorktreeContext {
1568 root_name: "root".to_string(),
1569 abs_path: Path::new("/path/to/root").into(),
1570 rules_file: None,
1571 }];
1572 let prompt_builder = PromptBuilder::new(None)?;
1573 let project_context = ProjectContext::new(worktrees, Vec::default());
1574 let system_prompt = prompt_builder.generate_assistant_system_prompt(
1575 &project_context,
1576 &ModelContext {
1577 available_tools: tool_names,
1578 },
1579 )?;
1580
1581 let has_system_prompt = eval
1582 .conversation
1583 .first()
1584 .map_or(false, |msg| msg.role == Role::System);
1585 let messages = if has_system_prompt {
1586 eval.conversation
1587 } else {
1588 [LanguageModelRequestMessage {
1589 role: Role::System,
1590 content: vec![MessageContent::Text(system_prompt)],
1591 cache: true,
1592 }]
1593 .into_iter()
1594 .chain(eval.conversation)
1595 .collect::<Vec<_>>()
1596 };
1597
1598 let conversation = LanguageModelRequest {
1599 messages,
1600 tools,
1601 ..Default::default()
1602 };
1603
1604 let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1605 if let Some(input_content) = eval.input_content.as_deref() {
1606 buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1607 }
1608 retry_on_rate_limit(async || {
1609 self.agent
1610 .edit(
1611 buffer.clone(),
1612 eval.edit_file_input.display_description.clone(),
1613 &conversation,
1614 &mut cx.to_async(),
1615 )
1616 .0
1617 .await
1618 })
1619 .await?
1620 } else {
1621 retry_on_rate_limit(async || {
1622 self.agent
1623 .overwrite(
1624 buffer.clone(),
1625 eval.edit_file_input.display_description.clone(),
1626 &conversation,
1627 &mut cx.to_async(),
1628 )
1629 .0
1630 .await
1631 })
1632 .await?
1633 };
1634
1635 let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1636 let sample = EvalSample {
1637 edit_output,
1638 diff: language::unified_diff(
1639 eval.input_content.as_deref().unwrap_or_default(),
1640 &buffer_text,
1641 ),
1642 text_before: eval.input_content.unwrap_or_default(),
1643 text_after: buffer_text,
1644 };
1645 let assertion = eval
1646 .assertion
1647 .run(&sample, self.judge_model.clone(), cx)
1648 .await?;
1649
1650 Ok(EvalOutput { assertion, sample })
1651 }
1652}
1653
1654async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
1655 let mut attempt = 0;
1656 loop {
1657 attempt += 1;
1658 match request().await {
1659 Ok(result) => return Ok(result),
1660 Err(err) => match err.downcast::<LanguageModelCompletionError>() {
1661 Ok(err) => match err {
1662 LanguageModelCompletionError::RateLimitExceeded { retry_after } => {
1663 // Wait for the duration supplied, with some jitter to avoid all requests being made at the same time.
1664 let jitter = retry_after.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
1665 eprintln!(
1666 "Attempt #{attempt}: Rate limit exceeded. Retry after {retry_after:?} + jitter of {jitter:?}"
1667 );
1668 Timer::after(retry_after + jitter).await;
1669 continue;
1670 }
1671 _ => return Err(err.into()),
1672 },
1673 Err(err) => return Err(err),
1674 },
1675 }
1676 }
1677}
1678
/// Result of running an assertion: a 0-100 score (scores below 80 are
/// treated as failures by `eval`) and an optional judge message.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}
1684
/// Template inputs for the LLM diff judge (rendered via `diff_judge.hbs`).
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    // Unified diff the judge is asked to grade.
    diff: String,
    // Free-form assertion text the diff is graded against.
    assertions: &'static str,
}
1690
impl Template for DiffJudgeTemplate {
    /// Handlebars template file used to render the judge prompt.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1694
/// Returns `text` with every blank (empty or whitespace-only) line removed,
/// joining the remaining lines with `\n` and no trailing newline.
fn strip_empty_lines(text: &str) -> String {
    let mut result = String::new();
    for line in text.lines() {
        if line.trim().is_empty() {
            continue;
        }
        if !result.is_empty() {
            result.push('\n');
        }
        result.push_str(line);
    }
    result
}