evals.rs

   1use super::*;
   2use crate::{
   3    ReadFileToolInput,
   4    edit_file_tool::{EditFileMode, EditFileToolInput},
   5    grep_tool::GrepToolInput,
   6    list_directory_tool::ListDirectoryToolInput,
   7};
   8use Role::*;
   9use assistant_tool::ToolRegistry;
  10use client::{Client, UserStore};
  11use collections::HashMap;
  12use fs::FakeFs;
  13use futures::{FutureExt, future::LocalBoxFuture};
  14use gpui::{AppContext, TestAppContext, Timer};
  15use indoc::{formatdoc, indoc};
  16use language_model::{
  17    LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
  18    LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
  19};
  20use project::Project;
  21use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
  22use rand::prelude::*;
  23use reqwest_client::ReqwestClient;
  24use serde_json::json;
  25use std::{
  26    cmp::Reverse,
  27    fmt::{self, Display},
  28    io::Write as _,
  29    path::Path,
  30    str::FromStr,
  31    sync::mpsc,
  32};
  33use util::path;
  34
  35#[test]
  36#[cfg_attr(not(feature = "eval"), ignore)]
  37fn eval_extract_handle_command_output() {
   38    // Test how well the agent generates multiple edit hunks.
  39    //
  40    // Model                       | Pass rate
  41    // ----------------------------|----------
  42    // claude-3.7-sonnet           |  0.99 (2025-06-14)
  43    // claude-sonnet-4             |  0.97 (2025-06-14)
  44    // gemini-2.5-pro-06-05        |  0.77 (2025-05-22)
  45    // gemini-2.5-flash            |  0.11 (2025-05-22)
  46    // gpt-4.1                     |  1.00 (2025-05-22)
  47
  48    let input_file_path = "root/blame.rs";
  49    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
  50    let possible_diffs = vec![
  51        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
  52        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
  53        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
  54        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
  55        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
  56        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
  57        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
  58    ];
  59    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
  60    eval(
  61        100,
  62        0.7, // Taking the lower bar for Gemini
  63        0.05,
  64        EvalInput::from_conversation(
  65            vec![
  66                message(
  67                    User,
  68                    [text(formatdoc! {"
  69                        Read the `{input_file_path}` file and extract a method in
  70                        the final stanza of `run_git_blame` to deal with command failures,
  71                        call it `handle_command_output` and take the std::process::Output as the only parameter.
  72                        Do not document the method and do not add any comments.
  73
  74                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
  75                    "})],
  76                ),
  77                message(
  78                    Assistant,
  79                    [tool_use(
  80                        "tool_1",
  81                        "read_file",
  82                        ReadFileToolInput {
  83                            path: input_file_path.into(),
  84                            start_line: None,
  85                            end_line: None,
  86                        },
  87                    )],
  88                ),
  89                message(
  90                    User,
  91                    [tool_result("tool_1", "read_file", input_file_content)],
  92                ),
  93                message(
  94                    Assistant,
  95                    [tool_use(
  96                        "tool_2",
  97                        "edit_file",
  98                        EditFileToolInput {
  99                            display_description: edit_description.into(),
 100                            path: input_file_path.into(),
 101                            mode: EditFileMode::Edit,
 102                        },
 103                    )],
 104                ),
 105            ],
 106            Some(input_file_content.into()),
 107            EvalAssertion::assert_diff_any(possible_diffs),
 108        ),
 109    );
 110}
 111
 112#[test]
 113#[cfg_attr(not(feature = "eval"), ignore)]
 114fn eval_delete_run_git_blame() {
 115    // Model                       | Pass rate
 116    // ----------------------------|----------
 117    // claude-3.7-sonnet           | 1.0  (2025-06-14)
 118    // claude-sonnet-4             | 0.96 (2025-06-14)
 119    // gemini-2.5-pro-06-05        |
 120    // gemini-2.5-flash            |
 121    // gpt-4.1                     |
 122    let input_file_path = "root/blame.rs";
 123    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
 124    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
 125    let edit_description = "Delete the `run_git_blame` function.";
 126    eval(
 127        100,
 128        0.95,
 129        0.05,
 130        EvalInput::from_conversation(
 131            vec![
 132                message(
 133                    User,
 134                    [text(formatdoc! {"
 135                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
 136                        one function, not its usages.
 137                    "})],
 138                ),
 139                message(
 140                    Assistant,
 141                    [tool_use(
 142                        "tool_1",
 143                        "read_file",
 144                        ReadFileToolInput {
 145                            path: input_file_path.into(),
 146                            start_line: None,
 147                            end_line: None,
 148                        },
 149                    )],
 150                ),
 151                message(
 152                    User,
 153                    [tool_result("tool_1", "read_file", input_file_content)],
 154                ),
 155                message(
 156                    Assistant,
 157                    [tool_use(
 158                        "tool_2",
 159                        "edit_file",
 160                        EditFileToolInput {
 161                            display_description: edit_description.into(),
 162                            path: input_file_path.into(),
 163                            mode: EditFileMode::Edit,
 164                        },
 165                    )],
 166                ),
 167            ],
 168            Some(input_file_content.into()),
 169            EvalAssertion::assert_eq(output_file_content),
 170        ),
 171    );
 172}
 173
 174#[test]
 175#[cfg_attr(not(feature = "eval"), ignore)]
 176fn eval_translate_doc_comments() {
 177    //  Model                          | Pass rate
 178    // ============================================
 179    //
 180    //  claude-3.7-sonnet              |  1.0  (2025-06-14)
 181    //  claude-sonnet-4                |  1.0  (2025-06-14)
 182    //  gemini-2.5-pro-preview-03-25   |  1.0  (2025-05-22)
 183    //  gemini-2.5-flash-preview-04-17 |
 184    //  gpt-4.1                        |
 185    let input_file_path = "root/canvas.rs";
 186    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
 187    let edit_description = "Translate all doc comments to Italian";
 188    eval(
 189        200,
 190        1.,
 191        0.05,
 192        EvalInput::from_conversation(
 193            vec![
 194                message(
 195                    User,
 196                    [text(formatdoc! {"
 197                        Read the {input_file_path} file and edit it (without overwriting it),
  198                        translating all the doc comments to Italian.
 199                    "})],
 200                ),
 201                message(
 202                    Assistant,
 203                    [tool_use(
 204                        "tool_1",
 205                        "read_file",
 206                        ReadFileToolInput {
 207                            path: input_file_path.into(),
 208                            start_line: None,
 209                            end_line: None,
 210                        },
 211                    )],
 212                ),
 213                message(
 214                    User,
 215                    [tool_result("tool_1", "read_file", input_file_content)],
 216                ),
 217                message(
 218                    Assistant,
 219                    [tool_use(
 220                        "tool_2",
 221                        "edit_file",
 222                        EditFileToolInput {
 223                            display_description: edit_description.into(),
 224                            path: input_file_path.into(),
 225                            mode: EditFileMode::Edit,
 226                        },
 227                    )],
 228                ),
 229            ],
 230            Some(input_file_content.into()),
 231            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
 232        ),
 233    );
 234}
 235
 236#[test]
 237#[cfg_attr(not(feature = "eval"), ignore)]
 238fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
 239    //  Model                          | Pass rate
 240    // ============================================
 241    //
 242    //  claude-3.7-sonnet              |  0.96 (2025-06-14)
 243    //  claude-sonnet-4                |  0.11 (2025-06-14)
 244    //  gemini-2.5-pro-preview-03-25   |  0.99 (2025-05-22)
 245    //  gemini-2.5-flash-preview-04-17 |
 246    //  gpt-4.1                        |
 247    let input_file_path = "root/lib.rs";
 248    let input_file_content =
 249        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
 250    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
 251    eval(
 252        100,
 253        0.95,
 254        0.05,
 255        EvalInput::from_conversation(
 256            vec![
 257                message(
 258                    User,
 259                    [text(formatdoc! {"
 260                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
 261                        Use `ureq` to download the SDK for the current platform and architecture.
 262                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
 263                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
 264                        that's inside of the archive.
 265                        Don't re-download the SDK if that executable already exists.
 266
 267                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}
 268
 269                        Here are the available wasi-sdk assets:
 270                        - wasi-sdk-25.0-x86_64-macos.tar.gz
 271                        - wasi-sdk-25.0-arm64-macos.tar.gz
 272                        - wasi-sdk-25.0-x86_64-linux.tar.gz
 273                        - wasi-sdk-25.0-arm64-linux.tar.gz
 274                        - wasi-sdk-25.0-x86_64-linux.tar.gz
 275                        - wasi-sdk-25.0-arm64-linux.tar.gz
 276                        - wasi-sdk-25.0-x86_64-windows.tar.gz
 277                    "})],
 278                ),
 279                message(
 280                    Assistant,
 281                    [tool_use(
 282                        "tool_1",
 283                        "read_file",
 284                        ReadFileToolInput {
 285                            path: input_file_path.into(),
 286                            start_line: Some(971),
 287                            end_line: Some(1050),
 288                        },
 289                    )],
 290                ),
 291                message(
 292                    User,
 293                    [tool_result(
 294                        "tool_1",
 295                        "read_file",
 296                        lines(input_file_content, 971..1050),
 297                    )],
 298                ),
 299                message(
 300                    Assistant,
 301                    [tool_use(
 302                        "tool_2",
 303                        "read_file",
 304                        ReadFileToolInput {
 305                            path: input_file_path.into(),
 306                            start_line: Some(1050),
 307                            end_line: Some(1100),
 308                        },
 309                    )],
 310                ),
 311                message(
 312                    User,
 313                    [tool_result(
 314                        "tool_2",
 315                        "read_file",
 316                        lines(input_file_content, 1050..1100),
 317                    )],
 318                ),
 319                message(
 320                    Assistant,
 321                    [tool_use(
 322                        "tool_3",
 323                        "read_file",
 324                        ReadFileToolInput {
 325                            path: input_file_path.into(),
 326                            start_line: Some(1100),
 327                            end_line: Some(1150),
 328                        },
 329                    )],
 330                ),
 331                message(
 332                    User,
 333                    [tool_result(
 334                        "tool_3",
 335                        "read_file",
 336                        lines(input_file_content, 1100..1150),
 337                    )],
 338                ),
 339                message(
 340                    Assistant,
 341                    [tool_use(
 342                        "tool_4",
 343                        "edit_file",
 344                        EditFileToolInput {
 345                            display_description: edit_description.into(),
 346                            path: input_file_path.into(),
 347                            mode: EditFileMode::Edit,
 348                        },
 349                    )],
 350                ),
 351            ],
 352            Some(input_file_content.into()),
 353            EvalAssertion::judge_diff(indoc! {"
 354                - The compile_parser_to_wasm method has been changed to use wasi-sdk
 355                - ureq is used to download the SDK for current platform and architecture
 356            "}),
 357        ),
 358    );
 359}
 360
 361#[test]
 362#[cfg_attr(not(feature = "eval"), ignore)]
 363fn eval_disable_cursor_blinking() {
 364    //  Model                          | Pass rate
 365    // ============================================
 366    //
 367    //  claude-3.7-sonnet              |  0.99 (2025-06-14)
 368    //  claude-sonnet-4                |  0.85 (2025-06-14)
 369    //  gemini-2.5-pro-preview-03-25   |  1.0  (2025-05-22)
 370    //  gemini-2.5-flash-preview-04-17 |
 371    //  gpt-4.1                        |
 372    let input_file_path = "root/editor.rs";
 373    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
 374    let edit_description = "Comment out the call to `BlinkManager::enable`";
 375    eval(
 376        100,
 377        0.95,
 378        0.05,
 379        EvalInput::from_conversation(
 380            vec![
  381                message(User, [text("Let's research how cursor blinking works.")]),
 382                message(
 383                    Assistant,
 384                    [tool_use(
 385                        "tool_1",
 386                        "grep",
 387                        GrepToolInput {
 388                            regex: "blink".into(),
 389                            include_pattern: None,
 390                            offset: 0,
 391                            case_sensitive: false,
 392                        },
 393                    )],
 394                ),
 395                message(
 396                    User,
 397                    [tool_result(
 398                        "tool_1",
 399                        "grep",
 400                        [
 401                            lines(input_file_content, 100..400),
 402                            lines(input_file_content, 800..1300),
 403                            lines(input_file_content, 1600..2000),
 404                            lines(input_file_content, 5000..5500),
 405                            lines(input_file_content, 8000..9000),
 406                            lines(input_file_content, 18455..18470),
 407                            lines(input_file_content, 20000..20500),
 408                            lines(input_file_content, 21000..21300),
 409                        ]
 410                        .join("Match found:\n\n"),
 411                    )],
 412                ),
 413                message(
 414                    User,
 415                    [text(indoc! {"
 416                        Comment out the lines that interact with the BlinkManager.
  417                        Keep the outer `update` blocks, but comment out everything that's inside (including if statements).
 418                        Don't add additional comments.
 419                    "})],
 420                ),
 421                message(
 422                    Assistant,
 423                    [tool_use(
 424                        "tool_4",
 425                        "edit_file",
 426                        EditFileToolInput {
 427                            display_description: edit_description.into(),
 428                            path: input_file_path.into(),
 429                            mode: EditFileMode::Edit,
 430                        },
 431                    )],
 432                ),
 433            ],
 434            Some(input_file_content.into()),
 435            EvalAssertion::judge_diff(indoc! {"
 436                - Calls to BlinkManager in `observe_window_activation` were commented out
 437                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
 438                - All the edits have valid indentation
 439            "}),
 440        ),
 441    );
 442}
 443
 444#[test]
 445#[cfg_attr(not(feature = "eval"), ignore)]
 446fn eval_from_pixels_constructor() {
 447    // Results for 2025-06-13
 448    //
 449    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
 450    // value. Higher values improve the pass rate but may sometimes cause
 451    // edits to be misapplied. In the context of this eval, this means
 452    // the agent might add from_pixels tests in incorrect locations
 453    // (e.g., at the beginning of the file), yet the evaluation may still
 454    // rate it highly.
 455    //
 456    //  Model                          | Pass rate
 457    // ============================================
 458    //
 459    //  claude-4.0-sonnet              |  0.99
 460    //  claude-3.7-sonnet              |  0.88
 461    //  gemini-2.5-pro-preview-03-25   |  0.96
 462    //  gpt-4.1                        |
 463    let input_file_path = "root/canvas.rs";
 464    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
 465    let edit_description = "Implement from_pixels constructor and add tests.";
 466    eval(
 467        100,
 468        0.95,
 469        // For whatever reason, this eval produces more mismatched tags.
  470        // Increasing the threshold for now; let's see if we can bring it down.
 471        0.25,
 472        EvalInput::from_conversation(
 473            vec![
 474                message(
 475                    User,
 476                    [text(indoc! {"
 477                        Introduce a new `from_pixels` constructor in Canvas and
 478                        also add tests for it in the same file.
 479                    "})],
 480                ),
 481                message(
 482                    Assistant,
 483                    [tool_use(
 484                        "tool_1",
 485                        "read_file",
 486                        ReadFileToolInput {
 487                            path: input_file_path.into(),
 488                            start_line: None,
 489                            end_line: None,
 490                        },
 491                    )],
 492                ),
 493                message(
 494                    User,
 495                    [tool_result("tool_1", "read_file", input_file_content)],
 496                ),
 497                message(
 498                    Assistant,
 499                    [tool_use(
 500                        "tool_2",
 501                        "grep",
 502                        GrepToolInput {
 503                            regex: "mod\\s+tests".into(),
 504                            include_pattern: Some("font-kit/src/canvas.rs".into()),
 505                            offset: 0,
 506                            case_sensitive: false,
 507                        },
 508                    )],
 509                ),
 510                message(User, [tool_result("tool_2", "grep", "No matches found")]),
 511                message(
 512                    Assistant,
 513                    [tool_use(
 514                        "tool_3",
 515                        "grep",
 516                        GrepToolInput {
 517                            regex: "mod\\s+tests".into(),
 518                            include_pattern: Some("font-kit/src/**/*.rs".into()),
 519                            offset: 0,
 520                            case_sensitive: false,
 521                        },
 522                    )],
 523                ),
 524                message(User, [tool_result("tool_3", "grep", "No matches found")]),
 525                message(
 526                    Assistant,
 527                    [tool_use(
 528                        "tool_4",
 529                        "grep",
 530                        GrepToolInput {
 531                            regex: "#\\[test\\]".into(),
 532                            include_pattern: Some("font-kit/src/**/*.rs".into()),
 533                            offset: 0,
 534                            case_sensitive: false,
 535                        },
 536                    )],
 537                ),
 538                message(
 539                    User,
 540                    [tool_result(
 541                        "tool_4",
 542                        "grep",
 543                        indoc! {"
 544                            Found 6 matches:
 545
 546                            ## Matches in font-kit/src/loaders/core_text.rs
 547
 548                            ### mod test › L926-936
 549                            ```
 550                            mod test {
 551                                use super::Font;
 552                                use crate::properties::{Stretch, Weight};
 553
 554                                #[cfg(feature = \"source\")]
 555                                use crate::source::SystemSource;
 556
 557                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";
 558
 559                                #[cfg(feature = \"source\")]
 560                                #[test]
 561                            ```
 562
 563                            55 lines remaining in ancestor node. Read the file to see all.
 564
 565                            ### mod test › L947-951
 566                            ```
 567                                }
 568
 569                                #[test]
 570                                fn test_core_text_to_css_font_weight() {
 571                                    // Exact matches
 572                            ```
 573
 574                            ### mod test › L959-963
 575                            ```
 576                                }
 577
 578                                #[test]
 579                                fn test_core_text_to_css_font_stretch() {
 580                                    // Exact matches
 581                            ```
 582
 583                            ## Matches in font-kit/src/loaders/freetype.rs
 584
 585                            ### mod test › L1238-1248
 586                            ```
 587                            mod test {
 588                                use crate::loaders::freetype::Font;
 589
 590                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
 591                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";
 592
 593                                #[test]
 594                                fn get_pcf_postscript_name() {
 595                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
 596                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
 597                                }
 598                            ```
 599
 600                            1 lines remaining in ancestor node. Read the file to see all.
 601
 602                            ## Matches in font-kit/src/sources/core_text.rs
 603
 604                            ### mod test › L265-275
 605                            ```
 606                            mod test {
 607                                use crate::properties::{Stretch, Weight};
 608
 609                                #[test]
 610                                fn test_css_to_core_text_font_weight() {
 611                                    // Exact matches
 612                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
 613                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
 614                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
 615                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);
 616
 617                            ```
 618
 619                            27 lines remaining in ancestor node. Read the file to see all.
 620
 621                            ### mod test › L278-282
 622                            ```
 623                                }
 624
 625                                #[test]
 626                                fn test_css_to_core_text_font_stretch() {
 627                                    // Exact matches
 628                            ```
 629                        "},
 630                    )],
 631                ),
 632                message(
 633                    Assistant,
 634                    [tool_use(
 635                        "tool_5",
 636                        "edit_file",
 637                        EditFileToolInput {
 638                            display_description: edit_description.into(),
 639                            path: input_file_path.into(),
 640                            mode: EditFileMode::Edit,
 641                        },
 642                    )],
 643                ),
 644            ],
 645            Some(input_file_content.into()),
 646            EvalAssertion::judge_diff(indoc! {"
  647                - The diff contains a new `from_pixels` constructor
  648                - The diff contains new tests for the `from_pixels` constructor
  649            "}),
 650        ),
 651    );
 652}
 653
 654#[test]
 655#[cfg_attr(not(feature = "eval"), ignore)]
 656fn eval_zode() {
 657    //  Model                          | Pass rate
 658    // ============================================
 659    //
 660    //  claude-3.7-sonnet              |  1.0 (2025-06-14)
 661    //  claude-sonnet-4                |  1.0 (2025-06-14)
 662    //  gemini-2.5-pro-preview-03-25   |  1.0 (2025-05-22)
 663    //  gemini-2.5-flash-preview-04-17 |  1.0 (2025-05-22)
 664    //  gpt-4.1                        |  1.0 (2025-05-22)
 665    let input_file_path = "root/zode.py";
 666    let input_content = None;
 667    let edit_description = "Create the main Zode CLI script";
 668    eval(
 669        50,
 670        1.,
 671        0.05,
 672        EvalInput::from_conversation(
 673            vec![
 674                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
 675                message(
 676                    Assistant,
 677                    [
 678                        tool_use(
 679                            "tool_1",
 680                            "read_file",
 681                            ReadFileToolInput {
 682                                path: "root/eval/react.py".into(),
 683                                start_line: None,
 684                                end_line: None,
 685                            },
 686                        ),
 687                        tool_use(
 688                            "tool_2",
 689                            "read_file",
 690                            ReadFileToolInput {
 691                                path: "root/eval/react_test.py".into(),
 692                                start_line: None,
 693                                end_line: None,
 694                            },
 695                        ),
 696                    ],
 697                ),
 698                message(
 699                    User,
 700                    [
 701                        tool_result(
 702                            "tool_1",
 703                            "read_file",
 704                            include_str!("evals/fixtures/zode/react.py"),
 705                        ),
 706                        tool_result(
 707                            "tool_2",
 708                            "read_file",
 709                            include_str!("evals/fixtures/zode/react_test.py"),
 710                        ),
 711                    ],
 712                ),
 713                message(
 714                    Assistant,
 715                    [
 716                        text(
 717                            "Now that I understand what we need to build, I'll create the main Python script:",
 718                        ),
 719                        tool_use(
 720                            "tool_3",
 721                            "edit_file",
 722                            EditFileToolInput {
 723                                display_description: edit_description.into(),
 724                                path: input_file_path.into(),
 725                                mode: EditFileMode::Create,
 726                            },
 727                        ),
 728                    ],
 729                ),
 730            ],
 731            input_content,
 732            EvalAssertion::new(async move |sample, _, _cx| {
 733                let invalid_starts = [' ', '`', '\n'];
 734                let mut message = String::new();
 735                for start in invalid_starts {
 736                    if sample.text_after.starts_with(start) {
 737                        message.push_str(&format!("The sample starts with a {:?}\n", start));
 738                        break;
 739                    }
 740                }
 741                // Remove trailing newline.
 742                message.pop();
 743
 744                if message.is_empty() {
 745                    Ok(EvalAssertionOutcome {
 746                        score: 100,
 747                        message: None,
 748                    })
 749                } else {
 750                    Ok(EvalAssertionOutcome {
 751                        score: 0,
 752                        message: Some(message),
 753                    })
 754                }
 755            }),
 756        ),
 757    );
 758}
 759
 760#[test]
 761#[cfg_attr(not(feature = "eval"), ignore)]
 762fn eval_add_overwrite_test() {
 763    //  Model                          | Pass rate
 764    // ============================================
 765    //
 766    //  claude-3.7-sonnet              |  0.65 (2025-06-14)
 767    //  claude-sonnet-4                |  0.07 (2025-06-14)
 768    //  gemini-2.5-pro-preview-03-25   |  0.35 (2025-05-22)
 769    //  gemini-2.5-flash-preview-04-17 |
 770    //  gpt-4.1                        |
 771    let input_file_path = "root/action_log.rs";
 772    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
 773    let edit_description = "Add a new test for overwriting a file in action_log.rs";
 774    eval(
 775        200,
 776        0.5, // TODO: make this eval better
 777        0.05,
 778        EvalInput::from_conversation(
 779            vec![
 780                message(
 781                    User,
 782                    [text(indoc! {"
 783                        Introduce a new test in `action_log.rs` to test overwriting a file.
 784                        That is, a file already exists, but we call `buffer_created` as if the file were new.
 785                        Take inspiration from all the other tests in the file.
 786                    "})],
 787                ),
 788                message(
 789                    Assistant,
 790                    [tool_use(
 791                        "tool_1",
 792                        "read_file",
 793                        ReadFileToolInput {
 794                            path: input_file_path.into(),
 795                            start_line: None,
 796                            end_line: None,
 797                        },
 798                    )],
 799                ),
 800                message(
 801                    User,
 802                    [tool_result(
 803                        "tool_1",
 804                        "read_file",
 805                        indoc! {"
 806                            pub struct ActionLog [L13-20]
 807                             tracked_buffers [L15]
 808                             edited_since_project_diagnostics_check [L17]
 809                             project [L19]
 810                            impl ActionLog [L22-498]
 811                             pub fn new [L24-30]
 812                             pub fn project [L32-34]
 813                             pub fn checked_project_diagnostics [L37-39]
 814                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
 815                             fn track_buffer_internal [L46-101]
 816                             fn handle_buffer_event [L103-116]
 817                             fn handle_buffer_edited [L118-123]
 818                             fn handle_buffer_file_changed [L125-158]
 819                             async fn maintain_diff [L160-264]
 820                             pub fn buffer_read [L267-269]
 821                             pub fn buffer_created [L272-276]
 822                             pub fn buffer_edited [L279-287]
 823                             pub fn will_delete_buffer [L289-304]
 824                             pub fn keep_edits_in_range [L306-364]
 825                             pub fn reject_edits_in_ranges [L366-459]
 826                             pub fn keep_all_edits [L461-473]
 827                             pub fn changed_buffers [L476-482]
 828                             pub fn stale_buffers [L485-497]
 829                            fn apply_non_conflicting_edits [L500-561]
 830                            fn diff_snapshots [L563-585]
 831                            fn point_to_row_edit [L587-614]
 832                            enum ChangeAuthor [L617-620]
 833                             User [L618]
 834                             Agent [L619]
 835                            enum TrackedBufferStatus [L623-627]
 836                             Created [L624]
 837                             Modified [L625]
 838                             Deleted [L626]
 839                            struct TrackedBuffer [L629-641]
 840                             buffer [L630]
 841                             base_text [L631]
 842                             unreviewed_changes [L632]
 843                             status [L633]
 844                             version [L634]
 845                             diff [L635]
 846                             snapshot [L636]
 847                             diff_update [L637]
 848                             _open_lsp_handle [L638]
 849                             _maintain_diff [L639]
 850                             _subscription [L640]
 851                            impl TrackedBuffer [L643-657]
 852                             fn has_changes [L644-650]
 853                             fn schedule_diff_update [L652-656]
 854                            pub struct ChangedBuffer [L659-661]
 855                             pub diff [L660]
 856                            mod tests [L664-1574]
 857                             fn init_logger [L678-682]
 858                             fn init_test [L684-691]
 859                             async fn test_keep_edits [L694-769]
 860                             async fn test_deletions [L772-854]
 861                             async fn test_overlapping_user_edits [L857-951]
 862                             async fn test_creating_files [L954-1010]
 863                             async fn test_deleting_files [L1013-1120]
 864                             async fn test_reject_edits [L1123-1255]
 865                             async fn test_reject_multiple_edits [L1258-1331]
 866                             async fn test_reject_deleted_file [L1334-1388]
 867                             async fn test_reject_created_file [L1391-1443]
 868                             async fn test_random_diffs [L1446-1535]
 869                              fn quiesce [L1510-1534]
 870                             struct HunkStatus [L1538-1542]
 871                              range [L1539]
 872                              diff_status [L1540]
 873                              old_text [L1541]
 874                             fn unreviewed_hunks [L1544-1573]
 875
 876                            Showing symbols 1-69 (total symbols: 69)
 877
 878                            Using the line numbers in this outline, you can call this tool again while specifying
 879                            the start_line and end_line fields to see the implementations of symbols in the outline.
 880                        "},
 881                    )],
 882                ),
 883                message(
 884                    Assistant,
 885                    [
 886                        text(
 887                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
 888                        ),
 889                        tool_use(
 890                            "tool_2",
 891                            "read_file",
 892                            ReadFileToolInput {
 893                                path: input_file_path.into(),
 894                                start_line: Some(953),
 895                                end_line: Some(1010),
 896                            },
 897                        ),
 898                    ],
 899                ),
 900                message(
 901                    User,
 902                    [tool_result(
 903                        "tool_2",
 904                        "read_file",
 905                        lines(input_file_content, 953..1010),
 906                    )],
 907                ),
 908                message(
 909                    Assistant,
 910                    [
 911                        text(
 912                            "Now let's look at another relevant test to understand how they're structured:",
 913                        ),
 914                        tool_use(
 915                            "tool_3",
 916                            "read_file",
 917                            ReadFileToolInput {
 918                                path: input_file_path.into(),
 919                                start_line: Some(1012),
 920                                end_line: Some(1120),
 921                            },
 922                        ),
 923                    ],
 924                ),
 925                message(
 926                    User,
 927                    [tool_result(
 928                        "tool_3",
 929                        "read_file",
 930                        lines(input_file_content, 1012..1120),
 931                    )],
 932                ),
 933                message(
 934                    Assistant,
 935                    [
 936                        text("Now let's look at how `buffer_created` is implemented:"),
 937                        tool_use(
 938                            "tool_4",
 939                            "read_file",
 940                            ReadFileToolInput {
 941                                path: input_file_path.into(),
 942                                start_line: Some(271),
 943                                end_line: Some(276),
 944                            },
 945                        ),
 946                    ],
 947                ),
 948                message(
 949                    User,
 950                    [tool_result(
 951                        "tool_4",
 952                        "read_file",
 953                        lines(input_file_content, 271..276),
 954                    )],
 955                ),
 956                message(
 957                    Assistant,
 958                    [
 959                        text(indoc! {"
 960                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:
 961
 962                            1. Create a file
 963                            2. Open a buffer for it
 964                            3. Track it in the action log
 965                            4. Modify the buffer
 966                            5. Call `buffer_created` again as if the file were new
 967                            6. Check that the changes are properly tracked
 968
 969                            Let's write the test:
 970                        "}),
 971                        tool_use(
 972                            "tool_5",
 973                            "edit_file",
 974                            EditFileToolInput {
 975                                display_description: edit_description.into(),
 976                                path: input_file_path.into(),
 977                                mode: EditFileMode::Edit,
 978                            },
 979                        ),
 980                    ],
 981                ),
 982            ],
 983            Some(input_file_content.into()),
 984            EvalAssertion::judge_diff(
 985                "A new test for overwritten files was created, without changing any previous test",
 986            ),
 987        ),
 988    );
 989}
 990
 991#[test]
 992#[cfg_attr(not(feature = "eval"), ignore)]
 993fn eval_create_empty_file() {
  994    // Check that the Edit Agent can create a file without writing its
  995    // thoughts into it. The issue is not specific to empty files, but
 996    // it's easier to reproduce with them.
 997    //
 998    //  Model                          | Pass rate
 999    // ============================================
1000    //
1001    //  claude-3.7-sonnet              |  1.00 (2025-06-14)
1002    //  claude-sonnet-4                |  1.00 (2025-06-14)
1003    //  gemini-2.5-pro-preview-03-25   |  1.00 (2025-05-21)
1004    //  gemini-2.5-flash-preview-04-17 |  1.00 (2025-05-21)
1005    //  gpt-4.1                        |  1.00 (2025-05-21)
1006    //
1007    //
1008    // TODO: gpt-4.1-mini errored 38 times:
1009    // "data did not match any variant of untagged enum ResponseStreamResult"
1010    //
1011    let input_file_content = None;
1012    let expected_output_content = String::new();
1013    eval(
1014        100,
1015        0.99,
1016        0.05,
1017        EvalInput::from_conversation(
1018            vec![
1019                message(User, [text("Create a second empty todo file ")]),
1020                message(
1021                    Assistant,
1022                    [
1023                        text(formatdoc! {"
1024                        I'll help you create a second empty todo file.
1025                        First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
1026                        "}),
1027                        tool_use(
1028                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
1029                            "list_directory",
1030                            ListDirectoryToolInput {
1031                                path: "root".to_string(),
1032                            },
1033                        ),
1034                    ],
1035                ),
1036                message(
1037                    User,
1038                    [tool_result(
1039                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
1040                        "list_directory",
1041                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
1042                    )],
1043                ),
1044                message(
1045                    Assistant,
1046                    [
1047                        text(formatdoc! {"
1048                        I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
1049                    "}),
1050                        tool_use(
1051                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
1052                            "edit_file",
1053                            EditFileToolInput {
1054                                display_description: "Create empty TODO3 file".to_string(),
1055                                mode: EditFileMode::Create,
1056                                path: "root/TODO3".into(),
1057                            },
1058                        ),
1059                    ],
1060                ),
1061            ],
1062            input_file_content,
1063            // Bad behavior is to write something like
1064            // "I'll create an empty TODO3 file as requested."
1065            EvalAssertion::assert_eq(expected_output_content),
1066        ),
1067    );
1068}
1069
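     // Builds a request message with the given role and content parts. Caching is off by default
     // and is enabled later for the final message of each eval conversation.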
1070fn message(
1071    role: Role,
1072    contents: impl IntoIterator<Item = MessageContent>,
1073) -> LanguageModelRequestMessage {
1074    LanguageModelRequestMessage {
1075        role,
1076        content: contents.into_iter().collect(),
1077        cache: false,
1078    }
1079}
1080
1081fn text(text: impl Into<String>) -> MessageContent {
1082    MessageContent::Text(text.into())
1083}
1084
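     // Returns the zero-based line range of `input`, joined with newlines; used to fake partial
     // file reads and grep excerpts in tool results.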
1085fn lines(input: &str, range: Range<usize>) -> String {
1086    input
1087        .lines()
1088        .skip(range.start)
1089        .take(range.len())
1090        .collect::<Vec<_>>()
1091        .join("\n")
1092}
1093
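     // Builds a tool-use content part, serializing `input` both as pretty-printed JSON (raw_input)
     // and as a structured JSON value.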
1094fn tool_use(
1095    id: impl Into<Arc<str>>,
1096    name: impl Into<Arc<str>>,
1097    input: impl Serialize,
1098) -> MessageContent {
1099    MessageContent::ToolUse(LanguageModelToolUse {
1100        id: LanguageModelToolUseId::from(id.into()),
1101        name: name.into(),
1102        raw_input: serde_json::to_string_pretty(&input).unwrap(),
1103        input: serde_json::to_value(input).unwrap(),
1104        is_input_complete: true,
1105    })
1106}
1107
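     // Builds a successful (non-error) tool result carrying plain-text content.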
1108fn tool_result(
1109    id: impl Into<Arc<str>>,
1110    name: impl Into<Arc<str>>,
1111    result: impl Into<Arc<str>>,
1112) -> MessageContent {
1113    MessageContent::ToolResult(LanguageModelToolResult {
1114        tool_use_id: LanguageModelToolUseId::from(id.into()),
1115        tool_name: name.into(),
1116        is_error: false,
1117        content: LanguageModelToolResultContent::Text(result.into()),
1118        output: None,
1119    })
1120}
1121
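     // A single eval case: a conversation that ends in an edit_file tool use, the parsed input of
     // that tool call, the initial file content (if any), and the assertion used to grade the edit.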
1122#[derive(Clone)]
1123struct EvalInput {
1124    conversation: Vec<LanguageModelRequestMessage>,
1125    edit_file_input: EditFileToolInput,
1126    input_content: Option<String>,
1127    assertion: EvalAssertion,
1128}
1129
1130impl EvalInput {
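         // Builds an EvalInput from a conversation that must end with an assistant message
         // containing an edit_file tool use; panics otherwise.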
1131    fn from_conversation(
1132        conversation: Vec<LanguageModelRequestMessage>,
1133        input_content: Option<String>,
1134        assertion: EvalAssertion,
1135    ) -> Self {
1136        let msg = conversation.last().expect("Conversation must not be empty");
1137        if msg.role != Role::Assistant {
1138            panic!("Conversation must end with an assistant message");
1139        }
1140        let tool_use = msg
1141            .content
1142            .iter()
1143            .flat_map(|content| match content {
1144                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1145                    Some(tool_use)
1146                }
1147                _ => None,
1148            })
1149            .next()
1150            .expect("Conversation must end with an edit_file tool use")
1151            .clone();
1152
1153        let edit_file_input: EditFileToolInput =
1154            serde_json::from_value(tool_use.input.clone()).unwrap();
1155
1156        EvalInput {
1157            conversation,
1158            edit_file_input,
1159            input_content,
1160            assertion,
1161        }
1162    }
1163}
1164
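     // The outcome of running the edit agent once: the buffer text before and after the edit,
     // the agent's output (including parser metrics and raw edits), and the diff between the two.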
1165#[derive(Clone)]
1166struct EvalSample {
1167    text_before: String,
1168    text_after: String,
1169    edit_output: EditAgentOutput,
1170    diff: String,
1171}
1172
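     // Object-safe wrapper that lets async assertion closures be stored as trait objects
     // inside EvalAssertion.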
1173trait AssertionFn: 'static + Send + Sync {
1174    fn assert<'a>(
1175        &'a self,
1176        sample: &'a EvalSample,
1177        judge_model: Arc<dyn LanguageModel>,
1178        cx: &'a mut TestAppContext,
1179    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
1180}
1181
1182impl<F> AssertionFn for F
1183where
1184    F: 'static
1185        + Send
1186        + Sync
1187        + AsyncFn(
1188            &EvalSample,
1189            Arc<dyn LanguageModel>,
1190            &mut TestAppContext,
1191        ) -> Result<EvalAssertionOutcome>,
1192{
1193    fn assert<'a>(
1194        &'a self,
1195        sample: &'a EvalSample,
1196        judge_model: Arc<dyn LanguageModel>,
1197        cx: &'a mut TestAppContext,
1198    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
1199        (self)(sample, judge_model, cx).boxed_local()
1200    }
1201}
1202
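     // A grading strategy for an eval sample. Wraps an async assertion that produces a score
     // between 0 and 100; `eval` below treats scores under 80 as failures.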
1203#[derive(Clone)]
1204struct EvalAssertion(Arc<dyn AssertionFn>);
1205
1206impl EvalAssertion {
1207    fn new<F>(f: F) -> Self
1208    where
1209        F: 'static
1210            + Send
1211            + Sync
1212            + AsyncFn(
1213                &EvalSample,
1214                Arc<dyn LanguageModel>,
1215                &mut TestAppContext,
1216            ) -> Result<EvalAssertionOutcome>,
1217    {
1218        EvalAssertion(Arc::new(f))
1219    }
1220
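         // Passes only if the edited text equals `expected`, ignoring empty lines.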
1221    fn assert_eq(expected: impl Into<String>) -> Self {
1222        let expected = expected.into();
1223        Self::new(async move |sample, _judge, _cx| {
1224            Ok(EvalAssertionOutcome {
1225                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
1226                    100
1227                } else {
1228                    0
1229                },
1230                message: None,
1231            })
1232        })
1233    }
1234
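         // Passes if applying any of the expected diffs to the original text reproduces the
         // edited text (again ignoring empty lines).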
1235    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
1236        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
1237        Self::new(async move |sample, _judge, _cx| {
1238            let matches = expected_diffs.iter().any(|possible_diff| {
1239                let expected =
1240                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
1241                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
1242            });
1243
1244            Ok(EvalAssertionOutcome {
1245                score: if matches { 100 } else { 0 },
1246                message: None,
1247            })
1248        })
1249    }
1250
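         // Asks the judge model to score the sample's diff against the given assertions,
         // expecting a <score>N</score> tag in its response.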
1251    fn judge_diff(assertions: &'static str) -> Self {
1252        Self::new(async move |sample, judge, cx| {
1253            let prompt = DiffJudgeTemplate {
1254                diff: sample.diff.clone(),
1255                assertions,
1256            }
1257            .render(&Templates::new())
1258            .unwrap();
1259
1260            let request = LanguageModelRequest {
1261                messages: vec![LanguageModelRequestMessage {
1262                    role: Role::User,
1263                    content: vec![prompt.into()],
1264                    cache: false,
1265                }],
1266                ..Default::default()
1267            };
1268            let mut response = retry_on_rate_limit(async || {
1269                Ok(judge
1270                    .stream_completion_text(request.clone(), &cx.to_async())
1271                    .await?)
1272            })
1273            .await?;
1274            let mut output = String::new();
1275            while let Some(chunk) = response.stream.next().await {
1276                let chunk = chunk?;
1277                output.push_str(&chunk);
1278            }
1279
1280            // Parse the score from the response
1281            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
1282            if let Some(captures) = re.captures(&output) {
1283                if let Some(score_match) = captures.get(1) {
1284                    let score = score_match.as_str().parse().unwrap_or(0);
1285                    return Ok(EvalAssertionOutcome {
1286                        score,
1287                        message: Some(output),
1288                    });
1289                }
1290            }
1291
1292            anyhow::bail!("No score found in response. Raw output: {output}");
1293        })
1294    }
1295
1296    async fn run(
1297        &self,
1298        input: &EvalSample,
1299        judge_model: Arc<dyn LanguageModel>,
1300        cx: &mut TestAppContext,
1301    ) -> Result<EvalAssertionOutcome> {
1302        self.0.assert(input, judge_model, cx).await
1303    }
1304}
1305
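     // Runs `iterations` samples of the eval concurrently and fails the test if the pass ratio
     // falls below `expected_pass_ratio`, or if the ratio of mismatched edit-parser tags exceeds
     // `mismatched_tag_threshold`.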
1306fn eval(
1307    iterations: usize,
1308    expected_pass_ratio: f32,
1309    mismatched_tag_threshold: f32,
1310    mut eval: EvalInput,
1311) {
1312    let mut evaluated_count = 0;
1313    let mut failed_count = 0;
1314    report_progress(evaluated_count, failed_count, iterations);
1315
1316    let (tx, rx) = mpsc::channel();
1317
 1318    // Cache the last message in the conversation and run one instance of the eval first,
 1319    // so that all subsequent runs hit the cache.
1320    eval.conversation.last_mut().unwrap().cache = true;
1321    run_eval(eval.clone(), tx.clone());
1322
1323    let executor = gpui::background_executor();
1324    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
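         // Run the remaining iterations concurrently, with the semaphore capping the number of
         // in-flight evals.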
1325    for _ in 1..iterations {
1326        let eval = eval.clone();
1327        let tx = tx.clone();
1328        let semaphore = semaphore.clone();
1329        executor
1330            .spawn(async move {
1331                let _guard = semaphore.acquire().await;
1332                run_eval(eval, tx)
1333            })
1334            .detach();
1335    }
1336    drop(tx);
1337
1338    let mut failed_evals = HashMap::default();
1339    let mut errored_evals = HashMap::default();
1340    let mut eval_outputs = Vec::new();
1341    let mut cumulative_parser_metrics = EditParserMetrics::default();
1342    while let Ok(output) = rx.recv() {
1343        match output {
1344            Ok(output) => {
1345                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
1346                eval_outputs.push(output.clone());
1347                if output.assertion.score < 80 {
1348                    failed_count += 1;
1349                    failed_evals
1350                        .entry(output.sample.text_after.clone())
1351                        .or_insert(Vec::new())
1352                        .push(output);
1353                }
1354            }
1355            Err(error) => {
1356                failed_count += 1;
1357                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
1358            }
1359        }
1360
1361        evaluated_count += 1;
1362        report_progress(evaluated_count, failed_count, iterations);
1363    }
1364
1365    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
1366    println!("Actual pass ratio: {}\n", actual_pass_ratio);
1367    if actual_pass_ratio < expected_pass_ratio {
1368        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
1369        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
1370        for (error, count) in errored_evals {
1371            println!("Eval errored {} times. Error: {}", count, error);
1372        }
1373
1374        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
1375        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
1376        for (_buffer_output, failed_evals) in failed_evals {
1377            let eval_output = failed_evals.first().unwrap();
1378            println!("Eval failed {} times", failed_evals.len());
1379            println!("{}", eval_output);
1380        }
1381
1382        panic!(
1383            "Actual pass ratio: {}\nExpected pass ratio: {}",
1384            actual_pass_ratio, expected_pass_ratio
1385        );
1386    }
1387
1388    let mismatched_tag_ratio =
1389        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
1390    if mismatched_tag_ratio > mismatched_tag_threshold {
1391        for eval_output in eval_outputs {
1392            println!("{}", eval_output);
1393        }
1394        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
1395    }
1396}
1397
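    /// Runs a single eval iteration on a fresh `TestAppContext` with a randomly seeded
    /// dispatcher and sends the result back over `tx`.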
1398fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
1399    let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
1400    let mut cx = TestAppContext::build(dispatcher, None);
1401    let output = cx.executor().block_test(async {
1402        let test = EditAgentTest::new(&mut cx).await;
1403        test.eval(eval, &mut cx).await
1404    });
1405    tx.send(output).unwrap();
1406}
1407
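    /// The sample produced by one eval iteration, together with the judge's verdict on it.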
1408#[derive(Clone)]
1409struct EvalOutput {
1410    sample: EvalSample,
1411    assertion: EvalAssertionOutcome,
1412}
1413
1414impl Display for EvalOutput {
1415    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1416        writeln!(f, "Score: {:?}", self.assertion.score)?;
1417        if let Some(message) = self.assertion.message.as_ref() {
1418            writeln!(f, "Message: {}", message)?;
1419        }
1420
1421        writeln!(f, "Diff:\n{}", self.sample.diff)?;
1422
1423        writeln!(
1424            f,
1425            "Parser Metrics:\n{:#?}",
1426            self.sample.edit_output.parser_metrics
1427        )?;
1428        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1429        Ok(())
1430    }
1431}
1432
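    /// Overwrites the current terminal line with the number of evals completed and the
    /// running pass percentage.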
1433fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
1434    let passed_count = evaluated_count - failed_count;
1435    let passed_ratio = if evaluated_count == 0 {
1436        0.0
1437    } else {
1438        passed_count as f64 / evaluated_count as f64
1439    };
1440    print!(
1441        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
1442        evaluated_count,
1443        iterations,
1444        passed_ratio * 100.0
1445    );
1446    std::io::stdout().flush().unwrap();
1447}
1448
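    /// Harness holding the agent under test, the project it edits, and the model used to
    /// judge its output.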
1449struct EditAgentTest {
1450    agent: EditAgent,
1451    project: Entity<Project>,
1452    judge_model: Arc<dyn LanguageModel>,
1453}
1454
1455impl EditAgentTest {
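        /// Builds the harness: initializes the test app context, creates a project rooted at
        /// `/root`, and loads the agent and judge models from `ZED_AGENT_MODEL` and
        /// `ZED_JUDGE_MODEL` (both defaulting to `anthropic/claude-3-7-sonnet-latest`).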
1456    async fn new(cx: &mut TestAppContext) -> Self {
1457        cx.executor().allow_parking();
1458
1459        let fs = FakeFs::new(cx.executor().clone());
1460        cx.update(|cx| {
1461            settings::init(cx);
1462            gpui_tokio::init(cx);
1463            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1464            cx.set_http_client(http_client);
1465
1466            client::init_settings(cx);
1467            let client = Client::production(cx);
1468            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1469
1471            Project::init_settings(cx);
1472            language::init(cx);
1473            language_model::init(client.clone(), cx);
1474            language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
1475            crate::init(client.http_client(), cx);
1476        });
1477
1478        fs.insert_tree("/root", json!({})).await;
1479        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1480        let agent_model = SelectedModel::from_str(
1481            &std::env::var("ZED_AGENT_MODEL")
1482                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1483        )
1484        .unwrap();
1485        let judge_model = SelectedModel::from_str(
1486            &std::env::var("ZED_JUDGE_MODEL")
1487                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1488        )
1489        .unwrap();
1490        let (agent_model, judge_model) = cx
1491            .update(|cx| {
1492                cx.spawn(async move |cx| {
1493                    let agent_model = Self::load_model(&agent_model, cx).await;
1494                    let judge_model = Self::load_model(&judge_model, cx).await;
1495                    (agent_model.unwrap(), judge_model.unwrap())
1496                })
1497            })
1498            .await;
1499        let action_log = cx.new(|_| ActionLog::new(project.clone()));
1500
1501        Self {
1502            agent: EditAgent::new(agent_model, project.clone(), action_log, Templates::new()),
1503            project,
1504            judge_model,
1505        }
1506    }
1507
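        /// Finds `selected_model` in the global `LanguageModelRegistry` and authenticates its
        /// provider before returning it.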
1508    async fn load_model(
1509        selected_model: &SelectedModel,
1510        cx: &mut AsyncApp,
1511    ) -> Result<Arc<dyn LanguageModel>> {
1512        let (provider, model) = cx.update(|cx| {
1513            let models = LanguageModelRegistry::read_global(cx);
1514            let model = models
1515                .available_models(cx)
1516                .find(|model| {
1517                    model.provider_id() == selected_model.provider
1518                        && model.id() == selected_model.model
1519                })
1520                .expect("Model not found");
1521            let provider = models.provider(&model.provider_id()).unwrap();
1522            (provider, model)
1523        })?;
1524        cx.update(|cx| provider.authenticate(cx))?.await?;
1525        Ok(model)
1526    }
1527
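        /// Runs one eval: opens the buffer referenced by the eval input, assembles the tool
        /// list and system prompt, invokes the agent's `edit` or `overwrite` path depending on
        /// the requested mode, and asks the judge to score the resulting diff.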
1528    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
1529        let path = self
1530            .project
1531            .read_with(cx, |project, cx| {
1532                project.find_project_path(eval.edit_file_input.path, cx)
1533            })
1534            .unwrap();
1535        let buffer = self
1536            .project
1537            .update(cx, |project, cx| project.open_buffer(path, cx))
1538            .await
1539            .unwrap();
1540        let tools = cx.update(|cx| {
1541            ToolRegistry::default_global(cx)
1542                .tools()
1543                .into_iter()
1544                .filter_map(|tool| {
1545                    let input_schema = tool
1546                        .input_schema(self.agent.model.tool_input_format())
1547                        .ok()?;
1548                    Some(LanguageModelRequestTool {
1549                        name: tool.name(),
1550                        description: tool.description(),
1551                        input_schema,
1552                    })
1553                })
1554                .collect::<Vec<_>>()
1555        });
1556        let tool_names = tools
1557            .iter()
1558            .map(|tool| tool.name.clone())
1559            .collect::<Vec<_>>();
1560        let worktrees = vec![WorktreeContext {
1561            root_name: "root".to_string(),
1562            abs_path: Path::new("/path/to/root").into(),
1563            rules_file: None,
1564        }];
1565        let prompt_builder = PromptBuilder::new(None)?;
1566        let project_context = ProjectContext::new(worktrees, Vec::default());
1567        let system_prompt = prompt_builder.generate_assistant_system_prompt(
1568            &project_context,
1569            &ModelContext {
1570                available_tools: tool_names,
1571            },
1572        )?;
1573
1574        let has_system_prompt = eval
1575            .conversation
1576            .first()
1577            .map_or(false, |msg| msg.role == Role::System);
1578        let messages = if has_system_prompt {
1579            eval.conversation
1580        } else {
1581            [LanguageModelRequestMessage {
1582                role: Role::System,
1583                content: vec![MessageContent::Text(system_prompt)],
1584                cache: true,
1585            }]
1586            .into_iter()
1587            .chain(eval.conversation)
1588            .collect::<Vec<_>>()
1589        };
1590
1591        let conversation = LanguageModelRequest {
1592            messages,
1593            tools,
1594            ..Default::default()
1595        };
1596
1597        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1598            if let Some(input_content) = eval.input_content.as_deref() {
1599                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1600            }
1601            retry_on_rate_limit(async || {
1602                self.agent
1603                    .edit(
1604                        buffer.clone(),
1605                        eval.edit_file_input.display_description.clone(),
1606                        &conversation,
1607                        &mut cx.to_async(),
1608                    )
1609                    .0
1610                    .await
1611            })
1612            .await?
1613        } else {
1614            retry_on_rate_limit(async || {
1615                self.agent
1616                    .overwrite(
1617                        buffer.clone(),
1618                        eval.edit_file_input.display_description.clone(),
1619                        &conversation,
1620                        &mut cx.to_async(),
1621                    )
1622                    .0
1623                    .await
1624            })
1625            .await?
1626        };
1627
1628        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1629        let sample = EvalSample {
1630            edit_output,
1631            diff: language::unified_diff(
1632                eval.input_content.as_deref().unwrap_or_default(),
1633                &buffer_text,
1634            ),
1635            text_before: eval.input_content.unwrap_or_default(),
1636            text_after: buffer_text,
1637        };
1638        let assertion = eval
1639            .assertion
1640            .run(&sample, self.judge_model.clone(), cx)
1641            .await?;
1642
1643        Ok(EvalOutput { assertion, sample })
1644    }
1645}
1646
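    /// Calls `request`, retrying whenever it fails with `LanguageModelCompletionError::RateLimit`;
    /// between attempts it sleeps for the provider-supplied duration plus random jitter, and any
    /// other error is propagated. A minimal call-site sketch (with a hypothetical `send_request`):
    ///
    /// ```ignore
    /// let response = retry_on_rate_limit(async || Ok(send_request().await?)).await?;
    /// ```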
1647async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
1648    let mut attempt = 0;
1649    loop {
1650        attempt += 1;
1651        match request().await {
1652            Ok(result) => return Ok(result),
1653            Err(err) => match err.downcast::<LanguageModelCompletionError>() {
1654                Ok(err) => match err {
1655                    LanguageModelCompletionError::RateLimit(duration) => {
1656                        // Wait for the supplied duration, plus jitter so that concurrent retries don't all fire at once.
1657                        let jitter = duration.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
1658                        eprintln!(
1659                            "Attempt #{attempt}: Rate limit exceeded. Retry after {duration:?} + jitter of {jitter:?}"
1660                        );
1661                        Timer::after(duration + jitter).await;
1662                        continue;
1663                    }
1664                    _ => return Err(err.into()),
1665                },
1666                Err(err) => return Err(err),
1667            },
1668        }
1669    }
1670}
1671
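    /// Outcome of a judged assertion: the parsed score and, when available, the raw judge output.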
1672#[derive(Clone, Debug, Eq, PartialEq, Hash)]
1673struct EvalAssertionOutcome {
1674    score: usize,
1675    message: Option<String>,
1676}
1677
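    /// Template context rendered into `diff_judge.hbs`: the unified diff under evaluation and
    /// the assertions the judge is asked to verify.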
1678#[derive(Serialize)]
1679pub struct DiffJudgeTemplate {
1680    diff: String,
1681    assertions: &'static str,
1682}
1683
1684impl Template for DiffJudgeTemplate {
1685    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
1686}
1687
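    /// Drops lines that are empty or whitespace-only, rejoining the rest with `\n`;
    /// e.g. `strip_empty_lines("a\n\n b\n")` yields `"a\n b"`.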
1688fn strip_empty_lines(text: &str) -> String {
1689    text.lines()
1690        .filter(|line| !line.trim().is_empty())
1691        .collect::<Vec<_>>()
1692        .join("\n")
1693}