evals.rs

   1use super::*;
   2use crate::{
   3    ReadFileToolInput,
   4    edit_file_tool::{EditFileMode, EditFileToolInput},
   5    grep_tool::GrepToolInput,
   6    list_directory_tool::ListDirectoryToolInput,
   7};
   8use Role::*;
   9use assistant_tool::ToolRegistry;
  10use client::{Client, UserStore};
  11use collections::HashMap;
  12use fs::FakeFs;
  13use futures::{FutureExt, future::LocalBoxFuture};
  14use gpui::{AppContext, TestAppContext, Timer};
  15use indoc::{formatdoc, indoc};
  16use language_model::{
  17    LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
  18    LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
  19};
  20use project::Project;
  21use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
  22use rand::prelude::*;
  23use reqwest_client::ReqwestClient;
  24use serde_json::json;
  25use std::{
  26    cmp::Reverse,
  27    fmt::{self, Display},
  28    io::Write as _,
  29    str::FromStr,
  30    sync::mpsc,
  31};
  32use util::path;
  33
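     // Each eval below replays a recorded conversation that ends in an `edit_file`
     // tool call, runs the edit agent for the requested number of iterations, and
     // fails if the pass ratio drops below the expected value or if too many edit
     // tags come back mismatched. The tests are ignored unless the `eval` feature
     // is enabled.
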
  34#[test]
  35#[cfg_attr(not(feature = "eval"), ignore)]
  36fn eval_extract_handle_command_output() {
   37    // Test how well the agent generates multiple edit hunks.
  38    //
  39    // Model                       | Pass rate
  40    // ----------------------------|----------
  41    // claude-3.7-sonnet           |  0.98
  42    // gemini-2.5-pro-06-05        |  0.77
  43    // gemini-2.5-flash            |  0.11
  44    // gpt-4.1                     |  1.00
  45
  46    let input_file_path = "root/blame.rs";
  47    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
  48    let possible_diffs = vec![
  49        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
  50        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
  51        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
  52        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
  53        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
  54        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
  55        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
  56    ];
  57    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
  58    eval(
  59        100,
  60        0.7, // Taking the lower bar for Gemini
  61        0.05,
  62        EvalInput::from_conversation(
  63            vec![
  64                message(
  65                    User,
  66                    [text(formatdoc! {"
  67                        Read the `{input_file_path}` file and extract a method in
  68                        the final stanza of `run_git_blame` to deal with command failures,
  69                        call it `handle_command_output` and take the std::process::Output as the only parameter.
  70                        Do not document the method and do not add any comments.
  71
  72                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
  73                    "})],
  74                ),
  75                message(
  76                    Assistant,
  77                    [tool_use(
  78                        "tool_1",
  79                        "read_file",
  80                        ReadFileToolInput {
  81                            path: input_file_path.into(),
  82                            start_line: None,
  83                            end_line: None,
  84                        },
  85                    )],
  86                ),
  87                message(
  88                    User,
  89                    [tool_result("tool_1", "read_file", input_file_content)],
  90                ),
  91                message(
  92                    Assistant,
  93                    [tool_use(
  94                        "tool_2",
  95                        "edit_file",
  96                        EditFileToolInput {
  97                            display_description: edit_description.into(),
  98                            path: input_file_path.into(),
  99                            mode: EditFileMode::Edit,
 100                        },
 101                    )],
 102                ),
 103            ],
 104            Some(input_file_content.into()),
 105            EvalAssertion::assert_diff_any(possible_diffs),
 106        ),
 107    );
 108}
 109
 110#[test]
 111#[cfg_attr(not(feature = "eval"), ignore)]
 112fn eval_delete_run_git_blame() {
 113    let input_file_path = "root/blame.rs";
 114    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
 115    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
 116    let edit_description = "Delete the `run_git_blame` function.";
 117    eval(
 118        100,
 119        0.95,
 120        0.05,
 121        EvalInput::from_conversation(
 122            vec![
 123                message(
 124                    User,
 125                    [text(formatdoc! {"
 126                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
 127                        one function, not its usages.
 128                    "})],
 129                ),
 130                message(
 131                    Assistant,
 132                    [tool_use(
 133                        "tool_1",
 134                        "read_file",
 135                        ReadFileToolInput {
 136                            path: input_file_path.into(),
 137                            start_line: None,
 138                            end_line: None,
 139                        },
 140                    )],
 141                ),
 142                message(
 143                    User,
 144                    [tool_result("tool_1", "read_file", input_file_content)],
 145                ),
 146                message(
 147                    Assistant,
 148                    [tool_use(
 149                        "tool_2",
 150                        "edit_file",
 151                        EditFileToolInput {
 152                            display_description: edit_description.into(),
 153                            path: input_file_path.into(),
 154                            mode: EditFileMode::Edit,
 155                        },
 156                    )],
 157                ),
 158            ],
 159            Some(input_file_content.into()),
 160            EvalAssertion::assert_eq(output_file_content),
 161        ),
 162    );
 163}
 164
 165#[test]
 166#[cfg_attr(not(feature = "eval"), ignore)]
 167fn eval_translate_doc_comments() {
 168    // Results for 2025-05-22
 169    //
 170    //  Model                          | Pass rate
 171    // ============================================
 172    //
 173    //  claude-3.7-sonnet              |
 174    //  gemini-2.5-pro-preview-03-25   |  1.0
 175    //  gemini-2.5-flash-preview-04-17 |
 176    //  gpt-4.1                        |
 177    let input_file_path = "root/canvas.rs";
 178    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
 179    let edit_description = "Translate all doc comments to Italian";
 180    eval(
 181        200,
 182        1.,
 183        0.05,
 184        EvalInput::from_conversation(
 185            vec![
 186                message(
 187                    User,
 188                    [text(formatdoc! {"
 189                        Read the {input_file_path} file and edit it (without overwriting it),
  190                        translating all the doc comments to Italian.
 191                    "})],
 192                ),
 193                message(
 194                    Assistant,
 195                    [tool_use(
 196                        "tool_1",
 197                        "read_file",
 198                        ReadFileToolInput {
 199                            path: input_file_path.into(),
 200                            start_line: None,
 201                            end_line: None,
 202                        },
 203                    )],
 204                ),
 205                message(
 206                    User,
 207                    [tool_result("tool_1", "read_file", input_file_content)],
 208                ),
 209                message(
 210                    Assistant,
 211                    [tool_use(
 212                        "tool_2",
 213                        "edit_file",
 214                        EditFileToolInput {
 215                            display_description: edit_description.into(),
 216                            path: input_file_path.into(),
 217                            mode: EditFileMode::Edit,
 218                        },
 219                    )],
 220                ),
 221            ],
 222            Some(input_file_content.into()),
 223            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
 224        ),
 225    );
 226}
 227
 228#[test]
 229#[cfg_attr(not(feature = "eval"), ignore)]
 230fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
 231    // Results for 2025-05-22
 232    //
 233    //  Model                          | Pass rate
 234    // ============================================
 235    //
 236    //  claude-3.7-sonnet              |  0.98
 237    //  gemini-2.5-pro-preview-03-25   |  0.99
 238    //  gemini-2.5-flash-preview-04-17 |
 239    //  gpt-4.1                        |
 240    let input_file_path = "root/lib.rs";
 241    let input_file_content =
 242        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
 243    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
 244    eval(
 245        100,
 246        0.95,
 247        0.05,
 248        EvalInput::from_conversation(
 249            vec![
 250                message(
 251                    User,
 252                    [text(formatdoc! {"
 253                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
 254                        Use `ureq` to download the SDK for the current platform and architecture.
 255                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
 256                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
 257                        that's inside of the archive.
 258                        Don't re-download the SDK if that executable already exists.
 259
 260                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}
 261
 262                        Here are the available wasi-sdk assets:
 263                        - wasi-sdk-25.0-x86_64-macos.tar.gz
 264                        - wasi-sdk-25.0-arm64-macos.tar.gz
 265                        - wasi-sdk-25.0-x86_64-linux.tar.gz
 266                        - wasi-sdk-25.0-arm64-linux.tar.gz
 267                        - wasi-sdk-25.0-x86_64-linux.tar.gz
 268                        - wasi-sdk-25.0-arm64-linux.tar.gz
 269                        - wasi-sdk-25.0-x86_64-windows.tar.gz
 270                    "})],
 271                ),
 272                message(
 273                    Assistant,
 274                    [tool_use(
 275                        "tool_1",
 276                        "read_file",
 277                        ReadFileToolInput {
 278                            path: input_file_path.into(),
 279                            start_line: Some(971),
 280                            end_line: Some(1050),
 281                        },
 282                    )],
 283                ),
 284                message(
 285                    User,
 286                    [tool_result(
 287                        "tool_1",
 288                        "read_file",
 289                        lines(input_file_content, 971..1050),
 290                    )],
 291                ),
 292                message(
 293                    Assistant,
 294                    [tool_use(
 295                        "tool_2",
 296                        "read_file",
 297                        ReadFileToolInput {
 298                            path: input_file_path.into(),
 299                            start_line: Some(1050),
 300                            end_line: Some(1100),
 301                        },
 302                    )],
 303                ),
 304                message(
 305                    User,
 306                    [tool_result(
 307                        "tool_2",
 308                        "read_file",
 309                        lines(input_file_content, 1050..1100),
 310                    )],
 311                ),
 312                message(
 313                    Assistant,
 314                    [tool_use(
 315                        "tool_3",
 316                        "read_file",
 317                        ReadFileToolInput {
 318                            path: input_file_path.into(),
 319                            start_line: Some(1100),
 320                            end_line: Some(1150),
 321                        },
 322                    )],
 323                ),
 324                message(
 325                    User,
 326                    [tool_result(
 327                        "tool_3",
 328                        "read_file",
 329                        lines(input_file_content, 1100..1150),
 330                    )],
 331                ),
 332                message(
 333                    Assistant,
 334                    [tool_use(
 335                        "tool_4",
 336                        "edit_file",
 337                        EditFileToolInput {
 338                            display_description: edit_description.into(),
 339                            path: input_file_path.into(),
 340                            mode: EditFileMode::Edit,
 341                        },
 342                    )],
 343                ),
 344            ],
 345            Some(input_file_content.into()),
 346            EvalAssertion::judge_diff(indoc! {"
 347                - The compile_parser_to_wasm method has been changed to use wasi-sdk
 348                - ureq is used to download the SDK for current platform and architecture
 349            "}),
 350        ),
 351    );
 352}
 353
 354#[test]
 355#[cfg_attr(not(feature = "eval"), ignore)]
 356fn eval_disable_cursor_blinking() {
 357    // Results for 2025-05-22
 358    //
 359    //  Model                          | Pass rate
 360    // ============================================
 361    //
 362    //  claude-3.7-sonnet              |
 363    //  gemini-2.5-pro-preview-03-25   |  1.0
 364    //  gemini-2.5-flash-preview-04-17 |
 365    //  gpt-4.1                        |
 366    let input_file_path = "root/editor.rs";
 367    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
 368    let edit_description = "Comment out the call to `BlinkManager::enable`";
 369    eval(
 370        100,
 371        0.95,
 372        0.05,
 373        EvalInput::from_conversation(
 374            vec![
  375                message(User, [text("Let's research how cursor blinking works.")]),
 376                message(
 377                    Assistant,
 378                    [tool_use(
 379                        "tool_1",
 380                        "grep",
 381                        GrepToolInput {
 382                            regex: "blink".into(),
 383                            include_pattern: None,
 384                            offset: 0,
 385                            case_sensitive: false,
 386                        },
 387                    )],
 388                ),
 389                message(
 390                    User,
 391                    [tool_result(
 392                        "tool_1",
 393                        "grep",
 394                        [
 395                            lines(input_file_content, 100..400),
 396                            lines(input_file_content, 800..1300),
 397                            lines(input_file_content, 1600..2000),
 398                            lines(input_file_content, 5000..5500),
 399                            lines(input_file_content, 8000..9000),
 400                            lines(input_file_content, 18455..18470),
 401                            lines(input_file_content, 20000..20500),
 402                            lines(input_file_content, 21000..21300),
 403                        ]
 404                        .join("Match found:\n\n"),
 405                    )],
 406                ),
 407                message(
 408                    User,
 409                    [text(indoc! {"
 410                        Comment out the lines that interact with the BlinkManager.
  411                        Keep the outer `update` blocks, but comment out everything that's inside (including if statements).
 412                        Don't add additional comments.
 413                    "})],
 414                ),
 415                message(
 416                    Assistant,
 417                    [tool_use(
 418                        "tool_4",
 419                        "edit_file",
 420                        EditFileToolInput {
 421                            display_description: edit_description.into(),
 422                            path: input_file_path.into(),
 423                            mode: EditFileMode::Edit,
 424                        },
 425                    )],
 426                ),
 427            ],
 428            Some(input_file_content.into()),
 429            EvalAssertion::judge_diff(indoc! {"
 430                - Calls to BlinkManager in `observe_window_activation` were commented out
 431                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
 432                - All the edits have valid indentation
 433            "}),
 434        ),
 435    );
 436}
 437
 438#[test]
 439#[cfg_attr(not(feature = "eval"), ignore)]
 440fn eval_from_pixels_constructor() {
 441    // Results for 2025-06-13
 442    //
 443    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
 444    // value. Higher values improve the pass rate but may sometimes cause
 445    // edits to be misapplied. In the context of this eval, this means
 446    // the agent might add from_pixels tests in incorrect locations
 447    // (e.g., at the beginning of the file), yet the evaluation may still
 448    // rate it highly.
 449    //
 450    //  Model                          | Pass rate
 451    // ============================================
 452    //
 453    //  claude-4.0-sonnet              |  0.99
 454    //  claude-3.7-sonnet              |  0.88
 455    //  gemini-2.5-pro-preview-03-25   |  0.96
 456    //  gpt-4.1                        |
 457    let input_file_path = "root/canvas.rs";
 458    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
 459    let edit_description = "Implement from_pixels constructor and add tests.";
 460    eval(
 461        100,
 462        0.95,
 463        // For whatever reason, this eval produces more mismatched tags.
 464        // Increasing for now, let's see if we can bring this down.
 465        0.2,
 466        EvalInput::from_conversation(
 467            vec![
 468                message(
 469                    User,
 470                    [text(indoc! {"
 471                        Introduce a new `from_pixels` constructor in Canvas and
 472                        also add tests for it in the same file.
 473                    "})],
 474                ),
 475                message(
 476                    Assistant,
 477                    [tool_use(
 478                        "tool_1",
 479                        "read_file",
 480                        ReadFileToolInput {
 481                            path: input_file_path.into(),
 482                            start_line: None,
 483                            end_line: None,
 484                        },
 485                    )],
 486                ),
 487                message(
 488                    User,
 489                    [tool_result("tool_1", "read_file", input_file_content)],
 490                ),
 491                message(
 492                    Assistant,
 493                    [tool_use(
 494                        "tool_2",
 495                        "grep",
 496                        GrepToolInput {
 497                            regex: "mod\\s+tests".into(),
 498                            include_pattern: Some("font-kit/src/canvas.rs".into()),
 499                            offset: 0,
 500                            case_sensitive: false,
 501                        },
 502                    )],
 503                ),
 504                message(User, [tool_result("tool_2", "grep", "No matches found")]),
 505                message(
 506                    Assistant,
 507                    [tool_use(
 508                        "tool_3",
 509                        "grep",
 510                        GrepToolInput {
 511                            regex: "mod\\s+tests".into(),
 512                            include_pattern: Some("font-kit/src/**/*.rs".into()),
 513                            offset: 0,
 514                            case_sensitive: false,
 515                        },
 516                    )],
 517                ),
 518                message(User, [tool_result("tool_3", "grep", "No matches found")]),
 519                message(
 520                    Assistant,
 521                    [tool_use(
 522                        "tool_4",
 523                        "grep",
 524                        GrepToolInput {
 525                            regex: "#\\[test\\]".into(),
 526                            include_pattern: Some("font-kit/src/**/*.rs".into()),
 527                            offset: 0,
 528                            case_sensitive: false,
 529                        },
 530                    )],
 531                ),
 532                message(
 533                    User,
 534                    [tool_result(
 535                        "tool_4",
 536                        "grep",
 537                        indoc! {"
 538                            Found 6 matches:
 539
 540                            ## Matches in font-kit/src/loaders/core_text.rs
 541
 542                            ### mod test › L926-936
 543                            ```
 544                            mod test {
 545                                use super::Font;
 546                                use crate::properties::{Stretch, Weight};
 547
 548                                #[cfg(feature = \"source\")]
 549                                use crate::source::SystemSource;
 550
 551                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";
 552
 553                                #[cfg(feature = \"source\")]
 554                                #[test]
 555                            ```
 556
 557                            55 lines remaining in ancestor node. Read the file to see all.
 558
 559                            ### mod test › L947-951
 560                            ```
 561                                }
 562
 563                                #[test]
 564                                fn test_core_text_to_css_font_weight() {
 565                                    // Exact matches
 566                            ```
 567
 568                            ### mod test › L959-963
 569                            ```
 570                                }
 571
 572                                #[test]
 573                                fn test_core_text_to_css_font_stretch() {
 574                                    // Exact matches
 575                            ```
 576
 577                            ## Matches in font-kit/src/loaders/freetype.rs
 578
 579                            ### mod test › L1238-1248
 580                            ```
 581                            mod test {
 582                                use crate::loaders::freetype::Font;
 583
 584                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
 585                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";
 586
 587                                #[test]
 588                                fn get_pcf_postscript_name() {
 589                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
 590                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
 591                                }
 592                            ```
 593
 594                            1 lines remaining in ancestor node. Read the file to see all.
 595
 596                            ## Matches in font-kit/src/sources/core_text.rs
 597
 598                            ### mod test › L265-275
 599                            ```
 600                            mod test {
 601                                use crate::properties::{Stretch, Weight};
 602
 603                                #[test]
 604                                fn test_css_to_core_text_font_weight() {
 605                                    // Exact matches
 606                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
 607                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
 608                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
 609                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);
 610
 611                            ```
 612
 613                            27 lines remaining in ancestor node. Read the file to see all.
 614
 615                            ### mod test › L278-282
 616                            ```
 617                                }
 618
 619                                #[test]
 620                                fn test_css_to_core_text_font_stretch() {
 621                                    // Exact matches
 622                            ```
 623                        "},
 624                    )],
 625                ),
 626                message(
 627                    Assistant,
 628                    [tool_use(
 629                        "tool_5",
 630                        "edit_file",
 631                        EditFileToolInput {
 632                            display_description: edit_description.into(),
 633                            path: input_file_path.into(),
 634                            mode: EditFileMode::Edit,
 635                        },
 636                    )],
 637                ),
 638            ],
 639            Some(input_file_content.into()),
 640            EvalAssertion::judge_diff(indoc! {"
 641                    - The diff contains a new `from_pixels` constructor
 642                    - The diff contains new tests for the `from_pixels` constructor
 643                "}),
 644        ),
 645    );
 646}
 647
 648#[test]
 649#[cfg_attr(not(feature = "eval"), ignore)]
 650fn eval_zode() {
 651    // Results for 2025-05-22
 652    //
 653    //  Model                          | Pass rate
 654    // ============================================
 655    //
 656    //  claude-3.7-sonnet              |  1.0
 657    //  gemini-2.5-pro-preview-03-25   |  1.0
 658    //  gemini-2.5-flash-preview-04-17 |  1.0
 659    //  gpt-4.1                        |  1.0
 660    let input_file_path = "root/zode.py";
 661    let input_content = None;
 662    let edit_description = "Create the main Zode CLI script";
 663    eval(
 664        50,
 665        1.,
 666        0.05,
 667        EvalInput::from_conversation(
 668            vec![
 669                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
 670                message(
 671                    Assistant,
 672                    [
 673                        tool_use(
 674                            "tool_1",
 675                            "read_file",
 676                            ReadFileToolInput {
 677                                path: "root/eval/react.py".into(),
 678                                start_line: None,
 679                                end_line: None,
 680                            },
 681                        ),
 682                        tool_use(
 683                            "tool_2",
 684                            "read_file",
 685                            ReadFileToolInput {
 686                                path: "root/eval/react_test.py".into(),
 687                                start_line: None,
 688                                end_line: None,
 689                            },
 690                        ),
 691                    ],
 692                ),
 693                message(
 694                    User,
 695                    [
 696                        tool_result(
 697                            "tool_1",
 698                            "read_file",
 699                            include_str!("evals/fixtures/zode/react.py"),
 700                        ),
 701                        tool_result(
 702                            "tool_2",
 703                            "read_file",
 704                            include_str!("evals/fixtures/zode/react_test.py"),
 705                        ),
 706                    ],
 707                ),
 708                message(
 709                    Assistant,
 710                    [
 711                        text(
 712                            "Now that I understand what we need to build, I'll create the main Python script:",
 713                        ),
 714                        tool_use(
 715                            "tool_3",
 716                            "edit_file",
 717                            EditFileToolInput {
 718                                display_description: edit_description.into(),
 719                                path: input_file_path.into(),
 720                                mode: EditFileMode::Create,
 721                            },
 722                        ),
 723                    ],
 724                ),
 725            ],
 726            input_content,
 727            EvalAssertion::new(async move |sample, _, _cx| {
 728                let invalid_starts = [' ', '`', '\n'];
 729                let mut message = String::new();
 730                for start in invalid_starts {
 731                    if sample.text_after.starts_with(start) {
 732                        message.push_str(&format!("The sample starts with a {:?}\n", start));
 733                        break;
 734                    }
 735                }
 736                // Remove trailing newline.
 737                message.pop();
 738
 739                if message.is_empty() {
 740                    Ok(EvalAssertionOutcome {
 741                        score: 100,
 742                        message: None,
 743                    })
 744                } else {
 745                    Ok(EvalAssertionOutcome {
 746                        score: 0,
 747                        message: Some(message),
 748                    })
 749                }
 750            }),
 751        ),
 752    );
 753}
 754
 755#[test]
 756#[cfg_attr(not(feature = "eval"), ignore)]
 757fn eval_add_overwrite_test() {
 758    // Results for 2025-05-22
 759    //
 760    //  Model                          | Pass rate
 761    // ============================================
 762    //
 763    //  claude-3.7-sonnet              |  0.16
 764    //  gemini-2.5-pro-preview-03-25   |  0.35
 765    //  gemini-2.5-flash-preview-04-17 |
 766    //  gpt-4.1                        |
 767    let input_file_path = "root/action_log.rs";
 768    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
 769    let edit_description = "Add a new test for overwriting a file in action_log.rs";
 770    eval(
 771        200,
 772        0.5, // TODO: make this eval better
 773        0.05,
 774        EvalInput::from_conversation(
 775            vec![
 776                message(
 777                    User,
 778                    [text(indoc! {"
 779                        Introduce a new test in `action_log.rs` to test overwriting a file.
 780                        That is, a file already exists, but we call `buffer_created` as if the file were new.
 781                        Take inspiration from all the other tests in the file.
 782                    "})],
 783                ),
 784                message(
 785                    Assistant,
 786                    [tool_use(
 787                        "tool_1",
 788                        "read_file",
 789                        ReadFileToolInput {
 790                            path: input_file_path.into(),
 791                            start_line: None,
 792                            end_line: None,
 793                        },
 794                    )],
 795                ),
 796                message(
 797                    User,
 798                    [tool_result(
 799                        "tool_1",
 800                        "read_file",
 801                        indoc! {"
 802                            pub struct ActionLog [L13-20]
 803                             tracked_buffers [L15]
 804                             edited_since_project_diagnostics_check [L17]
 805                             project [L19]
 806                            impl ActionLog [L22-498]
 807                             pub fn new [L24-30]
 808                             pub fn project [L32-34]
 809                             pub fn checked_project_diagnostics [L37-39]
 810                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
 811                             fn track_buffer_internal [L46-101]
 812                             fn handle_buffer_event [L103-116]
 813                             fn handle_buffer_edited [L118-123]
 814                             fn handle_buffer_file_changed [L125-158]
 815                             async fn maintain_diff [L160-264]
 816                             pub fn buffer_read [L267-269]
 817                             pub fn buffer_created [L272-276]
 818                             pub fn buffer_edited [L279-287]
 819                             pub fn will_delete_buffer [L289-304]
 820                             pub fn keep_edits_in_range [L306-364]
 821                             pub fn reject_edits_in_ranges [L366-459]
 822                             pub fn keep_all_edits [L461-473]
 823                             pub fn changed_buffers [L476-482]
 824                             pub fn stale_buffers [L485-497]
 825                            fn apply_non_conflicting_edits [L500-561]
 826                            fn diff_snapshots [L563-585]
 827                            fn point_to_row_edit [L587-614]
 828                            enum ChangeAuthor [L617-620]
 829                             User [L618]
 830                             Agent [L619]
 831                            enum TrackedBufferStatus [L623-627]
 832                             Created [L624]
 833                             Modified [L625]
 834                             Deleted [L626]
 835                            struct TrackedBuffer [L629-641]
 836                             buffer [L630]
 837                             base_text [L631]
 838                             unreviewed_changes [L632]
 839                             status [L633]
 840                             version [L634]
 841                             diff [L635]
 842                             snapshot [L636]
 843                             diff_update [L637]
 844                             _open_lsp_handle [L638]
 845                             _maintain_diff [L639]
 846                             _subscription [L640]
 847                            impl TrackedBuffer [L643-657]
 848                             fn has_changes [L644-650]
 849                             fn schedule_diff_update [L652-656]
 850                            pub struct ChangedBuffer [L659-661]
 851                             pub diff [L660]
 852                            mod tests [L664-1574]
 853                             fn init_logger [L678-682]
 854                             fn init_test [L684-691]
 855                             async fn test_keep_edits [L694-769]
 856                             async fn test_deletions [L772-854]
 857                             async fn test_overlapping_user_edits [L857-951]
 858                             async fn test_creating_files [L954-1010]
 859                             async fn test_deleting_files [L1013-1120]
 860                             async fn test_reject_edits [L1123-1255]
 861                             async fn test_reject_multiple_edits [L1258-1331]
 862                             async fn test_reject_deleted_file [L1334-1388]
 863                             async fn test_reject_created_file [L1391-1443]
 864                             async fn test_random_diffs [L1446-1535]
 865                              fn quiesce [L1510-1534]
 866                             struct HunkStatus [L1538-1542]
 867                              range [L1539]
 868                              diff_status [L1540]
 869                              old_text [L1541]
 870                             fn unreviewed_hunks [L1544-1573]
 871
 872                            Showing symbols 1-69 (total symbols: 69)
 873
 874                            Using the line numbers in this outline, you can call this tool again while specifying
 875                            the start_line and end_line fields to see the implementations of symbols in the outline.
 876                        "},
 877                    )],
 878                ),
 879                message(
 880                    Assistant,
 881                    [
 882                        text(
 883                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
 884                        ),
 885                        tool_use(
 886                            "tool_2",
 887                            "read_file",
 888                            ReadFileToolInput {
 889                                path: input_file_path.into(),
 890                                start_line: Some(953),
 891                                end_line: Some(1010),
 892                            },
 893                        ),
 894                    ],
 895                ),
 896                message(
 897                    User,
 898                    [tool_result(
 899                        "tool_2",
 900                        "read_file",
 901                        lines(input_file_content, 953..1010),
 902                    )],
 903                ),
 904                message(
 905                    Assistant,
 906                    [
 907                        text(
 908                            "Now let's look at another relevant test to understand how they're structured:",
 909                        ),
 910                        tool_use(
 911                            "tool_3",
 912                            "read_file",
 913                            ReadFileToolInput {
 914                                path: input_file_path.into(),
 915                                start_line: Some(1012),
 916                                end_line: Some(1120),
 917                            },
 918                        ),
 919                    ],
 920                ),
 921                message(
 922                    User,
 923                    [tool_result(
 924                        "tool_3",
 925                        "read_file",
 926                        lines(input_file_content, 1012..1120),
 927                    )],
 928                ),
 929                message(
 930                    Assistant,
 931                    [
 932                        text("Now let's look at how `buffer_created` is implemented:"),
 933                        tool_use(
 934                            "tool_4",
 935                            "read_file",
 936                            ReadFileToolInput {
 937                                path: input_file_path.into(),
 938                                start_line: Some(271),
 939                                end_line: Some(276),
 940                            },
 941                        ),
 942                    ],
 943                ),
 944                message(
 945                    User,
 946                    [tool_result(
 947                        "tool_4",
 948                        "read_file",
 949                        lines(input_file_content, 271..276),
 950                    )],
 951                ),
 952                message(
 953                    Assistant,
 954                    [
 955                        text(indoc! {"
 956                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:
 957
 958                            1. Create a file
 959                            2. Open a buffer for it
 960                            3. Track it in the action log
 961                            4. Modify the buffer
 962                            5. Call `buffer_created` again as if the file were new
 963                            6. Check that the changes are properly tracked
 964
 965                            Let's write the test:
 966                        "}),
 967                        tool_use(
 968                            "tool_5",
 969                            "edit_file",
 970                            EditFileToolInput {
 971                                display_description: edit_description.into(),
 972                                path: input_file_path.into(),
 973                                mode: EditFileMode::Edit,
 974                            },
 975                        ),
 976                    ],
 977                ),
 978            ],
 979            Some(input_file_content.into()),
 980            EvalAssertion::judge_diff(
 981                "A new test for overwritten files was created, without changing any previous test",
 982            ),
 983        ),
 984    );
 985}
 986
 987#[test]
 988#[cfg_attr(not(feature = "eval"), ignore)]
 989fn eval_create_empty_file() {
  990    // Check that the Edit Agent can create a file without writing its
 991    // thoughts into it. This issue is not specific to empty files, but
 992    // it's easier to reproduce with them.
 993    //
 994    // Results for 2025-05-21:
 995    //
 996    //  Model                          | Pass rate
 997    // ============================================
 998    //
 999    //  claude-3.7-sonnet              |  1.00
1000    //  gemini-2.5-pro-preview-03-25   |  1.00
1001    //  gemini-2.5-flash-preview-04-17 |  1.00
1002    //  gpt-4.1                        |  1.00
1003    //
1004    //
1005    // TODO: gpt-4.1-mini errored 38 times:
1006    // "data did not match any variant of untagged enum ResponseStreamResult"
1007    //
1008    let input_file_content = None;
1009    let expected_output_content = String::new();
1010    eval(
1011        100,
1012        0.99,
1013        0.05,
1014        EvalInput::from_conversation(
1015            vec![
1016                message(User, [text("Create a second empty todo file ")]),
1017                message(
1018                    Assistant,
1019                    [
1020                        text(formatdoc! {"
1021                        I'll help you create a second empty todo file.
1022                        First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
1023                        "}),
1024                        tool_use(
1025                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
1026                            "list_directory",
1027                            ListDirectoryToolInput {
1028                                path: "root".to_string(),
1029                            },
1030                        ),
1031                    ],
1032                ),
1033                message(
1034                    User,
1035                    [tool_result(
1036                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
1037                        "list_directory",
1038                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
1039                    )],
1040                ),
1041                message(
1042                    Assistant,
1043                    [
1044                        text(formatdoc! {"
1045                        I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
1046                    "}),
1047                        tool_use(
1048                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
1049                            "edit_file",
1050                            EditFileToolInput {
1051                                display_description: "Create empty TODO3 file".to_string(),
1052                                mode: EditFileMode::Create,
1053                                path: "root/TODO3".into(),
1054                            },
1055                        ),
1056                    ],
1057                ),
1058            ],
1059            input_file_content,
1060            // Bad behavior is to write something like
1061            // "I'll create an empty TODO3 file as requested."
1062            EvalAssertion::assert_eq(expected_output_content),
1063        ),
1064    );
1065}
1066
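     /// Builds a `LanguageModelRequestMessage` with the given role and contents
     /// (not cached by default).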
1067fn message(
1068    role: Role,
1069    contents: impl IntoIterator<Item = MessageContent>,
1070) -> LanguageModelRequestMessage {
1071    LanguageModelRequestMessage {
1072        role,
1073        content: contents.into_iter().collect(),
1074        cache: false,
1075    }
1076}
1077
1078fn text(text: impl Into<String>) -> MessageContent {
1079    MessageContent::Text(text.into())
1080}
1081
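     /// Returns the zero-based `range` of lines from `input`, joined with newlines.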
1082fn lines(input: &str, range: Range<usize>) -> String {
1083    input
1084        .lines()
1085        .skip(range.start)
1086        .take(range.len())
1087        .collect::<Vec<_>>()
1088        .join("\n")
1089}
1090
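     /// Builds a tool-use content item, serializing `input` both as pretty-printed
     /// JSON text and as a JSON value.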
1091fn tool_use(
1092    id: impl Into<Arc<str>>,
1093    name: impl Into<Arc<str>>,
1094    input: impl Serialize,
1095) -> MessageContent {
1096    MessageContent::ToolUse(LanguageModelToolUse {
1097        id: LanguageModelToolUseId::from(id.into()),
1098        name: name.into(),
1099        raw_input: serde_json::to_string_pretty(&input).unwrap(),
1100        input: serde_json::to_value(input).unwrap(),
1101        is_input_complete: true,
1102    })
1103}
1104
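     /// Builds a successful (non-error) tool-result content item for the given
     /// tool-use id.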
1105fn tool_result(
1106    id: impl Into<Arc<str>>,
1107    name: impl Into<Arc<str>>,
1108    result: impl Into<Arc<str>>,
1109) -> MessageContent {
1110    MessageContent::ToolResult(LanguageModelToolResult {
1111        tool_use_id: LanguageModelToolUseId::from(id.into()),
1112        tool_name: name.into(),
1113        is_error: false,
1114        content: LanguageModelToolResultContent::Text(result.into()),
1115        output: None,
1116    })
1117}
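     /// A recorded conversation, the initial buffer contents (if any), and the
     /// assertion used to grade the resulting edit.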
1118
1119#[derive(Clone)]
1120struct EvalInput {
1121    conversation: Vec<LanguageModelRequestMessage>,
1122    edit_file_input: EditFileToolInput,
1123    input_content: Option<String>,
1124    assertion: EvalAssertion,
1125}
1126
1127impl EvalInput {
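         /// Builds an `EvalInput` from a conversation that must end with an assistant
         /// message containing an `edit_file` tool use; that tool call's input is the
         /// edit under test.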
1128    fn from_conversation(
1129        conversation: Vec<LanguageModelRequestMessage>,
1130        input_content: Option<String>,
1131        assertion: EvalAssertion,
1132    ) -> Self {
1133        let msg = conversation.last().expect("Conversation must not be empty");
1134        if msg.role != Role::Assistant {
1135            panic!("Conversation must end with an assistant message");
1136        }
1137        let tool_use = msg
1138            .content
1139            .iter()
1140            .flat_map(|content| match content {
1141                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1142                    Some(tool_use)
1143                }
1144                _ => None,
1145            })
1146            .next()
1147            .expect("Conversation must end with an edit_file tool use")
1148            .clone();
1149
1150        let edit_file_input: EditFileToolInput =
1151            serde_json::from_value(tool_use.input.clone()).unwrap();
1152
1153        EvalInput {
1154            conversation,
1155            edit_file_input,
1156            input_content,
1157            assertion,
1158        }
1159    }
1160}
1161
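     /// A single sampled run: the buffer text before and after the agent's edit,
     /// the agent's raw output, and a diff of the change.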
1162#[derive(Clone)]
1163struct EvalSample {
1164    text_before: String,
1165    text_after: String,
1166    edit_output: EditAgentOutput,
1167    diff: String,
1168}
1169
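     /// Object-safe form of an assertion, implemented below for any async closure
     /// with the matching signature.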
1170trait AssertionFn: 'static + Send + Sync {
1171    fn assert<'a>(
1172        &'a self,
1173        sample: &'a EvalSample,
1174        judge_model: Arc<dyn LanguageModel>,
1175        cx: &'a mut TestAppContext,
1176    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
1177}
1178
1179impl<F> AssertionFn for F
1180where
1181    F: 'static
1182        + Send
1183        + Sync
1184        + AsyncFn(
1185            &EvalSample,
1186            Arc<dyn LanguageModel>,
1187            &mut TestAppContext,
1188        ) -> Result<EvalAssertionOutcome>,
1189{
1190    fn assert<'a>(
1191        &'a self,
1192        sample: &'a EvalSample,
1193        judge_model: Arc<dyn LanguageModel>,
1194        cx: &'a mut TestAppContext,
1195    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
1196        (self)(sample, judge_model, cx).boxed_local()
1197    }
1198}
1199
1200#[derive(Clone)]
1201struct EvalAssertion(Arc<dyn AssertionFn>);
1202
1203impl EvalAssertion {
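         // Built-in assertions: `assert_eq` compares the edited text against an
         // expected string (ignoring empty lines), `assert_diff_any` accepts a match
         // with any of several reference diffs, and `judge_diff` asks the judge model
         // to score the diff against natural-language criteria, expecting a `<score>`
         // tag in its reply.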
1204    fn new<F>(f: F) -> Self
1205    where
1206        F: 'static
1207            + Send
1208            + Sync
1209            + AsyncFn(
1210                &EvalSample,
1211                Arc<dyn LanguageModel>,
1212                &mut TestAppContext,
1213            ) -> Result<EvalAssertionOutcome>,
1214    {
1215        EvalAssertion(Arc::new(f))
1216    }
1217
1218    fn assert_eq(expected: impl Into<String>) -> Self {
1219        let expected = expected.into();
1220        Self::new(async move |sample, _judge, _cx| {
1221            Ok(EvalAssertionOutcome {
1222                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
1223                    100
1224                } else {
1225                    0
1226                },
1227                message: None,
1228            })
1229        })
1230    }
1231
1232    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
1233        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
1234        Self::new(async move |sample, _judge, _cx| {
1235            let matches = expected_diffs.iter().any(|possible_diff| {
1236                let expected =
1237                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
1238                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
1239            });
1240
1241            Ok(EvalAssertionOutcome {
1242                score: if matches { 100 } else { 0 },
1243                message: None,
1244            })
1245        })
1246    }
1247
1248    fn judge_diff(assertions: &'static str) -> Self {
1249        Self::new(async move |sample, judge, cx| {
1250            let prompt = DiffJudgeTemplate {
1251                diff: sample.diff.clone(),
1252                assertions,
1253            }
1254            .render(&Templates::new())
1255            .unwrap();
1256
1257            let request = LanguageModelRequest {
1258                messages: vec![LanguageModelRequestMessage {
1259                    role: Role::User,
1260                    content: vec![prompt.into()],
1261                    cache: false,
1262                }],
1263                ..Default::default()
1264            };
1265            let mut response = retry_on_rate_limit(async || {
1266                Ok(judge
1267                    .stream_completion_text(request.clone(), &cx.to_async())
1268                    .await?)
1269            })
1270            .await?;
1271            let mut output = String::new();
1272            while let Some(chunk) = response.stream.next().await {
1273                let chunk = chunk?;
1274                output.push_str(&chunk);
1275            }
1276
1277            // Parse the score from the response
1278            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
1279            if let Some(captures) = re.captures(&output) {
1280                if let Some(score_match) = captures.get(1) {
1281                    let score = score_match.as_str().parse().unwrap_or(0);
1282                    return Ok(EvalAssertionOutcome {
1283                        score,
1284                        message: Some(output),
1285                    });
1286                }
1287            }
1288
1289            anyhow::bail!("No score found in response. Raw output: {output}");
1290        })
1291    }
1292
1293    async fn run(
1294        &self,
1295        input: &EvalSample,
1296        judge_model: Arc<dyn LanguageModel>,
1297        cx: &mut TestAppContext,
1298    ) -> Result<EvalAssertionOutcome> {
1299        self.0.assert(input, judge_model, cx).await
1300    }
1301}
1302
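     /// Runs the given eval for `iterations` samples (at most 32 concurrently) and
     /// panics if the pass ratio falls below `expected_pass_ratio` or if the ratio
     /// of mismatched edit tags exceeds `mismatched_tag_threshold`.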
1303fn eval(
1304    iterations: usize,
1305    expected_pass_ratio: f32,
1306    mismatched_tag_threshold: f32,
1307    mut eval: EvalInput,
1308) {
1309    let mut evaluated_count = 0;
1310    let mut failed_count = 0;
1311    report_progress(evaluated_count, failed_count, iterations);
1312
1313    let (tx, rx) = mpsc::channel();
1314
 1315    // Cache the last message in the conversation and run one instance of the eval up
 1316    // front, so that all subsequent runs hit the cache.
1317    eval.conversation.last_mut().unwrap().cache = true;
1318    run_eval(eval.clone(), tx.clone());
1319
1320    let executor = gpui::background_executor();
1321    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
1322    for _ in 1..iterations {
1323        let eval = eval.clone();
1324        let tx = tx.clone();
1325        let semaphore = semaphore.clone();
1326        executor
1327            .spawn(async move {
1328                let _guard = semaphore.acquire().await;
1329                run_eval(eval, tx)
1330            })
1331            .detach();
1332    }
1333    drop(tx);
1334
1335    let mut failed_evals = HashMap::default();
1336    let mut errored_evals = HashMap::default();
1337    let mut eval_outputs = Vec::new();
1338    let mut cumulative_parser_metrics = EditParserMetrics::default();
1339    while let Ok(output) = rx.recv() {
1340        match output {
1341            Ok(output) => {
1342                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
1343                eval_outputs.push(output.clone());
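                     // A judged score below 80 counts as a failed sample.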
1344                if output.assertion.score < 80 {
1345                    failed_count += 1;
1346                    failed_evals
1347                        .entry(output.sample.text_after.clone())
1348                        .or_insert(Vec::new())
1349                        .push(output);
1350                }
1351            }
1352            Err(error) => {
1353                failed_count += 1;
1354                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
1355            }
1356        }
1357
1358        evaluated_count += 1;
1359        report_progress(evaluated_count, failed_count, iterations);
1360    }
1361
1362    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
1363    println!("Actual pass ratio: {}\n", actual_pass_ratio);
1364    if actual_pass_ratio < expected_pass_ratio {
1365        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
1366        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
1367        for (error, count) in errored_evals {
1368            println!("Eval errored {} times. Error: {}", count, error);
1369        }
1370
1371        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
1372        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
1373        for (_buffer_output, failed_evals) in failed_evals {
1374            let eval_output = failed_evals.first().unwrap();
1375            println!("Eval failed {} times", failed_evals.len());
1376            println!("{}", eval_output);
1377        }
1378
1379        panic!(
1380            "Actual pass ratio: {}\nExpected pass ratio: {}",
1381            actual_pass_ratio, expected_pass_ratio
1382        );
1383    }
1384
1385    let mismatched_tag_ratio =
1386        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
1387    if mismatched_tag_ratio > mismatched_tag_threshold {
1388        for eval_output in eval_outputs {
1389            println!("{}", eval_output);
1390        }
1391        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
1392    }
1393}
1394
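/// Runs one eval on a fresh `TestAppContext` and sends the result over `tx`.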
fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
    let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
    let mut cx = TestAppContext::build(dispatcher, None);
    let output = cx.executor().block_test(async {
        let test = EditAgentTest::new(&mut cx).await;
        test.eval(eval, &mut cx).await
    });
    tx.send(output).unwrap();
}

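/// The outcome of a single eval run: the edited sample plus the judge's assertion result.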
#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}

impl Display for EvalOutput {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Score: {:?}", self.assertion.score)?;
        if let Some(message) = self.assertion.message.as_ref() {
            writeln!(f, "Message: {}", message)?;
        }

        writeln!(f, "Diff:\n{}", self.sample.diff)?;

        writeln!(
            f,
            "Parser Metrics:\n{:#?}",
            self.sample.edit_output.parser_metrics
        )?;
        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
        Ok(())
    }
}

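/// Rewrites the current terminal line with how many evals have run and the running pass percentage.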
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_count = evaluated_count - failed_count;
    let passed_ratio = if evaluated_count == 0 {
        0.0
    } else {
        passed_count as f64 / evaluated_count as f64
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}

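/// Harness for exercising an `EditAgent` against a test project, with a separate model
/// acting as judge.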
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}

impl EditAgentTest {
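    /// Sets up the harness: initializes settings and language model providers, creates a
    /// test project rooted at `/root`, and loads the agent and judge models selected via
    /// the `ZED_AGENT_MODEL` and `ZED_JUDGE_MODEL` environment variables.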
    async fn new(cx: &mut TestAppContext) -> Self {
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor().clone());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);

            client::init_settings(cx);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

            settings::init(cx);
            Project::init_settings(cx);
            language::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
            crate::init(client.http_client(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        Self {
            agent: EditAgent::new(agent_model, project.clone(), action_log, Templates::new()),
            project,
            judge_model,
        }
    }

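    /// Finds `selected_model` in the global `LanguageModelRegistry` and authenticates its
    /// provider before returning it.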
    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        let (provider, model) = cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .expect("Model not found");
            let provider = models.provider(&model.provider_id()).unwrap();
            (provider, model)
        })?;
        cx.update(|cx| provider.authenticate(cx))?.await?;
        Ok(model)
    }

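    /// Runs a single eval: opens the target buffer, assembles the request (tool schemas
    /// plus a system prompt when the conversation lacks one), asks the agent to edit or
    /// overwrite the buffer, and has the judge score the resulting diff.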
    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();
        let tools = cx.update(|cx| {
            ToolRegistry::default_global(cx)
                .tools()
                .into_iter()
                .filter_map(|tool| {
                    let input_schema = tool
                        .input_schema(self.agent.model.tool_input_format())
                        .ok()?;
                    Some(LanguageModelRequestTool {
                        name: tool.name(),
                        description: tool.description(),
                        input_schema,
                    })
                })
                .collect::<Vec<_>>()
        });
        let tool_names = tools
            .iter()
            .map(|tool| tool.name.clone())
            .collect::<Vec<_>>();
        let worktrees = vec![WorktreeContext {
            root_name: "root".to_string(),
            rules_file: None,
        }];
        let prompt_builder = PromptBuilder::new(None)?;
        let project_context = ProjectContext::new(worktrees, Vec::default());
        let system_prompt = prompt_builder.generate_assistant_system_prompt(
            &project_context,
            &ModelContext {
                available_tools: tool_names,
            },
        )?;

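        // Prepend the generated system prompt unless the conversation already starts with one.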
        let has_system_prompt = eval
            .conversation
            .first()
            .map_or(false, |msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let conversation = LanguageModelRequest {
            messages,
            tools,
            ..Default::default()
        };

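        // In `Edit` mode the agent patches the existing buffer contents; otherwise it
        // rewrites the file from scratch. Both paths retry on rate limits.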
        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            retry_on_rate_limit(async || {
                self.agent
                    .edit(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        } else {
            retry_on_rate_limit(async || {
                self.agent
                    .overwrite(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        };

        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
}

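/// Calls `request` repeatedly, sleeping for the provider-supplied backoff (plus random
/// jitter) whenever it fails with a rate limit error; any other error is returned as-is.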
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
    let mut attempt = 0;
    loop {
        attempt += 1;
        match request().await {
            Ok(result) => return Ok(result),
            Err(err) => match err.downcast::<LanguageModelCompletionError>() {
                Ok(err) => match err {
                    LanguageModelCompletionError::RateLimit(duration) => {
                        // Wait for the supplied duration, plus random jitter so that
                        // concurrent requests don't all retry at the same moment.
                        let jitter = duration.mul_f64(rand::thread_rng().gen_range(0.0..0.5));
                        eprintln!(
                            "Attempt #{attempt}: Rate limit exceeded. Retry after {duration:?} + jitter of {jitter:?}"
                        );
                        Timer::after(duration + jitter).await;
                        continue;
                    }
                    _ => return Err(err.into()),
                },
                Err(err) => return Err(err),
            },
        }
    }
}

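/// A judge's verdict for one sample: the parsed score and the full response it came from.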
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}

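/// Context rendered into the `diff_judge.hbs` template when asking the judge model to
/// evaluate a diff against a set of assertions.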
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}

impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}

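/// Drops blank and whitespace-only lines from `text`, joining the remainder with newlines.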
fn strip_empty_lines(text: &str) -> String {
    text.lines()
        .filter(|line| !line.trim().is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}