evals.rs

   1use super::*;
   2use crate::{
   3    ReadFileToolInput,
   4    edit_file_tool::{EditFileMode, EditFileToolInput},
   5    grep_tool::GrepToolInput,
   6    list_directory_tool::ListDirectoryToolInput,
   7};
   8use Role::*;
   9use assistant_tool::ToolRegistry;
  10use client::{Client, UserStore};
  11use collections::HashMap;
  12use fs::FakeFs;
  13use futures::{FutureExt, future::LocalBoxFuture};
  14use gpui::{AppContext, TestAppContext, Timer};
  15use indoc::{formatdoc, indoc};
  16use language_model::{
  17    LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
  18    LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
  19};
  20use project::Project;
  21use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
  22use rand::prelude::*;
  23use reqwest_client::ReqwestClient;
  24use serde_json::json;
  25use std::{
  26    cmp::Reverse,
  27    fmt::{self, Display},
  28    io::Write as _,
  29    str::FromStr,
  30    sync::mpsc,
  31};
  32use util::path;
  33
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           |  0.98
    // gemini-2.5-pro-06-05        |  0.77
    // gemini-2.5-flash            |  0.11
    // gpt-4.1                     |  1.00

    // File the model is asked to edit, plus every diff we accept as correct.
    // Multiple fixture diffs are accepted because there is more than one
    // reasonable way to extract the method.
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    // Replays a canned conversation up to the point where the model must
    // produce the `edit_file` contents: user prompt -> assistant `read_file`
    // tool use -> tool result with the file -> assistant `edit_file` tool use.
    eval(
        100, // number of samples to run
        0.7, // minimum pass rate; taking the lower bar for Gemini
        0.05, // NOTE(review): presumably an allowed failure/mismatched-tag rate — confirm against `eval`'s signature
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                // Assistant reads the whole file before editing.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // The eval continues generation from this `edit_file` tool use;
                // the model must produce the actual edits.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            // Pass if the produced diff matches any of the accepted fixtures.
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
 109
 110#[test]
 111#[cfg_attr(not(feature = "eval"), ignore)]
 112fn eval_delete_run_git_blame() {
 113    let input_file_path = "root/blame.rs";
 114    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
 115    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
 116    let edit_description = "Delete the `run_git_blame` function.";
 117    eval(
 118        100,
 119        0.95,
 120        0.05,
 121        EvalInput::from_conversation(
 122            vec![
 123                message(
 124                    User,
 125                    [text(formatdoc! {"
 126                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
 127                        one function, not its usages.
 128                    "})],
 129                ),
 130                message(
 131                    Assistant,
 132                    [tool_use(
 133                        "tool_1",
 134                        "read_file",
 135                        ReadFileToolInput {
 136                            path: input_file_path.into(),
 137                            start_line: None,
 138                            end_line: None,
 139                        },
 140                    )],
 141                ),
 142                message(
 143                    User,
 144                    [tool_result("tool_1", "read_file", input_file_content)],
 145                ),
 146                message(
 147                    Assistant,
 148                    [tool_use(
 149                        "tool_2",
 150                        "edit_file",
 151                        EditFileToolInput {
 152                            display_description: edit_description.into(),
 153                            path: input_file_path.into(),
 154                            mode: EditFileMode::Edit,
 155                        },
 156                    )],
 157                ),
 158            ],
 159            Some(input_file_content.into()),
 160            EvalAssertion::assert_eq(output_file_content),
 161        ),
 162    );
 163}
 164
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    // Results for 2025-05-22
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |
    //  gemini-2.5-pro-preview-03-25   |  1.0
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |
    // NOTE(review): only the gemini-2.5-pro row has a recorded pass rate.
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    // user prompt -> read_file -> file contents -> edit_file; the model must
    // produce the actual edit, which is then judged by an LLM on the diff.
    eval(
        200, // number of samples to run
        1., // minimum pass rate (no failures tolerated)
        0.05, // NOTE(review): presumably an allowed failure/mismatched-tag rate — confirm against `eval`'s signature
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // The eval continues generation from this `edit_file` tool use.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            // An LLM judge scores the resulting diff against this rubric.
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}
 227
 228#[test]
 229#[cfg_attr(not(feature = "eval"), ignore)]
 230fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
 231    // Results for 2025-05-22
 232    //
 233    //  Model                          | Pass rate
 234    // ============================================
 235    //
 236    //  claude-3.7-sonnet              |  0.98
 237    //  gemini-2.5-pro-preview-03-25   |  0.99
 238    //  gemini-2.5-flash-preview-04-17 |
 239    //  gpt-4.1                        |
 240    let input_file_path = "root/lib.rs";
 241    let input_file_content =
 242        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
 243    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
 244    eval(
 245        100,
 246        0.95,
 247        0.05,
 248        EvalInput::from_conversation(
 249            vec![
 250                message(
 251                    User,
 252                    [text(formatdoc! {"
 253                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
 254                        Use `ureq` to download the SDK for the current platform and architecture.
 255                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
 256                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
 257                        that's inside of the archive.
 258                        Don't re-download the SDK if that executable already exists.
 259
 260                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}
 261
 262                        Here are the available wasi-sdk assets:
 263                        - wasi-sdk-25.0-x86_64-macos.tar.gz
 264                        - wasi-sdk-25.0-arm64-macos.tar.gz
 265                        - wasi-sdk-25.0-x86_64-linux.tar.gz
 266                        - wasi-sdk-25.0-arm64-linux.tar.gz
 267                        - wasi-sdk-25.0-x86_64-linux.tar.gz
 268                        - wasi-sdk-25.0-arm64-linux.tar.gz
 269                        - wasi-sdk-25.0-x86_64-windows.tar.gz
 270                    "})],
 271                ),
 272                message(
 273                    Assistant,
 274                    [tool_use(
 275                        "tool_1",
 276                        "read_file",
 277                        ReadFileToolInput {
 278                            path: input_file_path.into(),
 279                            start_line: Some(971),
 280                            end_line: Some(1050),
 281                        },
 282                    )],
 283                ),
 284                message(
 285                    User,
 286                    [tool_result(
 287                        "tool_1",
 288                        "read_file",
 289                        lines(input_file_content, 971..1050),
 290                    )],
 291                ),
 292                message(
 293                    Assistant,
 294                    [tool_use(
 295                        "tool_2",
 296                        "read_file",
 297                        ReadFileToolInput {
 298                            path: input_file_path.into(),
 299                            start_line: Some(1050),
 300                            end_line: Some(1100),
 301                        },
 302                    )],
 303                ),
 304                message(
 305                    User,
 306                    [tool_result(
 307                        "tool_2",
 308                        "read_file",
 309                        lines(input_file_content, 1050..1100),
 310                    )],
 311                ),
 312                message(
 313                    Assistant,
 314                    [tool_use(
 315                        "tool_3",
 316                        "read_file",
 317                        ReadFileToolInput {
 318                            path: input_file_path.into(),
 319                            start_line: Some(1100),
 320                            end_line: Some(1150),
 321                        },
 322                    )],
 323                ),
 324                message(
 325                    User,
 326                    [tool_result(
 327                        "tool_3",
 328                        "read_file",
 329                        lines(input_file_content, 1100..1150),
 330                    )],
 331                ),
 332                message(
 333                    Assistant,
 334                    [tool_use(
 335                        "tool_4",
 336                        "edit_file",
 337                        EditFileToolInput {
 338                            display_description: edit_description.into(),
 339                            path: input_file_path.into(),
 340                            mode: EditFileMode::Edit,
 341                        },
 342                    )],
 343                ),
 344            ],
 345            Some(input_file_content.into()),
 346            EvalAssertion::judge_diff(indoc! {"
 347                - The compile_parser_to_wasm method has been changed to use wasi-sdk
 348                - ureq is used to download the SDK for current platform and architecture
 349            "}),
 350        ),
 351    );
 352}
 353
 354#[test]
 355#[cfg_attr(not(feature = "eval"), ignore)]
 356fn eval_disable_cursor_blinking() {
 357    // Results for 2025-05-22
 358    //
 359    //  Model                          | Pass rate
 360    // ============================================
 361    //
 362    //  claude-3.7-sonnet              |
 363    //  gemini-2.5-pro-preview-03-25   |  1.0
 364    //  gemini-2.5-flash-preview-04-17 |
 365    //  gpt-4.1                        |
 366    let input_file_path = "root/editor.rs";
 367    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
 368    let edit_description = "Comment out the call to `BlinkManager::enable`";
 369    eval(
 370        100,
 371        0.95,
 372        0.05,
 373        EvalInput::from_conversation(
 374            vec![
 375                message(User, [text("Let's research how to cursor blinking works.")]),
 376                message(
 377                    Assistant,
 378                    [tool_use(
 379                        "tool_1",
 380                        "grep",
 381                        GrepToolInput {
 382                            regex: "blink".into(),
 383                            include_pattern: None,
 384                            offset: 0,
 385                            case_sensitive: false,
 386                        },
 387                    )],
 388                ),
 389                message(
 390                    User,
 391                    [tool_result(
 392                        "tool_1",
 393                        "grep",
 394                        [
 395                            lines(input_file_content, 100..400),
 396                            lines(input_file_content, 800..1300),
 397                            lines(input_file_content, 1600..2000),
 398                            lines(input_file_content, 5000..5500),
 399                            lines(input_file_content, 8000..9000),
 400                            lines(input_file_content, 18455..18470),
 401                            lines(input_file_content, 20000..20500),
 402                            lines(input_file_content, 21000..21300),
 403                        ]
 404                        .join("Match found:\n\n"),
 405                    )],
 406                ),
 407                message(
 408                    User,
 409                    [text(indoc! {"
 410                        Comment out the lines that interact with the BlinkManager.
 411                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
 412                        Don't add additional comments.
 413                    "})],
 414                ),
 415                message(
 416                    Assistant,
 417                    [tool_use(
 418                        "tool_4",
 419                        "edit_file",
 420                        EditFileToolInput {
 421                            display_description: edit_description.into(),
 422                            path: input_file_path.into(),
 423                            mode: EditFileMode::Edit,
 424                        },
 425                    )],
 426                ),
 427            ],
 428            Some(input_file_content.into()),
 429            EvalAssertion::judge_diff(indoc! {"
 430                - Calls to BlinkManager in `observe_window_activation` were commented out
 431                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
 432                - All the edits have valid indentation
 433            "}),
 434        ),
 435    );
 436}
 437
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-05-22
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |
    //  gemini-2.5-pro-preview-03-25   |  0.94
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    // Replays a longer exploration: the assistant reads the file, runs two
    // greps that find nothing, and a third grep that surfaces example test
    // modules from elsewhere in the (simulated) repo, before issuing the
    // `edit_file` tool use the model must complete.
    eval(
        100, // number of samples to run
        0.95, // minimum pass rate
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing for now, let's see if we can bring this down.
        0.2,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // NOTE(review): the grep patterns below reference
                // `font-kit/src/...` while the worktree file is `root/canvas.rs`;
                // this appears to be intentional fixture data mimicking the real
                // font-kit repo layout — confirm.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                // Third grep: look for existing `#[test]` functions to imitate.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                // Canned grep output showing test modules from other files.
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                // The eval continues generation from this `edit_file` tool use.
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            // An LLM judge scores the resulting diff against this rubric.
            EvalAssertion::judge_diff(indoc! {"
                    - The diff contains a new `from_pixels` constructor
                    - The diff contains new tests for the `from_pixels` constructor
                "}),
        ),
    );
}
 640
 641#[test]
 642#[cfg_attr(not(feature = "eval"), ignore)]
 643fn eval_zode() {
 644    // Results for 2025-05-22
 645    //
 646    //  Model                          | Pass rate
 647    // ============================================
 648    //
 649    //  claude-3.7-sonnet              |  1.0
 650    //  gemini-2.5-pro-preview-03-25   |  1.0
 651    //  gemini-2.5-flash-preview-04-17 |  1.0
 652    //  gpt-4.1                        |  1.0
 653    let input_file_path = "root/zode.py";
 654    let input_content = None;
 655    let edit_description = "Create the main Zode CLI script";
 656    eval(
 657        50,
 658        1.,
 659        0.05,
 660        EvalInput::from_conversation(
 661            vec![
 662                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
 663                message(
 664                    Assistant,
 665                    [
 666                        tool_use(
 667                            "tool_1",
 668                            "read_file",
 669                            ReadFileToolInput {
 670                                path: "root/eval/react.py".into(),
 671                                start_line: None,
 672                                end_line: None,
 673                            },
 674                        ),
 675                        tool_use(
 676                            "tool_2",
 677                            "read_file",
 678                            ReadFileToolInput {
 679                                path: "root/eval/react_test.py".into(),
 680                                start_line: None,
 681                                end_line: None,
 682                            },
 683                        ),
 684                    ],
 685                ),
 686                message(
 687                    User,
 688                    [
 689                        tool_result(
 690                            "tool_1",
 691                            "read_file",
 692                            include_str!("evals/fixtures/zode/react.py"),
 693                        ),
 694                        tool_result(
 695                            "tool_2",
 696                            "read_file",
 697                            include_str!("evals/fixtures/zode/react_test.py"),
 698                        ),
 699                    ],
 700                ),
 701                message(
 702                    Assistant,
 703                    [
 704                        text(
 705                            "Now that I understand what we need to build, I'll create the main Python script:",
 706                        ),
 707                        tool_use(
 708                            "tool_3",
 709                            "edit_file",
 710                            EditFileToolInput {
 711                                display_description: edit_description.into(),
 712                                path: input_file_path.into(),
 713                                mode: EditFileMode::Create,
 714                            },
 715                        ),
 716                    ],
 717                ),
 718            ],
 719            input_content,
 720            EvalAssertion::new(async move |sample, _, _cx| {
 721                let invalid_starts = [' ', '`', '\n'];
 722                let mut message = String::new();
 723                for start in invalid_starts {
 724                    if sample.text_after.starts_with(start) {
 725                        message.push_str(&format!("The sample starts with a {:?}\n", start));
 726                        break;
 727                    }
 728                }
 729                // Remove trailing newline.
 730                message.pop();
 731
 732                if message.is_empty() {
 733                    Ok(EvalAssertionOutcome {
 734                        score: 100,
 735                        message: None,
 736                    })
 737                } else {
 738                    Ok(EvalAssertionOutcome {
 739                        score: 0,
 740                        message: Some(message),
 741                    })
 742                }
 743            }),
 744        ),
 745    );
 746}
 747
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    // Eval: starting from a large fixture file, the agent must add one new
    // test without modifying any existing test; an LLM judge scores the
    // resulting diff.
    //
    // Results for 2025-05-22
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  0.16
    //  gemini-2.5-pro-preview-03-25   |  0.35
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    // 200 iterations; the eval panics if fewer than half pass or more than 5%
    // of edit-parser tags are mismatched.
    eval(
        200,
        0.5, // TODO: make this eval better
        0.05,
        EvalInput::from_conversation(
            vec![
                // Turn 1: the user's request.
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                // The agent first requests the whole file (which comes back
                // as a symbol outline, see the tool result below).
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                             struct HunkStatus [L1538-1542]
                              range [L1539]
                              diff_status [L1540]
                              old_text [L1541]
                             fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                // The agent then reads a few relevant line ranges; each tool
                // result is served from the fixture via `lines`.
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                // Final turn: the assistant issues the `edit_file` tool use
                // that the eval replays and grades.
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
 979
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Results for 2025-05-21:
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  1.00
    //  gemini-2.5-pro-preview-03-25   |  1.00
    //  gemini-2.5-flash-preview-04-17 |  1.00
    //  gpt-4.1                        |  1.00
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    // No pre-existing buffer contents: the final tool use creates the file.
    let input_file_content = None;
    // The created file must end up completely empty.
    let expected_output_content = String::new();
    eval(
        100,
        0.99,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                // The agent lists the directory to pick a name for the new file.
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                        I'll help you create a second empty todo file.
                        First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                // Final turn: the `edit_file` tool use (Create mode) under eval.
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                        I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                    "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
1059
1060fn message(
1061    role: Role,
1062    contents: impl IntoIterator<Item = MessageContent>,
1063) -> LanguageModelRequestMessage {
1064    LanguageModelRequestMessage {
1065        role,
1066        content: contents.into_iter().collect(),
1067        cache: false,
1068    }
1069}
1070
1071fn text(text: impl Into<String>) -> MessageContent {
1072    MessageContent::Text(text.into())
1073}
1074
/// Returns the 0-indexed, half-open `range` of lines from `input`,
/// joined with `\n` (no trailing newline).
fn lines(input: &str, range: Range<usize>) -> String {
    let mut selected = Vec::new();
    for (index, line) in input.lines().enumerate() {
        if index >= range.end {
            break;
        }
        if index >= range.start {
            selected.push(line);
        }
    }
    selected.join("\n")
}
1083
1084fn tool_use(
1085    id: impl Into<Arc<str>>,
1086    name: impl Into<Arc<str>>,
1087    input: impl Serialize,
1088) -> MessageContent {
1089    MessageContent::ToolUse(LanguageModelToolUse {
1090        id: LanguageModelToolUseId::from(id.into()),
1091        name: name.into(),
1092        raw_input: serde_json::to_string_pretty(&input).unwrap(),
1093        input: serde_json::to_value(input).unwrap(),
1094        is_input_complete: true,
1095    })
1096}
1097
1098fn tool_result(
1099    id: impl Into<Arc<str>>,
1100    name: impl Into<Arc<str>>,
1101    result: impl Into<Arc<str>>,
1102) -> MessageContent {
1103    MessageContent::ToolResult(LanguageModelToolResult {
1104        tool_use_id: LanguageModelToolUseId::from(id.into()),
1105        tool_name: name.into(),
1106        is_error: false,
1107        content: LanguageModelToolResultContent::Text(result.into()),
1108        output: None,
1109    })
1110}
1111
/// A single eval case: a recorded conversation ending in an `edit_file` tool
/// use, plus the assertion used to grade the resulting edit.
#[derive(Clone)]
struct EvalInput {
    /// Messages replayed to the model; must end with an assistant message
    /// containing an `edit_file` tool use.
    conversation: Vec<LanguageModelRequestMessage>,
    /// Parsed input of that final `edit_file` tool use.
    edit_file_input: EditFileToolInput,
    /// Initial buffer text installed before the edit (used when the mode is
    /// `EditFileMode::Edit`); `None` when the file is being created.
    input_content: Option<String>,
    /// How the resulting buffer text is scored.
    assertion: EvalAssertion,
}
1119
1120impl EvalInput {
1121    fn from_conversation(
1122        conversation: Vec<LanguageModelRequestMessage>,
1123        input_content: Option<String>,
1124        assertion: EvalAssertion,
1125    ) -> Self {
1126        let msg = conversation.last().expect("Conversation must not be empty");
1127        if msg.role != Role::Assistant {
1128            panic!("Conversation must end with an assistant message");
1129        }
1130        let tool_use = msg
1131            .content
1132            .iter()
1133            .flat_map(|content| match content {
1134                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1135                    Some(tool_use)
1136                }
1137                _ => None,
1138            })
1139            .next()
1140            .expect("Conversation must end with an edit_file tool use")
1141            .clone();
1142
1143        let edit_file_input: EditFileToolInput =
1144            serde_json::from_value(tool_use.input.clone()).unwrap();
1145
1146        EvalInput {
1147            conversation,
1148            edit_file_input,
1149            input_content,
1150            assertion,
1151        }
1152    }
1153}
1154
/// The outcome of one agent edit run, captured for grading.
#[derive(Clone)]
struct EvalSample {
    /// Buffer text before the agent's edit was applied.
    text_before: String,
    /// Buffer text after the agent's edit was applied.
    text_after: String,
    /// Raw output and parser metrics reported by the edit agent.
    edit_output: EditAgentOutput,
    /// Rendered diff of the edit (shown to the judge model and in reports).
    diff: String,
}
1162
/// Object-safe form of an async assertion over an [`EvalSample`], allowing
/// assertions to be stored and shared as `Arc<dyn AssertionFn>`.
trait AssertionFn: 'static + Send + Sync {
    /// Grades `sample`, optionally consulting `judge_model`, and returns the
    /// outcome as a boxed local future.
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1171
// Blanket impl: any matching `AsyncFn` closure can serve as an `AssertionFn`
// by boxing the future it returns.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1192
/// A cloneable, shareable assertion used to grade an [`EvalSample`].
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1195
impl EvalAssertion {
    /// Wraps an async closure `(sample, judge_model, cx) -> outcome` as an
    /// assertion.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Scores 100 iff the sample's final text equals `expected`, comparing
    /// with empty lines stripped from both sides; otherwise scores 0.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Scores 100 iff applying any one of `expected_diffs` to the sample's
    /// original text reproduces its final text (empty lines ignored).
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// Asks the judge model to grade the sample's diff against the given
    /// natural-language `assertions`; the judge's reply must contain a
    /// `<score>N</score>` tag, which becomes the outcome's score.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                ..Default::default()
            };
            // Retries the whole request on rate-limit errors, then streams
            // the judge's reply into `output`.
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output) {
                if let Some(score_match) = captures.get(1) {
                    // `\d+` guarantees digits; parse only fails on overflow,
                    // which is treated as a score of 0.
                    let score = score_match.as_str().parse().unwrap_or(0);
                    return Ok(EvalAssertionOutcome {
                        score,
                        message: Some(output),
                    });
                }
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Runs the wrapped assertion against `input`.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1295
1296fn eval(
1297    iterations: usize,
1298    expected_pass_ratio: f32,
1299    mismatched_tag_threshold: f32,
1300    mut eval: EvalInput,
1301) {
1302    let mut evaluated_count = 0;
1303    let mut failed_count = 0;
1304    report_progress(evaluated_count, failed_count, iterations);
1305
1306    let (tx, rx) = mpsc::channel();
1307
1308    // Cache the last message in the conversation, and run one instance of the eval so that
1309    // all the next ones are cached.
1310    eval.conversation.last_mut().unwrap().cache = true;
1311    run_eval(eval.clone(), tx.clone());
1312
1313    let executor = gpui::background_executor();
1314    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
1315    for _ in 1..iterations {
1316        let eval = eval.clone();
1317        let tx = tx.clone();
1318        let semaphore = semaphore.clone();
1319        executor
1320            .spawn(async move {
1321                let _guard = semaphore.acquire().await;
1322                run_eval(eval, tx)
1323            })
1324            .detach();
1325    }
1326    drop(tx);
1327
1328    let mut failed_evals = HashMap::default();
1329    let mut errored_evals = HashMap::default();
1330    let mut eval_outputs = Vec::new();
1331    let mut cumulative_parser_metrics = EditParserMetrics::default();
1332    while let Ok(output) = rx.recv() {
1333        match output {
1334            Ok(output) => {
1335                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
1336                eval_outputs.push(output.clone());
1337                if output.assertion.score < 80 {
1338                    failed_count += 1;
1339                    failed_evals
1340                        .entry(output.sample.text_after.clone())
1341                        .or_insert(Vec::new())
1342                        .push(output);
1343                }
1344            }
1345            Err(error) => {
1346                failed_count += 1;
1347                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
1348            }
1349        }
1350
1351        evaluated_count += 1;
1352        report_progress(evaluated_count, failed_count, iterations);
1353    }
1354
1355    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
1356    println!("Actual pass ratio: {}\n", actual_pass_ratio);
1357    if actual_pass_ratio < expected_pass_ratio {
1358        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
1359        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
1360        for (error, count) in errored_evals {
1361            println!("Eval errored {} times. Error: {}", count, error);
1362        }
1363
1364        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
1365        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
1366        for (_buffer_output, failed_evals) in failed_evals {
1367            let eval_output = failed_evals.first().unwrap();
1368            println!("Eval failed {} times", failed_evals.len());
1369            println!("{}", eval_output);
1370        }
1371
1372        panic!(
1373            "Actual pass ratio: {}\nExpected pass ratio: {}",
1374            actual_pass_ratio, expected_pass_ratio
1375        );
1376    }
1377
1378    let mismatched_tag_ratio =
1379        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
1380    if mismatched_tag_ratio > mismatched_tag_threshold {
1381        for eval_output in eval_outputs {
1382            println!("{}", eval_output);
1383        }
1384        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
1385    }
1386}
1387
1388fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
1389    let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
1390    let mut cx = TestAppContext::build(dispatcher, None);
1391    let output = cx.executor().block_test(async {
1392        let test = EditAgentTest::new(&mut cx).await;
1393        test.eval(eval, &mut cx).await
1394    });
1395    tx.send(output).unwrap();
1396}
1397
/// One eval run's sample together with the assertion outcome it received.
#[derive(Clone)]
struct EvalOutput {
    /// The captured before/after texts, diff, and agent output.
    sample: EvalSample,
    /// Score and optional message produced by the assertion.
    assertion: EvalAssertionOutcome,
}
1403
1404impl Display for EvalOutput {
1405    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1406        writeln!(f, "Score: {:?}", self.assertion.score)?;
1407        if let Some(message) = self.assertion.message.as_ref() {
1408            writeln!(f, "Message: {}", message)?;
1409        }
1410
1411        writeln!(f, "Diff:\n{}", self.sample.diff)?;
1412
1413        writeln!(
1414            f,
1415            "Parser Metrics:\n{:#?}",
1416            self.sample.edit_output.parser_metrics
1417        )?;
1418        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1419        Ok(())
1420    }
1421}
1422
/// Rewrites the current terminal line with a running pass-rate summary.
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    // Fraction of evaluated runs that passed so far; 0 before any run ends.
    let pass_fraction = match evaluated_count {
        0 => 0.0,
        total => (total - failed_count) as f64 / total as f64,
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        pass_fraction * 100.0
    );
    // `print!` doesn't flush, so force the partial line out immediately.
    std::io::stdout().flush().unwrap();
}
1438
/// Harness wiring an [`EditAgent`] to a test project and a judge model for
/// running evals against real language-model backends.
struct EditAgentTest {
    /// The edit agent under test, backed by the configured agent model.
    agent: EditAgent,
    /// Test project rooted at `/root` on a fake filesystem.
    project: Entity<Project>,
    /// Model used by `EvalAssertion::judge_diff` to grade diffs.
    judge_model: Arc<dyn LanguageModel>,
}
1444
1445impl EditAgentTest {
    /// Builds the harness: initializes app globals, connects a production
    /// client, loads the agent and judge models (overridable via the
    /// `ZED_AGENT_MODEL` / `ZED_JUDGE_MODEL` env vars, defaulting to
    /// claude-3-7-sonnet), and creates a test project rooted at `/root`.
    async fn new(cx: &mut TestAppContext) -> Self {
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor().clone());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);

            client::init_settings(cx);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

            // NOTE(review): `settings::init` was already called above —
            // confirm whether this second call is intentional or redundant.
            settings::init(cx);
            Project::init_settings(cx);
            language::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
            crate::init(client.http_client(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        // Resolve both selections to live model handles; panics if either
        // model is unavailable or authentication fails.
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        Self {
            agent: EditAgent::new(agent_model, project.clone(), action_log, Templates::new()),
            project,
            judge_model,
        }
    }
1497
1498    async fn load_model(
1499        selected_model: &SelectedModel,
1500        cx: &mut AsyncApp,
1501    ) -> Result<Arc<dyn LanguageModel>> {
1502        let (provider, model) = cx.update(|cx| {
1503            let models = LanguageModelRegistry::read_global(cx);
1504            let model = models
1505                .available_models(cx)
1506                .find(|model| {
1507                    model.provider_id() == selected_model.provider
1508                        && model.id() == selected_model.model
1509                })
1510                .expect("Model not found");
1511            let provider = models.provider(&model.provider_id()).unwrap();
1512            (provider, model)
1513        })?;
1514        cx.update(|cx| provider.authenticate(cx))?.await?;
1515        Ok(model)
1516    }
1517
1518    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
1519        let path = self
1520            .project
1521            .read_with(cx, |project, cx| {
1522                project.find_project_path(eval.edit_file_input.path, cx)
1523            })
1524            .unwrap();
1525        let buffer = self
1526            .project
1527            .update(cx, |project, cx| project.open_buffer(path, cx))
1528            .await
1529            .unwrap();
1530        let tools = cx.update(|cx| {
1531            ToolRegistry::default_global(cx)
1532                .tools()
1533                .into_iter()
1534                .filter_map(|tool| {
1535                    let input_schema = tool
1536                        .input_schema(self.agent.model.tool_input_format())
1537                        .ok()?;
1538                    Some(LanguageModelRequestTool {
1539                        name: tool.name(),
1540                        description: tool.description(),
1541                        input_schema,
1542                    })
1543                })
1544                .collect::<Vec<_>>()
1545        });
1546        let tool_names = tools
1547            .iter()
1548            .map(|tool| tool.name.clone())
1549            .collect::<Vec<_>>();
1550        let worktrees = vec![WorktreeContext {
1551            root_name: "root".to_string(),
1552            rules_file: None,
1553        }];
1554        let prompt_builder = PromptBuilder::new(None)?;
1555        let project_context = ProjectContext::new(worktrees, Vec::default());
1556        let system_prompt = prompt_builder.generate_assistant_system_prompt(
1557            &project_context,
1558            &ModelContext {
1559                available_tools: tool_names,
1560            },
1561        )?;
1562
1563        let has_system_prompt = eval
1564            .conversation
1565            .first()
1566            .map_or(false, |msg| msg.role == Role::System);
1567        let messages = if has_system_prompt {
1568            eval.conversation
1569        } else {
1570            [LanguageModelRequestMessage {
1571                role: Role::System,
1572                content: vec![MessageContent::Text(system_prompt)],
1573                cache: true,
1574            }]
1575            .into_iter()
1576            .chain(eval.conversation)
1577            .collect::<Vec<_>>()
1578        };
1579
1580        let conversation = LanguageModelRequest {
1581            messages,
1582            tools,
1583            ..Default::default()
1584        };
1585
1586        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1587            if let Some(input_content) = eval.input_content.as_deref() {
1588                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1589            }
1590            retry_on_rate_limit(async || {
1591                self.agent
1592                    .edit(
1593                        buffer.clone(),
1594                        eval.edit_file_input.display_description.clone(),
1595                        &conversation,
1596                        &mut cx.to_async(),
1597                    )
1598                    .0
1599                    .await
1600            })
1601            .await?
1602        } else {
1603            retry_on_rate_limit(async || {
1604                self.agent
1605                    .overwrite(
1606                        buffer.clone(),
1607                        eval.edit_file_input.display_description.clone(),
1608                        &conversation,
1609                        &mut cx.to_async(),
1610                    )
1611                    .0
1612                    .await
1613            })
1614            .await?
1615        };
1616
1617        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1618        let sample = EvalSample {
1619            edit_output,
1620            diff: language::unified_diff(
1621                eval.input_content.as_deref().unwrap_or_default(),
1622                &buffer_text,
1623            ),
1624            text_before: eval.input_content.unwrap_or_default(),
1625            text_after: buffer_text,
1626        };
1627        let assertion = eval
1628            .assertion
1629            .run(&sample, self.judge_model.clone(), cx)
1630            .await?;
1631
1632        Ok(EvalOutput { assertion, sample })
1633    }
1634}
1635
1636async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
1637    let mut attempt = 0;
1638    loop {
1639        attempt += 1;
1640        match request().await {
1641            Ok(result) => return Ok(result),
1642            Err(err) => match err.downcast::<LanguageModelCompletionError>() {
1643                Ok(err) => match err {
1644                    LanguageModelCompletionError::RateLimit(duration) => {
1645                        // Wait for the duration supplied, with some jitter to avoid all requests being made at the same time.
1646                        let jitter = duration.mul_f64(rand::thread_rng().gen_range(0.0..0.5));
1647                        eprintln!(
1648                            "Attempt #{attempt}: Rate limit exceeded. Retry after {duration:?} + jitter of {jitter:?}"
1649                        );
1650                        Timer::after(duration + jitter).await;
1651                        continue;
1652                    }
1653                    _ => return Err(err.into()),
1654                },
1655                Err(err) => return Err(err),
1656            },
1657        }
1658    }
1659}
1660
/// Outcome of running an eval's assertion against a sample: a numeric
/// score plus an optional explanatory message from the judge.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    /// Score assigned by the assertion (range not fixed here — presumably
    /// a percentage; confirm against the assertion runner).
    score: usize,
    /// Optional human-readable explanation accompanying the score.
    message: Option<String>,
}
1666
/// Template data for the diff-judging prompt: the diff produced by the
/// agent plus the assertion text the judge should evaluate it against.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    /// Unified diff of the agent's edit, rendered into the prompt.
    diff: String,
    /// Static assertion text the judge checks the diff against.
    assertions: &'static str,
}

impl Template for DiffJudgeTemplate {
    // Rendered through the Handlebars template file of this name.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1676
/// Removes blank and whitespace-only lines from `text`, joining the
/// surviving lines with `\n` (no trailing newline).
fn strip_empty_lines(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    for line in text.lines() {
        if line.trim().is_empty() {
            continue;
        }
        // Separate kept lines with a single newline, but never lead with one.
        if !result.is_empty() {
            result.push('\n');
        }
        result.push_str(line);
    }
    result
}