evals.rs

   1use super::*;
   2use crate::{
   3    ReadFileToolInput,
   4    edit_file_tool::{EditFileMode, EditFileToolInput},
   5    grep_tool::GrepToolInput,
   6    list_directory_tool::ListDirectoryToolInput,
   7};
   8use Role::*;
   9use assistant_tool::ToolRegistry;
  10use client::{Client, UserStore};
  11use collections::HashMap;
  12use fs::FakeFs;
  13use futures::{FutureExt, future::LocalBoxFuture};
  14use gpui::{AppContext, TestAppContext};
  15use indoc::{formatdoc, indoc};
  16use language_model::{
  17    LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
  18    LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
  19};
  20use project::Project;
  21use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
  22use rand::prelude::*;
  23use reqwest_client::ReqwestClient;
  24use serde_json::json;
  25use std::{
  26    cmp::Reverse,
  27    fmt::{self, Display},
  28    io::Write as _,
  29    str::FromStr,
  30    sync::mpsc,
  31};
  32use util::path;
  33
  34#[test]
  35#[cfg_attr(not(feature = "eval"), ignore)]
  36fn eval_extract_handle_command_output() {
  37    // Test how well agent generates multiple edit hunks.
  38    //
  39    // Model                       | Pass rate
  40    // ----------------------------|----------
  41    // claude-3.7-sonnet           |  0.98
  42    // gemini-2.5-pro              |  0.86
  43    // gemini-2.5-flash            |  0.11
  44    // gpt-4.1                     |  1.00
  45
  46    let input_file_path = "root/blame.rs";
  47    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
  48    let possible_diffs = vec![
  49        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
  50        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
  51        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
  52        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
  53        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
  54        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
  55        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
  56    ];
  57    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
  58    eval(
  59        100,
  60        0.7, // Taking the lower bar for Gemini
  61        EvalInput::from_conversation(
  62            vec![
  63                message(
  64                    User,
  65                    [text(formatdoc! {"
  66                        Read the `{input_file_path}` file and extract a method in
  67                        the final stanza of `run_git_blame` to deal with command failures,
  68                        call it `handle_command_output` and take the std::process::Output as the only parameter.
  69                        Do not document the method and do not add any comments.
  70
  71                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
  72                    "})],
  73                ),
  74                message(
  75                    Assistant,
  76                    [tool_use(
  77                        "tool_1",
  78                        "read_file",
  79                        ReadFileToolInput {
  80                            path: input_file_path.into(),
  81                            start_line: None,
  82                            end_line: None,
  83                        },
  84                    )],
  85                ),
  86                message(
  87                    User,
  88                    [tool_result("tool_1", "read_file", input_file_content)],
  89                ),
  90                message(
  91                    Assistant,
  92                    [tool_use(
  93                        "tool_2",
  94                        "edit_file",
  95                        EditFileToolInput {
  96                            display_description: edit_description.into(),
  97                            path: input_file_path.into(),
  98                            mode: EditFileMode::Edit,
  99                        },
 100                    )],
 101                ),
 102            ],
 103            Some(input_file_content.into()),
 104            EvalAssertion::assert_diff_any(possible_diffs),
 105        ),
 106    );
 107}
 108
 109#[test]
 110#[cfg_attr(not(feature = "eval"), ignore)]
 111fn eval_delete_run_git_blame() {
 112    let input_file_path = "root/blame.rs";
 113    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
 114    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
 115    let edit_description = "Delete the `run_git_blame` function.";
 116    eval(
 117        100,
 118        0.95,
 119        EvalInput::from_conversation(
 120            vec![
 121                message(
 122                    User,
 123                    [text(formatdoc! {"
 124                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
 125                        one function, not its usages.
 126                    "})],
 127                ),
 128                message(
 129                    Assistant,
 130                    [tool_use(
 131                        "tool_1",
 132                        "read_file",
 133                        ReadFileToolInput {
 134                            path: input_file_path.into(),
 135                            start_line: None,
 136                            end_line: None,
 137                        },
 138                    )],
 139                ),
 140                message(
 141                    User,
 142                    [tool_result("tool_1", "read_file", input_file_content)],
 143                ),
 144                message(
 145                    Assistant,
 146                    [tool_use(
 147                        "tool_2",
 148                        "edit_file",
 149                        EditFileToolInput {
 150                            display_description: edit_description.into(),
 151                            path: input_file_path.into(),
 152                            mode: EditFileMode::Edit,
 153                        },
 154                    )],
 155                ),
 156            ],
 157            Some(input_file_content.into()),
 158            EvalAssertion::assert_eq(output_file_content),
 159        ),
 160    );
 161}
 162
 163#[test]
 164#[cfg_attr(not(feature = "eval"), ignore)]
 165fn eval_translate_doc_comments() {
 166    // Results for 2025-05-22
 167    //
 168    //  Model                          | Pass rate
 169    // ============================================
 170    //
 171    //  claude-3.7-sonnet              |
 172    //  gemini-2.5-pro-preview-03-25   |  1.0
 173    //  gemini-2.5-flash-preview-04-17 |
 174    //  gpt-4.1                        |
 175    let input_file_path = "root/canvas.rs";
 176    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
 177    let edit_description = "Translate all doc comments to Italian";
 178    eval(
 179        200,
 180        1.,
 181        EvalInput::from_conversation(
 182            vec![
 183                message(
 184                    User,
 185                    [text(formatdoc! {"
 186                        Read the {input_file_path} file and edit it (without overwriting it),
 187                        translating all the doc comments to italian.
 188                    "})],
 189                ),
 190                message(
 191                    Assistant,
 192                    [tool_use(
 193                        "tool_1",
 194                        "read_file",
 195                        ReadFileToolInput {
 196                            path: input_file_path.into(),
 197                            start_line: None,
 198                            end_line: None,
 199                        },
 200                    )],
 201                ),
 202                message(
 203                    User,
 204                    [tool_result("tool_1", "read_file", input_file_content)],
 205                ),
 206                message(
 207                    Assistant,
 208                    [tool_use(
 209                        "tool_2",
 210                        "edit_file",
 211                        EditFileToolInput {
 212                            display_description: edit_description.into(),
 213                            path: input_file_path.into(),
 214                            mode: EditFileMode::Edit,
 215                        },
 216                    )],
 217                ),
 218            ],
 219            Some(input_file_content.into()),
 220            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
 221        ),
 222    );
 223}
 224
 225#[test]
 226#[cfg_attr(not(feature = "eval"), ignore)]
 227fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
 228    // Results for 2025-05-22
 229    //
 230    //  Model                          | Pass rate
 231    // ============================================
 232    //
 233    //  claude-3.7-sonnet              |  0.98
 234    //  gemini-2.5-pro-preview-03-25   |  0.99
 235    //  gemini-2.5-flash-preview-04-17 |
 236    //  gpt-4.1                        |
 237    let input_file_path = "root/lib.rs";
 238    let input_file_content =
 239        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
 240    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
 241    eval(
 242        100,
 243        0.95,
 244        EvalInput::from_conversation(
 245            vec![
 246                message(
 247                    User,
 248                    [text(formatdoc! {"
 249                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
 250                        Use `ureq` to download the SDK for the current platform and architecture.
 251                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
 252                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
 253                        that's inside of the archive.
 254                        Don't re-download the SDK if that executable already exists.
 255
 256                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}
 257
 258                        Here are the available wasi-sdk assets:
 259                        - wasi-sdk-25.0-x86_64-macos.tar.gz
 260                        - wasi-sdk-25.0-arm64-macos.tar.gz
 261                        - wasi-sdk-25.0-x86_64-linux.tar.gz
 262                        - wasi-sdk-25.0-arm64-linux.tar.gz
 263                        - wasi-sdk-25.0-x86_64-linux.tar.gz
 264                        - wasi-sdk-25.0-arm64-linux.tar.gz
 265                        - wasi-sdk-25.0-x86_64-windows.tar.gz
 266                    "})],
 267                ),
 268                message(
 269                    Assistant,
 270                    [tool_use(
 271                        "tool_1",
 272                        "read_file",
 273                        ReadFileToolInput {
 274                            path: input_file_path.into(),
 275                            start_line: Some(971),
 276                            end_line: Some(1050),
 277                        },
 278                    )],
 279                ),
 280                message(
 281                    User,
 282                    [tool_result(
 283                        "tool_1",
 284                        "read_file",
 285                        lines(input_file_content, 971..1050),
 286                    )],
 287                ),
 288                message(
 289                    Assistant,
 290                    [tool_use(
 291                        "tool_2",
 292                        "read_file",
 293                        ReadFileToolInput {
 294                            path: input_file_path.into(),
 295                            start_line: Some(1050),
 296                            end_line: Some(1100),
 297                        },
 298                    )],
 299                ),
 300                message(
 301                    User,
 302                    [tool_result(
 303                        "tool_2",
 304                        "read_file",
 305                        lines(input_file_content, 1050..1100),
 306                    )],
 307                ),
 308                message(
 309                    Assistant,
 310                    [tool_use(
 311                        "tool_3",
 312                        "read_file",
 313                        ReadFileToolInput {
 314                            path: input_file_path.into(),
 315                            start_line: Some(1100),
 316                            end_line: Some(1150),
 317                        },
 318                    )],
 319                ),
 320                message(
 321                    User,
 322                    [tool_result(
 323                        "tool_3",
 324                        "read_file",
 325                        lines(input_file_content, 1100..1150),
 326                    )],
 327                ),
 328                message(
 329                    Assistant,
 330                    [tool_use(
 331                        "tool_4",
 332                        "edit_file",
 333                        EditFileToolInput {
 334                            display_description: edit_description.into(),
 335                            path: input_file_path.into(),
 336                            mode: EditFileMode::Edit,
 337                        },
 338                    )],
 339                ),
 340            ],
 341            Some(input_file_content.into()),
 342            EvalAssertion::judge_diff(indoc! {"
 343                - The compile_parser_to_wasm method has been changed to use wasi-sdk
 344                - ureq is used to download the SDK for current platform and architecture
 345            "}),
 346        ),
 347    );
 348}
 349
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Measures commenting-out edits across several distant regions of a very
    // large file, located only via a simulated grep transcript (the file is
    // never read in full).
    //
    // Results for 2025-05-22
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |
    //  gemini-2.5-pro-preview-03-25   |  1.0
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                // NOTE(review): "how to cursor blinking works" reads like a
                // typo for "how cursor blinking works" — presumably kept
                // verbatim from a real transcript; confirm before changing.
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                // Simulated grep output: disjoint slices of the fixture
                // stitched together. `join` puts the separator only between
                // elements, so the first slice carries no "Match found:"
                // header.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                // The edit call whose output is judged below.
                // NOTE(review): the id jumps from tool_1 to tool_4 —
                // presumably mirrors a longer original transcript; confirm
                // this is intentional.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
        ),
    );
}
 432
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Measures adding new code (a constructor plus tests) after a seeded
    // research phase: the agent reads the file, probes for an existing tests
    // module with grep, and studies test conventions elsewhere in the repo
    // before editing.
    //
    // Results for 2025-05-22
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |
    //  gemini-2.5-pro-preview-03-25   |  0.94
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // Probe 1: look for a tests module in the target file —
                // simulated as finding nothing.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                // Probe 2: widen the search to the whole crate — still
                // nothing (the fixture repo uses `mod test`, not `mod tests`).
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                // Probe 3: search for #[test] attributes to learn the repo's
                // testing conventions; the canned result below supplies them.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                // The edit call whose output is judged below.
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                    - The diff contains a new `from_pixels` constructor
                    - The diff contains new tests for the `from_pixels` constructor
                "}),
        ),
    );
}
 632
 633#[test]
 634#[cfg_attr(not(feature = "eval"), ignore)]
 635fn eval_zode() {
 636    // Results for 2025-05-22
 637    //
 638    //  Model                          | Pass rate
 639    // ============================================
 640    //
 641    //  claude-3.7-sonnet              |  1.0
 642    //  gemini-2.5-pro-preview-03-25   |  1.0
 643    //  gemini-2.5-flash-preview-04-17 |  1.0
 644    //  gpt-4.1                        |  1.0
 645    let input_file_path = "root/zode.py";
 646    let input_content = None;
 647    let edit_description = "Create the main Zode CLI script";
 648    eval(
 649        50,
 650        1.,
 651        EvalInput::from_conversation(
 652            vec![
 653                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
 654                message(
 655                    Assistant,
 656                    [
 657                        tool_use(
 658                            "tool_1",
 659                            "read_file",
 660                            ReadFileToolInput {
 661                                path: "root/eval/react.py".into(),
 662                                start_line: None,
 663                                end_line: None,
 664                            },
 665                        ),
 666                        tool_use(
 667                            "tool_2",
 668                            "read_file",
 669                            ReadFileToolInput {
 670                                path: "root/eval/react_test.py".into(),
 671                                start_line: None,
 672                                end_line: None,
 673                            },
 674                        ),
 675                    ],
 676                ),
 677                message(
 678                    User,
 679                    [
 680                        tool_result(
 681                            "tool_1",
 682                            "read_file",
 683                            include_str!("evals/fixtures/zode/react.py"),
 684                        ),
 685                        tool_result(
 686                            "tool_2",
 687                            "read_file",
 688                            include_str!("evals/fixtures/zode/react_test.py"),
 689                        ),
 690                    ],
 691                ),
 692                message(
 693                    Assistant,
 694                    [
 695                        text(
 696                            "Now that I understand what we need to build, I'll create the main Python script:",
 697                        ),
 698                        tool_use(
 699                            "tool_3",
 700                            "edit_file",
 701                            EditFileToolInput {
 702                                display_description: edit_description.into(),
 703                                path: input_file_path.into(),
 704                                mode: EditFileMode::Create,
 705                            },
 706                        ),
 707                    ],
 708                ),
 709            ],
 710            input_content,
 711            EvalAssertion::new(async move |sample, _, _cx| {
 712                let invalid_starts = [' ', '`', '\n'];
 713                let mut message = String::new();
 714                for start in invalid_starts {
 715                    if sample.text_after.starts_with(start) {
 716                        message.push_str(&format!("The sample starts with a {:?}\n", start));
 717                        break;
 718                    }
 719                }
 720                // Remove trailing newline.
 721                message.pop();
 722
 723                if message.is_empty() {
 724                    Ok(EvalAssertionOutcome {
 725                        score: 100,
 726                        message: None,
 727                    })
 728                } else {
 729                    Ok(EvalAssertionOutcome {
 730                        score: 0,
 731                        message: Some(message),
 732                    })
 733                }
 734            }),
 735        ),
 736    );
 737}
 738
// Evaluates the edit agent's ability to append a brand-new test to a large
// file without disturbing any of the existing tests. The recorded conversation
// walks the model through reading the file outline and related tests; the
// result is graded by an LLM judge on the produced diff.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    // Results for 2025-05-22
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  0.16
    //  gemini-2.5-pro-preview-03-25   |  0.35
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        EvalInput::from_conversation(
            vec![
                // Turn 1: the user asks for a new overwrite test.
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                // Turn 2: the agent requests the file outline.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                // Turn 3: recorded outline of the fixture file.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                             struct HunkStatus [L1538-1542]
                              range [L1539]
                              diff_status [L1540]
                              old_text [L1541]
                             fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                // Turns 4-9: the agent reads two related tests and the
                // implementation of buffer_created; tool results are sliced
                // out of the fixture with `lines`.
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                // Final turn: the agent issues the edit_file call whose
                // resulting edits are the thing being evaluated.
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
 969
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Results for 2025-05-21:
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  1.00
    //  gemini-2.5-pro-preview-03-25   |  1.00
    //  gemini-2.5-flash-preview-04-17 |  1.00
    //  gpt-4.1                        |  1.00
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    // No pre-existing file: the buffer starts empty, and the graded output
    // must be exactly the empty string.
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        100,
        0.99,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                // The agent first lists the directory to pick a file name.
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                        I'll help you create a second empty todo file.
                        First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                // Final turn: the Create-mode edit_file call under evaluation.
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                        I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                    "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
1048
1049fn message(
1050    role: Role,
1051    contents: impl IntoIterator<Item = MessageContent>,
1052) -> LanguageModelRequestMessage {
1053    LanguageModelRequestMessage {
1054        role,
1055        content: contents.into_iter().collect(),
1056        cache: false,
1057    }
1058}
1059
1060fn text(text: impl Into<String>) -> MessageContent {
1061    MessageContent::Text(text.into())
1062}
1063
/// Returns the lines of `input` whose 0-based indices fall in `range`
/// (start inclusive, end exclusive), joined with newlines.
fn lines(input: &str, range: Range<usize>) -> String {
    input
        .lines()
        .enumerate()
        .filter_map(|(index, line)| range.contains(&index).then_some(line))
        .collect::<Vec<_>>()
        .join("\n")
}
1072
1073fn tool_use(
1074    id: impl Into<Arc<str>>,
1075    name: impl Into<Arc<str>>,
1076    input: impl Serialize,
1077) -> MessageContent {
1078    MessageContent::ToolUse(LanguageModelToolUse {
1079        id: LanguageModelToolUseId::from(id.into()),
1080        name: name.into(),
1081        raw_input: serde_json::to_string_pretty(&input).unwrap(),
1082        input: serde_json::to_value(input).unwrap(),
1083        is_input_complete: true,
1084    })
1085}
1086
1087fn tool_result(
1088    id: impl Into<Arc<str>>,
1089    name: impl Into<Arc<str>>,
1090    result: impl Into<Arc<str>>,
1091) -> MessageContent {
1092    MessageContent::ToolResult(LanguageModelToolResult {
1093        tool_use_id: LanguageModelToolUseId::from(id.into()),
1094        tool_name: name.into(),
1095        is_error: false,
1096        content: LanguageModelToolResultContent::Text(result.into()),
1097        output: None,
1098    })
1099}
1100
/// A single eval case: a recorded conversation whose final assistant message
/// invokes the `edit_file` tool, plus the assertion used to grade the result.
#[derive(Clone)]
struct EvalInput {
    /// Messages replayed to the model, ending in an `edit_file` tool use.
    conversation: Vec<LanguageModelRequestMessage>,
    /// Parsed input of that final `edit_file` tool use.
    edit_file_input: EditFileToolInput,
    /// Initial buffer contents; `None` when the file starts empty/absent.
    input_content: Option<String>,
    /// Grading strategy applied to the edited buffer.
    assertion: EvalAssertion,
}
1108
1109impl EvalInput {
1110    fn from_conversation(
1111        conversation: Vec<LanguageModelRequestMessage>,
1112        input_content: Option<String>,
1113        assertion: EvalAssertion,
1114    ) -> Self {
1115        let msg = conversation.last().expect("Conversation must not be empty");
1116        if msg.role != Role::Assistant {
1117            panic!("Conversation must end with an assistant message");
1118        }
1119        let tool_use = msg
1120            .content
1121            .iter()
1122            .flat_map(|content| match content {
1123                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1124                    Some(tool_use)
1125                }
1126                _ => None,
1127            })
1128            .next()
1129            .expect("Conversation must end with an edit_file tool use")
1130            .clone();
1131
1132        let edit_file_input: EditFileToolInput =
1133            serde_json::from_value(tool_use.input.clone()).unwrap();
1134
1135        EvalInput {
1136            conversation,
1137            edit_file_input,
1138            input_content,
1139            assertion,
1140        }
1141    }
1142}
1143
/// The outcome of one edit run, captured for grading.
#[derive(Clone)]
struct EvalSample {
    /// Buffer contents before the agent edited it.
    text_before: String,
    /// Buffer contents after the agent's edits were applied.
    text_after: String,
    /// Raw output and parser metrics produced by the edit agent.
    edit_output: EditAgentOutput,
    /// Unified diff between `text_before` and `text_after`.
    diff: String,
}
1151
/// Object-safe grading callback: inspects an [`EvalSample`] (optionally
/// consulting the judge model) and yields a scored outcome. The returned
/// future is boxed and non-`Send` so implementors can borrow the sample and
/// the test context.
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1160
/// Blanket impl so any matching async closure can be used as an assertion
/// without a hand-written adapter; the future is boxed via `boxed_local` to
/// satisfy the object-safe trait signature.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1181
/// Cloneable handle around a grading callback, shared across concurrent
/// eval iterations.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1184
impl EvalAssertion {
    /// Wraps an async grading closure in an `EvalAssertion`.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Pass/fail assertion: scores 100 when the edited buffer matches
    /// `expected` exactly, ignoring empty lines; 0 otherwise.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Scores 100 when applying any one of `expected_diffs` to the original
    /// text reproduces the edited buffer (ignoring empty lines); 0 otherwise.
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// LLM-as-judge assertion: renders the sample's diff plus `assertions`
    /// into a prompt, streams the judge model's reply, and extracts a numeric
    /// `<score>` tag from it. Errors if no score tag is present.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                ..Default::default()
            };
            let mut response = judge
                .stream_completion_text(request, &cx.to_async())
                .await?;
            // Accumulate the streamed completion into a single string.
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output) {
                if let Some(score_match) = captures.get(1) {
                    // Unparseable digits degrade to a score of 0 rather than
                    // failing the whole iteration.
                    let score = score_match.as_str().parse().unwrap_or(0);
                    return Ok(EvalAssertionOutcome {
                        score,
                        message: Some(output),
                    });
                }
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Runs this assertion against `input`, returning the scored outcome.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1281
/// Runs `eval` for `iterations` samples concurrently on the background
/// executor and panics if the pass ratio (score >= 80 counts as a pass) falls
/// below `expected_pass_ratio`, or if more than 5% of edit parser tags were
/// mismatched across all samples.
fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval so that
    // all the next ones are cached.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    // Remaining iterations run concurrently; the first (cache-priming) run
    // above already consumed iteration 0.
    let executor = gpui::background_executor();
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        executor.spawn(async move { run_eval(eval, tx) }).detach();
    }
    // Drop the last local sender so `rx.recv()` returns Err once every
    // spawned iteration has reported.
    drop(tx);

    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                // Group failures by resulting buffer text so identical bad
                // outputs are reported once with a count.
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text_after.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        // Print errors first (most frequent first), then failures grouped by
        // identical output, then panic with the summary.
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    // NOTE(review): if no tags were parsed at all this ratio is NaN, the
    // comparison below is false, and the check is silently skipped —
    // presumably intended, but worth confirming.
    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > 0.05 {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}
1361
1362fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
1363    let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
1364    let mut cx = TestAppContext::build(dispatcher, None);
1365    let output = cx.executor().block_test(async {
1366        let test = EditAgentTest::new(&mut cx).await;
1367        test.eval(eval, &mut cx).await
1368    });
1369    tx.send(output).unwrap();
1370}
1371
/// Result of one eval iteration: the edited sample plus its graded outcome.
#[derive(Clone)]
struct EvalOutput {
    /// The buffer before/after texts, diff, and raw agent output.
    sample: EvalSample,
    /// The score and optional judge message produced by the assertion.
    assertion: EvalAssertionOutcome,
}
1377
1378impl Display for EvalOutput {
1379    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1380        writeln!(f, "Score: {:?}", self.assertion.score)?;
1381        if let Some(message) = self.assertion.message.as_ref() {
1382            writeln!(f, "Message: {}", message)?;
1383        }
1384
1385        writeln!(f, "Diff:\n{}", self.sample.diff)?;
1386
1387        writeln!(
1388            f,
1389            "Parser Metrics:\n{:#?}",
1390            self.sample.edit_output.parser_metrics
1391        )?;
1392        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1393        Ok(())
1394    }
1395}
1396
/// Overwrites the current terminal line with a progress summary
/// ("Evaluated X/Y (Z% passed)"), guarding against division by zero before
/// any iteration has finished.
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_count = evaluated_count - failed_count;
    let passed_ratio = match evaluated_count {
        0 => 0.0,
        total => passed_count as f64 / total as f64,
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}
1412
/// Shared harness for one eval run: the edit agent under test, the test
/// project it edits, and the model used to judge results.
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}
1418
1419impl EditAgentTest {
1420    async fn new(cx: &mut TestAppContext) -> Self {
1421        cx.executor().allow_parking();
1422
1423        let fs = FakeFs::new(cx.executor().clone());
1424        cx.update(|cx| {
1425            settings::init(cx);
1426            gpui_tokio::init(cx);
1427            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1428            cx.set_http_client(http_client);
1429
1430            client::init_settings(cx);
1431            let client = Client::production(cx);
1432            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1433
1434            settings::init(cx);
1435            Project::init_settings(cx);
1436            language::init(cx);
1437            language_model::init(client.clone(), cx);
1438            language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
1439            crate::init(client.http_client(), cx);
1440        });
1441
1442        fs.insert_tree("/root", json!({})).await;
1443        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1444        let agent_model = SelectedModel::from_str(
1445            &std::env::var("ZED_AGENT_MODEL")
1446                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1447        )
1448        .unwrap();
1449        let judge_model = SelectedModel::from_str(
1450            &std::env::var("ZED_JUDGE_MODEL")
1451                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1452        )
1453        .unwrap();
1454        let (agent_model, judge_model) = cx
1455            .update(|cx| {
1456                cx.spawn(async move |cx| {
1457                    let agent_model = Self::load_model(&agent_model, cx).await;
1458                    let judge_model = Self::load_model(&judge_model, cx).await;
1459                    (agent_model.unwrap(), judge_model.unwrap())
1460                })
1461            })
1462            .await;
1463        let action_log = cx.new(|_| ActionLog::new(project.clone()));
1464
1465        Self {
1466            agent: EditAgent::new(agent_model, project.clone(), action_log, Templates::new()),
1467            project,
1468            judge_model,
1469        }
1470    }
1471
    /// Resolves `selected_model` against the global language-model registry
    /// and authenticates its provider before returning the model handle.
    ///
    /// # Panics
    /// Panics if no registered model matches the selected provider/model ids,
    /// or if the matched model's provider is not registered.
    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        let (provider, model) = cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .expect("Model not found");
            let provider = models.provider(&model.provider_id()).unwrap();
            (provider, model)
        })?;
        // Ensure credentials are present before the eval issues requests.
        cx.update(|cx| provider.authenticate(cx))?.await?;
        Ok(model)
    }
1491
1492    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
1493        let path = self
1494            .project
1495            .read_with(cx, |project, cx| {
1496                project.find_project_path(eval.edit_file_input.path, cx)
1497            })
1498            .unwrap();
1499        let buffer = self
1500            .project
1501            .update(cx, |project, cx| project.open_buffer(path, cx))
1502            .await
1503            .unwrap();
1504        let tools = cx.update(|cx| {
1505            ToolRegistry::default_global(cx)
1506                .tools()
1507                .into_iter()
1508                .filter_map(|tool| {
1509                    let input_schema = tool
1510                        .input_schema(self.agent.model.tool_input_format())
1511                        .ok()?;
1512                    Some(LanguageModelRequestTool {
1513                        name: tool.name(),
1514                        description: tool.description(),
1515                        input_schema,
1516                    })
1517                })
1518                .collect::<Vec<_>>()
1519        });
1520        let tool_names = tools
1521            .iter()
1522            .map(|tool| tool.name.clone())
1523            .collect::<Vec<_>>();
1524        let worktrees = vec![WorktreeContext {
1525            root_name: "root".to_string(),
1526            rules_file: None,
1527        }];
1528        let prompt_builder = PromptBuilder::new(None)?;
1529        let project_context = ProjectContext::new(worktrees, Vec::default());
1530        let system_prompt = prompt_builder.generate_assistant_system_prompt(
1531            &project_context,
1532            &ModelContext {
1533                available_tools: tool_names,
1534            },
1535        )?;
1536
1537        let has_system_prompt = eval
1538            .conversation
1539            .first()
1540            .map_or(false, |msg| msg.role == Role::System);
1541        let messages = if has_system_prompt {
1542            eval.conversation
1543        } else {
1544            [LanguageModelRequestMessage {
1545                role: Role::System,
1546                content: vec![MessageContent::Text(system_prompt)],
1547                cache: true,
1548            }]
1549            .into_iter()
1550            .chain(eval.conversation)
1551            .collect::<Vec<_>>()
1552        };
1553
1554        let conversation = LanguageModelRequest {
1555            messages,
1556            tools,
1557            ..Default::default()
1558        };
1559
1560        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1561            if let Some(input_content) = eval.input_content.as_deref() {
1562                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1563            }
1564            let (edit_output, _) = self.agent.edit(
1565                buffer.clone(),
1566                eval.edit_file_input.display_description,
1567                &conversation,
1568                &mut cx.to_async(),
1569            );
1570            edit_output.await?
1571        } else {
1572            let (edit_output, _) = self.agent.overwrite(
1573                buffer.clone(),
1574                eval.edit_file_input.display_description,
1575                &conversation,
1576                &mut cx.to_async(),
1577            );
1578            edit_output.await?
1579        };
1580
1581        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1582        let sample = EvalSample {
1583            edit_output,
1584            diff: language::unified_diff(
1585                eval.input_content.as_deref().unwrap_or_default(),
1586                &buffer_text,
1587            ),
1588            text_before: eval.input_content.unwrap_or_default(),
1589            text_after: buffer_text,
1590        };
1591        let assertion = eval
1592            .assertion
1593            .run(&sample, self.judge_model.clone(), cx)
1594            .await?;
1595
1596        Ok(EvalOutput { assertion, sample })
1597    }
1598}
1599
/// Scored result of running a single eval assertion against a sample.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    // Numeric score assigned by the assertion. The scale isn't established
    // in this chunk — presumably 0..=100; verify against the judge template.
    score: usize,
    // Optional free-form explanation accompanying the score.
    message: Option<String>,
}
1605
/// Data serialized into the diff-judge Handlebars template: the unified diff
/// produced by an eval run, plus the static assertion text the judge model is
/// asked to check the diff against.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}
1611
impl Template for DiffJudgeTemplate {
    // Handlebars template file this struct's fields are rendered into.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1615
/// Returns `text` with every blank or whitespace-only line removed, joining
/// the remaining lines with single `\n` separators (no trailing newline).
fn strip_empty_lines(text: &str) -> String {
    let mut stripped = String::with_capacity(text.len());
    for line in text.lines() {
        if line.trim().is_empty() {
            continue;
        }
        if !stripped.is_empty() {
            stripped.push('\n');
        }
        stripped.push_str(line);
    }
    stripped
}