1use super::*;
2use crate::{
3 ReadFileToolInput,
4 edit_file_tool::{EditFileMode, EditFileToolInput},
5 grep_tool::GrepToolInput,
6 list_directory_tool::ListDirectoryToolInput,
7};
8use Role::*;
9use assistant_tool::ToolRegistry;
10use client::{Client, UserStore};
11use collections::HashMap;
12use fs::FakeFs;
13use futures::{FutureExt, future::LocalBoxFuture};
14use gpui::{AppContext, TestAppContext};
15use indoc::{formatdoc, indoc};
16use language_model::{
17 LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
18 LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
19};
20use project::Project;
21use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
22use rand::prelude::*;
23use reqwest_client::ReqwestClient;
24use serde_json::json;
25use std::{
26 cmp::Reverse,
27 fmt::{self, Display},
28 io::Write as _,
29 str::FromStr,
30 sync::mpsc,
31};
32use util::path;
33
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // Model             | Pass rate
    // ------------------|----------
    // claude-3.7-sonnet | 0.98
    // gemini-2.5-pro    | 0.86
    // gemini-2.5-flash  | 0.11
    // gpt-4.1           | 1.00

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // Several placements/formattings of the extracted method are acceptable;
    // a sample passes if it matches any one of these diffs.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100, // iterations
        0.7, // Taking the lower bar for Gemini
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                // Pre-recorded tool round: the agent reads the whole file first.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // The final edit_file tool use is what the eval harness replays.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
108
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Checks the agent can delete a single function without touching its
    // call sites; graded by exact equality with the `after.rs` fixture.
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}
162
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    // Broad rewrite across the whole file; no single "after" fixture exists,
    // so the result is graded by an LLM judge on the produced diff.
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200, // iterations
        1.,  // expected pass ratio
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}
215
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // The agent only ever sees three windowed chunks of the file (lines
    // 971-1150), mimicking how a model explores a large file, before it
    // issues the edit. Graded by an LLM judge on the produced diff.
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    // Fix: the asset list previously repeated the
                    // x86_64-linux and arm64-linux entries; each asset is
                    // now listed exactly once.
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}
331
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // The agent first "researches" via a grep tool round whose result is a
    // synthetic concatenation of several regions of the fixture, then is asked
    // to comment out BlinkManager interactions. Graded by an LLM judge.
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        // Simulated grep output: disjoint slices of the fixture
                        // joined with the grep tool's match separator.
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
        ),
    );
}
405
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // The conversation includes grep rounds that fail to find an existing
    // `mod tests` in the target file, steering the model to add both the
    // constructor and a brand-new test module. Graded by an LLM judge.
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    // Canned grep output showing how sibling files structure
                    // their test modules, which the model should imitate.
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_weight() {
                                // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_stretch() {
                                // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                            }

                            #[test]
                            fn test_css_to_core_text_font_stretch() {
                                // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}
596
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    // File-creation eval: the agent must write a brand-new script. The custom
    // assertion only checks the created file doesn't start with stray
    // whitespace, a code fence, or a blank line.
    let input_file_path = "root/zode.py";
    let input_content = None; // the file does not exist yet
    let edit_description = "Create the main Zode CLI script";
    eval(
        200, // iterations
        1.,  // expected pass ratio
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                // The agent reads two reference files in a single turn.
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                // Characters that indicate the model leaked formatting or
                // prose into the created file.
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}
693
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    // The first read_file round returns a symbol outline instead of file
    // contents, so the agent must issue targeted line-range reads before
    // editing. Graded by an LLM judge on the produced diff.
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200, // iterations
        0.5, // TODO: make this eval better
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    // Canned outline of the fixture, as produced by the
                    // read_file tool for large files.
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                             struct HunkStatus [L1538-1542]
                              range [L1539]
                              diff_status [L1540]
                              old_text [L1541]
                             fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
915
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    //
    // Model                          | Pass rate
    // ============================================
    //
    // --------------------------------------------
    // Prompt version: 2025-05-21
    // --------------------------------------------
    //
    // claude-3.7-sonnet              | 1.00
    // gemini-2.5-pro-preview-03-25   | 1.00
    // gemini-2.5-flash-preview-04-17 | 1.00
    // gpt-4.1                        | 1.00
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    let input_file_content = None; // the file does not exist yet
    let expected_output_content = String::new(); // must stay completely empty
    eval(
        100,  // iterations
        0.99, // expected pass ratio
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
997
998fn message(
999 role: Role,
1000 contents: impl IntoIterator<Item = MessageContent>,
1001) -> LanguageModelRequestMessage {
1002 LanguageModelRequestMessage {
1003 role,
1004 content: contents.into_iter().collect(),
1005 cache: false,
1006 }
1007}
1008
1009fn text(text: impl Into<String>) -> MessageContent {
1010 MessageContent::Text(text.into())
1011}
1012
/// Returns the lines of `input` whose zero-based indices fall in `range`,
/// joined with `\n` (no trailing newline). An empty or inverted range
/// yields an empty string.
fn lines(input: &str, range: Range<usize>) -> String {
    let mut selected = Vec::new();
    for (index, line) in input.lines().enumerate() {
        if index >= range.end {
            break;
        }
        if index >= range.start {
            selected.push(line);
        }
    }
    selected.join("\n")
}
1021
1022fn tool_use(
1023 id: impl Into<Arc<str>>,
1024 name: impl Into<Arc<str>>,
1025 input: impl Serialize,
1026) -> MessageContent {
1027 MessageContent::ToolUse(LanguageModelToolUse {
1028 id: LanguageModelToolUseId::from(id.into()),
1029 name: name.into(),
1030 raw_input: serde_json::to_string_pretty(&input).unwrap(),
1031 input: serde_json::to_value(input).unwrap(),
1032 is_input_complete: true,
1033 })
1034}
1035
1036fn tool_result(
1037 id: impl Into<Arc<str>>,
1038 name: impl Into<Arc<str>>,
1039 result: impl Into<Arc<str>>,
1040) -> MessageContent {
1041 MessageContent::ToolResult(LanguageModelToolResult {
1042 tool_use_id: LanguageModelToolUseId::from(id.into()),
1043 tool_name: name.into(),
1044 is_error: false,
1045 content: LanguageModelToolResultContent::Text(result.into()),
1046 output: None,
1047 })
1048}
1049
/// A single eval scenario: a recorded conversation that ends in an
/// `edit_file` tool use, plus the assertion used to grade the result.
#[derive(Clone)]
struct EvalInput {
    // Full request history replayed to the model under test.
    conversation: Vec<LanguageModelRequestMessage>,
    // Parsed input of the final `edit_file` tool use in `conversation`.
    edit_file_input: EditFileToolInput,
    // File contents before the edit; `None` when the file is being created.
    input_content: Option<String>,
    // How to score the sample produced by the edit agent.
    assertion: EvalAssertion,
}
1057
1058impl EvalInput {
1059 fn from_conversation(
1060 conversation: Vec<LanguageModelRequestMessage>,
1061 input_content: Option<String>,
1062 assertion: EvalAssertion,
1063 ) -> Self {
1064 let msg = conversation.last().expect("Conversation must not be empty");
1065 if msg.role != Role::Assistant {
1066 panic!("Conversation must end with an assistant message");
1067 }
1068 let tool_use = msg
1069 .content
1070 .iter()
1071 .flat_map(|content| match content {
1072 MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1073 Some(tool_use)
1074 }
1075 _ => None,
1076 })
1077 .next()
1078 .expect("Conversation must end with an edit_file tool use")
1079 .clone();
1080
1081 let edit_file_input: EditFileToolInput =
1082 serde_json::from_value(tool_use.input.clone()).unwrap();
1083
1084 EvalInput {
1085 conversation,
1086 edit_file_input,
1087 input_content,
1088 assertion,
1089 }
1090 }
1091}
1092
/// The outcome of running the edit agent once on an `EvalInput`.
#[derive(Clone)]
struct EvalSample {
    // Buffer text before the agent's edits were applied.
    text_before: String,
    // Buffer text after the agent's edits were applied.
    text_after: String,
    // Raw edit-agent output, including parser metrics aggregated by `eval`.
    edit_output: EditAgentOutput,
    // Rendered diff of the change; fed to the LLM judge in `judge_diff`.
    diff: String,
}
1100
/// Object-safe wrapper around an async grading function, so `EvalAssertion`
/// can hold any assertion closure behind an `Arc<dyn AssertionFn>`.
trait AssertionFn: 'static + Send + Sync {
    // Grades `sample`, optionally consulting `judge_model`; returns a boxed
    // non-Send future because it runs on the test's local executor.
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1109
// Blanket implementation: any async closure with the matching signature can
// serve as an assertion without a hand-written adapter type.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        // Box the future locally so the trait can remain object-safe.
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1130
/// A cloneable, type-erased grading function applied to each eval sample.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1133
impl EvalAssertion {
    /// Wraps an arbitrary async grading closure into an `EvalAssertion`.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Scores 100 iff the edited text equals `expected` after both sides are
    /// run through `strip_empty_lines`; otherwise scores 0.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Scores 100 iff applying any one of `expected_diffs` to the original
    /// text reproduces the edited text (compared via `strip_empty_lines`).
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// Asks the judge model to grade the sample's diff against the given
    /// natural-language assertions; the judge's reply must contain a
    /// `<score>N</score>` tag, which becomes the outcome's score.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                ..Default::default()
            };
            // Stream and accumulate the judge's full text response.
            let mut response = judge
                .stream_completion_text(request, &cx.to_async())
                .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output) {
                if let Some(score_match) = captures.get(1) {
                    // A malformed number falls back to a failing score of 0.
                    let score = score_match.as_str().parse().unwrap_or(0);
                    return Ok(EvalAssertionOutcome {
                        score,
                        message: Some(output),
                    });
                }
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Runs this assertion against a single sample.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1230
/// Runs `eval` `iterations` times (one warm-up run synchronously, the rest in
/// parallel on the background executor) and panics if the fraction of passing
/// runs falls below `expected_pass_ratio`, or if more than 5% of edit tags
/// were mismatched during parsing.
fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval so that
    // all the next ones are cached.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    // Run the remaining iterations concurrently on the background executor.
    let executor = gpui::background_executor();
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        executor.spawn(async move { run_eval(eval, tx) }).detach();
    }
    // Drop the original sender so `rx.recv()` disconnects once every spawned
    // eval has finished and dropped its clone.
    drop(tx);

    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                // A judge score below 80 counts as a failure; group failures by
                // the resulting buffer text so identical outcomes report once.
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text_after.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                // Hard errors (as opposed to low scores) are tallied by their
                // Debug representation.
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        // Report the most frequent errors first.
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        // Print one representative output per distinct failing buffer state,
        // most frequent first.
        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    // Even when the pass ratio is acceptable, fail if more than 5% of the
    // parsed edit tags were mismatched.
    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > 0.05 {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}
1310
1311fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
1312 let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
1313 let mut cx = TestAppContext::build(dispatcher, None);
1314 let output = cx.executor().block_test(async {
1315 let test = EditAgentTest::new(&mut cx).await;
1316 test.eval(eval, &mut cx).await
1317 });
1318 tx.send(output).unwrap();
1319}
1320
/// The result of a single eval run: the produced sample plus the judge's
/// verdict on it.
#[derive(Clone)]
struct EvalOutput {
    /// The evaluated sample (edit output, diff, and before/after buffer text).
    sample: EvalSample,
    /// The assertion outcome (score and optional judge message).
    assertion: EvalAssertionOutcome,
}
1326
1327impl Display for EvalOutput {
1328 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1329 writeln!(f, "Score: {:?}", self.assertion.score)?;
1330 if let Some(message) = self.assertion.message.as_ref() {
1331 writeln!(f, "Message: {}", message)?;
1332 }
1333
1334 writeln!(f, "Diff:\n{}", self.sample.diff)?;
1335
1336 writeln!(
1337 f,
1338 "Parser Metrics:\n{:#?}",
1339 self.sample.edit_output.parser_metrics
1340 )?;
1341 writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1342 Ok(())
1343 }
1344}
1345
/// Prints a single-line progress report (`\r` + clear-to-end-of-line escape
/// overwrites the previous report in place) showing how many evals have run
/// out of `iterations` and the percentage of them that passed so far.
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    // `saturating_sub` guards against underflow (a debug-build panic or a
    // release-build wraparound) if a caller ever reports more failures than
    // evaluations.
    let passed_count = evaluated_count.saturating_sub(failed_count);
    let passed_ratio = if evaluated_count == 0 {
        // Avoid 0/0 before the first eval completes.
        0.0
    } else {
        passed_count as f64 / evaluated_count as f64
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    // `print!` doesn't flush, so force the partial line to appear immediately.
    std::io::stdout().flush().unwrap();
}
1361
/// Test harness bundling the edit agent under evaluation, the project it
/// operates on, and the model used to judge its output.
struct EditAgentTest {
    /// The edit agent whose output is being evaluated.
    agent: EditAgent,
    /// The (fake-filesystem-backed) project the agent edits.
    project: Entity<Project>,
    /// The language model used to score eval samples.
    judge_model: Arc<dyn LanguageModel>,
}
1367
1368impl EditAgentTest {
1369 async fn new(cx: &mut TestAppContext) -> Self {
1370 cx.executor().allow_parking();
1371
1372 let fs = FakeFs::new(cx.executor().clone());
1373 cx.update(|cx| {
1374 settings::init(cx);
1375 gpui_tokio::init(cx);
1376 let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1377 cx.set_http_client(http_client);
1378
1379 client::init_settings(cx);
1380 let client = Client::production(cx);
1381 let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1382
1383 settings::init(cx);
1384 Project::init_settings(cx);
1385 language::init(cx);
1386 language_model::init(client.clone(), cx);
1387 language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
1388 crate::init(client.http_client(), cx);
1389 });
1390
1391 fs.insert_tree("/root", json!({})).await;
1392 let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1393 let agent_model = SelectedModel::from_str(
1394 &std::env::var("ZED_AGENT_MODEL")
1395 .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1396 )
1397 .unwrap();
1398 let judge_model = SelectedModel::from_str(
1399 &std::env::var("ZED_JUDGE_MODEL")
1400 .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1401 )
1402 .unwrap();
1403 let (agent_model, judge_model) = cx
1404 .update(|cx| {
1405 cx.spawn(async move |cx| {
1406 let agent_model = Self::load_model(&agent_model, cx).await;
1407 let judge_model = Self::load_model(&judge_model, cx).await;
1408 (agent_model.unwrap(), judge_model.unwrap())
1409 })
1410 })
1411 .await;
1412 let action_log = cx.new(|_| ActionLog::new(project.clone()));
1413
1414 Self {
1415 agent: EditAgent::new(agent_model, project.clone(), action_log, Templates::new()),
1416 project,
1417 judge_model,
1418 }
1419 }
1420
1421 async fn load_model(
1422 selected_model: &SelectedModel,
1423 cx: &mut AsyncApp,
1424 ) -> Result<Arc<dyn LanguageModel>> {
1425 let (provider, model) = cx.update(|cx| {
1426 let models = LanguageModelRegistry::read_global(cx);
1427 let model = models
1428 .available_models(cx)
1429 .find(|model| {
1430 model.provider_id() == selected_model.provider
1431 && model.id() == selected_model.model
1432 })
1433 .unwrap();
1434 let provider = models.provider(&model.provider_id()).unwrap();
1435 (provider, model)
1436 })?;
1437 cx.update(|cx| provider.authenticate(cx))?.await?;
1438 Ok(model)
1439 }
1440
1441 async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
1442 let path = self
1443 .project
1444 .read_with(cx, |project, cx| {
1445 project.find_project_path(eval.edit_file_input.path, cx)
1446 })
1447 .unwrap();
1448 let buffer = self
1449 .project
1450 .update(cx, |project, cx| project.open_buffer(path, cx))
1451 .await
1452 .unwrap();
1453 let tools = cx.update(|cx| {
1454 ToolRegistry::default_global(cx)
1455 .tools()
1456 .into_iter()
1457 .filter_map(|tool| {
1458 let input_schema = tool
1459 .input_schema(self.agent.model.tool_input_format())
1460 .ok()?;
1461 Some(LanguageModelRequestTool {
1462 name: tool.name(),
1463 description: tool.description(),
1464 input_schema,
1465 })
1466 })
1467 .collect::<Vec<_>>()
1468 });
1469 let tool_names = tools
1470 .iter()
1471 .map(|tool| tool.name.clone())
1472 .collect::<Vec<_>>();
1473 let worktrees = vec![WorktreeContext {
1474 root_name: "root".to_string(),
1475 rules_file: None,
1476 }];
1477 let prompt_builder = PromptBuilder::new(None)?;
1478 let project_context = ProjectContext::new(worktrees, Vec::default());
1479 let system_prompt = prompt_builder.generate_assistant_system_prompt(
1480 &project_context,
1481 &ModelContext {
1482 available_tools: tool_names,
1483 },
1484 )?;
1485
1486 let has_system_prompt = eval
1487 .conversation
1488 .first()
1489 .map_or(false, |msg| msg.role == Role::System);
1490 let messages = if has_system_prompt {
1491 eval.conversation
1492 } else {
1493 [LanguageModelRequestMessage {
1494 role: Role::System,
1495 content: vec![MessageContent::Text(system_prompt)],
1496 cache: true,
1497 }]
1498 .into_iter()
1499 .chain(eval.conversation)
1500 .collect::<Vec<_>>()
1501 };
1502
1503 let conversation = LanguageModelRequest {
1504 messages,
1505 tools,
1506 ..Default::default()
1507 };
1508
1509 let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1510 if let Some(input_content) = eval.input_content.as_deref() {
1511 buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1512 }
1513 let (edit_output, _) = self.agent.edit(
1514 buffer.clone(),
1515 eval.edit_file_input.display_description,
1516 &conversation,
1517 &mut cx.to_async(),
1518 );
1519 edit_output.await?
1520 } else {
1521 let (edit_output, _) = self.agent.overwrite(
1522 buffer.clone(),
1523 eval.edit_file_input.display_description,
1524 &conversation,
1525 &mut cx.to_async(),
1526 );
1527 edit_output.await?
1528 };
1529
1530 let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1531 let sample = EvalSample {
1532 edit_output,
1533 diff: language::unified_diff(
1534 eval.input_content.as_deref().unwrap_or_default(),
1535 &buffer_text,
1536 ),
1537 text_before: eval.input_content.unwrap_or_default(),
1538 text_after: buffer_text,
1539 };
1540 let assertion = eval
1541 .assertion
1542 .run(&sample, self.judge_model.clone(), cx)
1543 .await?;
1544
1545 Ok(EvalOutput { assertion, sample })
1546 }
1547}
1548
/// The judge's verdict for one eval sample.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    /// Score assigned to the sample; scores below 80 are treated as failures
    /// by the eval harness.
    score: usize,
    /// Optional explanatory text accompanying the score (e.g. the judge
    /// model's raw response).
    message: Option<String>,
}
1554
/// Template data for rendering the judge prompt that asks a model to score a
/// diff against a set of assertions.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    /// The unified diff produced by the agent's edits.
    diff: String,
    /// The assertions the judge should evaluate the diff against.
    assertions: &'static str,
}
1560
impl Template for DiffJudgeTemplate {
    // Handlebars template file used to render the judge prompt.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1564
/// Returns `text` with every blank (empty or whitespace-only) line removed,
/// joining the remaining lines with `\n`. Any trailing newline is dropped.
fn strip_empty_lines(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    for line in text.lines() {
        if line.trim().is_empty() {
            continue;
        }
        if !result.is_empty() {
            result.push('\n');
        }
        result.push_str(line);
    }
    result
}