use super::*;
use crate::{
    ReadFileToolInput,
    edit_file_tool::{EditFileMode, EditFileToolInput},
    grep_tool::GrepToolInput,
    list_directory_tool::ListDirectoryToolInput,
};
use Role::*;
use assistant_tool::ToolRegistry;
use client::{Client, UserStore};
use collections::HashMap;
use fs::FakeFs;
use futures::{FutureExt, future::LocalBoxFuture};
use gpui::{AppContext, TestAppContext};
use indoc::{formatdoc, indoc};
use language_model::{
    LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
    LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
};
use project::Project;
use rand::prelude::*;
use reqwest_client::ReqwestClient;
use serde_json::json;
use std::{
    cmp::Reverse,
    fmt::{self, Display},
    io::Write as _,
    str::FromStr,
    sync::mpsc,
};
use util::path;

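// The evals below drive the edit agent against a live language model, so they
// only run when the crate is built with the `eval` feature. The agent and
// judge models default to Claude Sonnet and can be overridden through the
// `ZED_AGENT_MODEL` and `ZED_JUDGE_MODEL` environment variables (see
// `EditAgentTest::new`).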
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    let output_file_content = include_str!("evals/fixtures/extract_handle_command_output/after.rs");
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200,
        1.,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to Italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for the current platform and architecture
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comment out everything inside them (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_weight() {
                                // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_stretch() {
                                // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                            }

                            #[test]
                            fn test_css_to_core_text_font_stretch() {
                                // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    let input_file_path = "root/zode.py";
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        200,
        1.,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                             struct HunkStatus [L1538-1542]
                              range [L1539]
                              diff_status [L1540]
                              old_text [L1541]
                             fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}

#[test]
#[ignore] // until we figure out the mystery described in the comments
// #[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // NOTE: For some mysterious reason, I could easily reproduce this
    // issue roughly 90% of the time in actual Zed. However, once I
    // extract the exact LLM request before the failure point and
    // generate from that, the reproduction rate drops to 2%!
    //
    // Things I've tried to make sure it's not a fluke: disabling prompt
    // caching, capturing the LLM request via a proxy server, and running
    // the prompt on Claude separately from the evals. Every time the
    // outcomes were mostly good, which doesn't match my actual
    // experience in Zed.
    //
    // At some point I discovered that simply adding one insignificant
    // space or newline to the prompt suddenly reproduces the failure
    // almost perfectly.
    //
    // This weirdness happens even outside of the Zed code base and even
    // when using a different subscription. The result is the same: an
    // extra newline or space changes the model behavior enough that the
    // pass rate drops from 99% to 0-3%.
    //
    // I have no explanation for this.
    //
    // Model                          | Pass rate
    // ============================================
    //
    // --------------------------------------------
    // Prompt version: 2025-05-19
    // --------------------------------------------
    //
    // claude-3.7-sonnet              | 0.98
    //   + one extra space in prompt  | 0.00
    //   + original prompt again      | 0.99
    //   + extra newline              | 0.03
    // gemini-2.5-pro-preview-03-25   | 1.00
    // gemini-2.5-flash-preview-04-17 | 1.00
    //   + one extra space            | 1.00
    // gpt-4.1                        | 1.00
    //   + one extra space            | 1.00
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        1,
        1.0,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}

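/// Builds a request message with the given role and content parts. Caching is
/// disabled here; `eval` enables it on the conversation's final message before
/// fanning out iterations.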
fn message(
    role: Role,
    contents: impl IntoIterator<Item = MessageContent>,
) -> LanguageModelRequestMessage {
    LanguageModelRequestMessage {
        role,
        content: contents.into_iter().collect(),
        cache: false,
    }
}

fn text(text: impl Into<String>) -> MessageContent {
    MessageContent::Text(text.into())
}

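/// Returns the given half-open, zero-based range of lines from `input`, joined
/// with newlines. For example, `lines("a\nb\nc", 1..3)` returns `"b\nc"`.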
fn lines(input: &str, range: Range<usize>) -> String {
    input
        .lines()
        .skip(range.start)
        .take(range.len())
        .collect::<Vec<_>>()
        .join("\n")
}

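/// Builds a `ToolUse` content part, serializing `input` both as the pretty
/// JSON string a model would have streamed and as a structured value.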
fn tool_use(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    input: impl Serialize,
) -> MessageContent {
    MessageContent::ToolUse(LanguageModelToolUse {
        id: LanguageModelToolUseId::from(id.into()),
        name: name.into(),
        raw_input: serde_json::to_string_pretty(&input).unwrap(),
        input: serde_json::to_value(input).unwrap(),
        is_input_complete: true,
    })
}

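/// Builds a successful (non-error) text `ToolResult` content part for the tool
/// use with the given id.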
fn tool_result(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    result: impl Into<Arc<str>>,
) -> MessageContent {
    MessageContent::ToolResult(LanguageModelToolResult {
        tool_use_id: LanguageModelToolUseId::from(id.into()),
        tool_name: name.into(),
        is_error: false,
        content: LanguageModelToolResultContent::Text(result.into()),
        output: None,
    })
}

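/// A scripted conversation that ends in an `edit_file` tool use, together with
/// the initial file content and the assertion used to grade the agent's edit.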
#[derive(Clone)]
struct EvalInput {
    conversation: Vec<LanguageModelRequestMessage>,
    edit_file_input: EditFileToolInput,
    input_content: Option<String>,
    assertion: EvalAssertion,
}

impl EvalInput {
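    /// Extracts the trailing `edit_file` tool use from the conversation,
    /// panicking if the conversation doesn't end with an assistant message
    /// that contains one.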
    fn from_conversation(
        conversation: Vec<LanguageModelRequestMessage>,
        input_content: Option<String>,
        assertion: EvalAssertion,
    ) -> Self {
        let msg = conversation.last().expect("Conversation must not be empty");
        if msg.role != Role::Assistant {
            panic!("Conversation must end with an assistant message");
        }
        let tool_use = msg
            .content
            .iter()
            .find_map(|content| match content {
                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
                    Some(tool_use)
                }
                _ => None,
            })
            .expect("Conversation must end with an edit_file tool use")
            .clone();

        let edit_file_input: EditFileToolInput =
            serde_json::from_value(tool_use.input.clone()).unwrap();

        EvalInput {
            conversation,
            edit_file_input,
            input_content,
            assertion,
        }
    }
}

#[derive(Clone)]
struct EvalSample {
    text: String,
    edit_output: EditAgentOutput,
    diff: String,
}

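/// Object-safe assertion interface. The blanket impl below adapts any suitable
/// `AsyncFn` closure, which is what `EvalAssertion::new` stores.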
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}

impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}

#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);

impl EvalAssertion {
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

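    /// Scores 100 if the edited buffer matches `expected` exactly (ignoring
    /// blank lines), and 0 otherwise.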
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

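    /// Renders the sample's diff and the given assertions into the diff-judge
    /// prompt, streams the judge model's response, and parses the score out of
    /// a `<score>N</score>` tag.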
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                ..Default::default()
            };
            let mut response = judge
                .stream_completion_text(request, &cx.to_async())
                .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output) {
                if let Some(score_match) = captures.get(1) {
                    let score = score_match.as_str().parse().unwrap_or(0);
                    return Ok(EvalAssertionOutcome {
                        score,
                        message: Some(output),
                    });
                }
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}

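/// Runs the eval for `iterations` samples and panics if the pass ratio (a
/// score of at least 80 counts as a pass) falls below `expected_pass_ratio`,
/// or if more than 5% of the parsed edit tags were mismatched. For example,
/// `eval(100, 0.95, ...)` requires at least 95 of 100 samples to pass. The
/// first iteration runs alone so the remaining, concurrent iterations can
/// reuse its cached prompt prefix.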
fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval so that
    // all the next ones are cached.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    let executor = gpui::background_executor();
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        executor.spawn(async move { run_eval(eval, tx) }).detach();
    }
    drop(tx);

    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > 0.05 {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}

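/// Runs a single eval sample on a fresh `TestAppContext` and reports the
/// outcome (or error) over the channel.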
fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
    let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
    let mut cx = TestAppContext::build(dispatcher, None);
    let output = cx.executor().block_test(async {
        let test = EditAgentTest::new(&mut cx).await;
        test.eval(eval, &mut cx).await
    });
    tx.send(output).unwrap();
}

#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}

impl Display for EvalOutput {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Score: {:?}", self.assertion.score)?;
        if let Some(message) = self.assertion.message.as_ref() {
            writeln!(f, "Message: {}", message)?;
        }

        writeln!(f, "Diff:\n{}", self.sample.diff)?;

        writeln!(
            f,
            "Parser Metrics:\n{:#?}",
            self.sample.edit_output.parser_metrics
        )?;
        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
        Ok(())
    }
}

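/// Redraws a single-line progress indicator showing the running pass rate.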
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_count = evaluated_count - failed_count;
    let passed_ratio = if evaluated_count == 0 {
        0.0
    } else {
        passed_count as f64 / evaluated_count as f64
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}

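/// Harness for a single eval run: a real `EditAgent` wired up to a project on
/// a fake filesystem, plus the model used to judge the results.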
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}

impl EditAgentTest {
    async fn new(cx: &mut TestAppContext) -> Self {
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor().clone());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);

            client::init_settings(cx);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

            settings::init(cx);
            Project::init_settings(cx);
            language::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
            crate::init(client.http_client(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        Self {
            agent: EditAgent::new(agent_model, project.clone(), action_log, Templates::new()),
            project,
            judge_model,
        }
    }

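    /// Resolves a `SelectedModel` against the global model registry and
    /// authenticates its provider before returning it.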
    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        let (provider, model) = cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .unwrap();
            let provider = models.provider(&model.provider_id()).unwrap();
            (provider, model)
        })?;
        cx.update(|cx| provider.authenticate(cx))?.await?;
        Ok(model)
    }

    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();
        let conversation = LanguageModelRequest {
            messages: eval.conversation,
            tools: cx.update(|cx| {
                ToolRegistry::default_global(cx)
                    .tools()
                    .into_iter()
                    .filter_map(|tool| {
                        let input_schema = tool
                            .input_schema(self.agent.model.tool_input_format())
                            .ok()?;
                        Some(LanguageModelRequestTool {
                            name: tool.name(),
                            description: tool.description(),
                            input_schema,
                        })
                    })
                    .collect()
            }),
            ..Default::default()
        };
        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            let (edit_output, _) = self.agent.edit(
                buffer.clone(),
                eval.edit_file_input.display_description,
                &conversation,
                &mut cx.to_async(),
            );
            edit_output.await?
        } else {
            let (edit_output, _) = self.agent.overwrite(
                buffer.clone(),
                eval.edit_file_input.display_description,
                &conversation,
                &mut cx.to_async(),
            );
            edit_output.await?
        };

        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
}

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}

#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}

impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}

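/// Drops empty and whitespace-only lines so `assert_eq` comparisons aren't
/// sensitive to blank-line differences.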
fn strip_empty_lines(text: &str) -> String {
    text.lines()
        .filter(|line| !line.trim().is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}