use super::*;
use crate::{
    ReadFileToolInput,
    edit_file_tool::{EditFileMode, EditFileToolInput},
    grep_tool::GrepToolInput,
    list_directory_tool::ListDirectoryToolInput,
};
use Role::*;
use anyhow::anyhow;
use assistant_tool::ToolRegistry;
use client::{Client, UserStore};
use collections::HashMap;
use fs::FakeFs;
use futures::{FutureExt, future::LocalBoxFuture};
use gpui::{AppContext, TestAppContext};
use indoc::{formatdoc, indoc};
use language_model::{
    LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
    LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
};
use project::Project;
use rand::prelude::*;
use reqwest_client::ReqwestClient;
use serde_json::json;
use std::{
    cmp::Reverse,
    fmt::{self, Display},
    io::Write as _,
    str::FromStr,
    sync::mpsc,
};
use util::path;

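// These evals are ignored unless the `eval` feature is enabled, so they are
// typically run with something like `cargo test --features eval`. The agent
// and judge models default to Claude 3.7 Sonnet and can be overridden via the
// ZED_AGENT_MODEL and ZED_JUDGE_MODEL environment variables (see
// `EditAgentTest::new` below).
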
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    let output_file_content = include_str!("evals/fixtures/extract_handle_command_output/after.rs");
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200,
        1.,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comment everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_weight() {
                                // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_stretch() {
                                // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                            }

                            #[test]
                            fn test_css_to_core_text_font_stretch() {
                                // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    let input_file_path = "root/zode.py";
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        200,
        1.,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                                tracked_buffers [L15]
                                edited_since_project_diagnostics_check [L17]
                                project [L19]
                            impl ActionLog [L22-498]
                                pub fn new [L24-30]
                                pub fn project [L32-34]
                                pub fn checked_project_diagnostics [L37-39]
                                pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                                fn track_buffer_internal [L46-101]
                                fn handle_buffer_event [L103-116]
                                fn handle_buffer_edited [L118-123]
                                fn handle_buffer_file_changed [L125-158]
                                async fn maintain_diff [L160-264]
                                pub fn buffer_read [L267-269]
                                pub fn buffer_created [L272-276]
                                pub fn buffer_edited [L279-287]
                                pub fn will_delete_buffer [L289-304]
                                pub fn keep_edits_in_range [L306-364]
                                pub fn reject_edits_in_ranges [L366-459]
                                pub fn keep_all_edits [L461-473]
                                pub fn changed_buffers [L476-482]
                                pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                                User [L618]
                                Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                                Created [L624]
                                Modified [L625]
                                Deleted [L626]
                            struct TrackedBuffer [L629-641]
                                buffer [L630]
                                base_text [L631]
                                unreviewed_changes [L632]
                                status [L633]
                                version [L634]
                                diff [L635]
                                snapshot [L636]
                                diff_update [L637]
                                _open_lsp_handle [L638]
                                _maintain_diff [L639]
                                _subscription [L640]
                            impl TrackedBuffer [L643-657]
                                fn has_changes [L644-650]
                                fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                                pub diff [L660]
                            mod tests [L664-1574]
                                fn init_logger [L678-682]
                                fn init_test [L684-691]
                                async fn test_keep_edits [L694-769]
                                async fn test_deletions [L772-854]
                                async fn test_overlapping_user_edits [L857-951]
                                async fn test_creating_files [L954-1010]
                                async fn test_deleting_files [L1013-1120]
                                async fn test_reject_edits [L1123-1255]
                                async fn test_reject_multiple_edits [L1258-1331]
                                async fn test_reject_deleted_file [L1334-1388]
                                async fn test_reject_created_file [L1391-1443]
                                async fn test_random_diffs [L1446-1535]
                                    fn quiesce [L1510-1534]
                                struct HunkStatus [L1538-1542]
                                    range [L1539]
                                    diff_status [L1540]
                                    old_text [L1541]
                                fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}

#[test]
#[ignore] // until we figure out the mystery described in the comments
// #[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that the edit agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // NOTE: For some mysterious reason, I could easily reproduce this
    // issue roughly 90% of the time in actual Zed. However, once I
    // extract the exact LLM request before the failure point and
    // generate from that, the reproduction rate drops to 2%!
    //
    // Things I've tried to make sure it's not a fluke: disabling prompt
    // caching, capturing the LLM request via a proxy server, running the
    // prompt on Claude separately from evals. Every time it was mostly
    // giving good outcomes, which doesn't match my actual experience in
    // Zed.
    //
    // At some point I discovered that simply adding one insignificant
    // space or newline to the prompt suddenly reproduces the outcome I
    // was chasing almost perfectly.
    //
    // This weirdness happens even outside of the Zed code base and even
    // when using a different subscription. The result is the same: an
    // extra newline or space changes the model's behavior significantly
    // enough that the pass rate drops from 99% to 0-3%.
    //
    // I have no explanation for this.
    //
    // Model                            | Pass rate
    // ============================================
    //
    // --------------------------------------------
    // Prompt version: 2025-05-19
    // --------------------------------------------
    //
    // claude-3.7-sonnet                | 0.98
    //   + one extra space in prompt    | 0.00
    //   + original prompt again        | 0.99
    //   + extra newline                | 0.03
    // gemini-2.5-pro-preview-03-25     | 1.00
    // gemini-2.5-flash-preview-04-17   | 1.00
    //   + one extra space              | 1.00
    // gpt-4.1                          | 1.00
    //   + one extra space              | 1.00
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        1,
        1.0,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}

fn message(
    role: Role,
    contents: impl IntoIterator<Item = MessageContent>,
) -> LanguageModelRequestMessage {
    LanguageModelRequestMessage {
        role,
        content: contents.into_iter().collect(),
        cache: false,
    }
}

fn text(text: impl Into<String>) -> MessageContent {
    MessageContent::Text(text.into())
}

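/// Returns the zero-based, half-open `range` of lines from `input`, joined
/// with newlines. Note that the conversations above pass the same numbers to
/// `read_file` (which treats them as 1-based) and to this helper, so the
/// slices are off by one line; presumably that is close enough for these
/// evals.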
fn lines(input: &str, range: Range<usize>) -> String {
    input
        .lines()
        .skip(range.start)
        .take(range.len())
        .collect::<Vec<_>>()
        .join("\n")
}

fn tool_use(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    input: impl Serialize,
) -> MessageContent {
    MessageContent::ToolUse(LanguageModelToolUse {
        id: LanguageModelToolUseId::from(id.into()),
        name: name.into(),
        raw_input: serde_json::to_string_pretty(&input).unwrap(),
        input: serde_json::to_value(input).unwrap(),
        is_input_complete: true,
    })
}

fn tool_result(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    result: impl Into<Arc<str>>,
) -> MessageContent {
    MessageContent::ToolResult(LanguageModelToolResult {
        tool_use_id: LanguageModelToolUseId::from(id.into()),
        tool_name: name.into(),
        is_error: false,
        content: LanguageModelToolResultContent::Text(result.into()),
        output: None,
    })
}

#[derive(Clone)]
struct EvalInput {
    conversation: Vec<LanguageModelRequestMessage>,
    edit_file_input: EditFileToolInput,
    input_content: Option<String>,
    assertion: EvalAssertion,
}

impl EvalInput {
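    /// Builds an [`EvalInput`] from a recorded conversation. The conversation
    /// must end with an assistant message whose content includes an
    /// `edit_file` tool use; that tool call's input is what
    /// `EditAgentTest::eval` replays against the model.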
    fn from_conversation(
        conversation: Vec<LanguageModelRequestMessage>,
        input_content: Option<String>,
        assertion: EvalAssertion,
    ) -> Self {
        let msg = conversation.last().expect("Conversation must not be empty");
        if msg.role != Role::Assistant {
            panic!("Conversation must end with an assistant message");
        }
        let tool_use = msg
            .content
            .iter()
            .flat_map(|content| match content {
                MessageContent::ToolUse(tool_use) if tool_use.name.as_ref() == "edit_file" => {
                    Some(tool_use)
                }
                _ => None,
            })
            .next()
            .expect("Conversation must end with an edit_file tool use")
            .clone();

        let edit_file_input: EditFileToolInput =
            serde_json::from_value(tool_use.input.clone()).unwrap();

        EvalInput {
            conversation,
            edit_file_input,
            input_content,
            assertion,
        }
    }
}

#[derive(Clone)]
struct EvalSample {
    text: String,
    edit_output: EditAgentOutput,
    diff: String,
}

trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}

impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}
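
// Thanks to the blanket impl above, any async closure of the right shape can
// serve as an assertion; see `EvalAssertion::new` and the ad-hoc assertion in
// `eval_zode` for examples.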

#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);

impl EvalAssertion {
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                ..Default::default()
            };
            let mut response = judge
                .stream_completion_text(request, &cx.to_async())
                .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output) {
                if let Some(score_match) = captures.get(1) {
                    let score = score_match.as_str().parse().unwrap_or(0);
                    return Ok(EvalAssertionOutcome {
                        score,
                        message: Some(output),
                    });
                }
            }

            Err(anyhow!(
                "No score found in response. Raw output: {}",
                output
            ))
        })
    }

    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}

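// Runs `iterations` instances of the given eval scenario. The first run
// executes inline so the conversation prefix lands in the prompt cache; the
// remaining runs are spawned on the background executor and their results are
// collected over an mpsc channel. The test fails if the pass ratio falls
// below `expected_pass_ratio`, or if more than 5% of edit parser tags were
// mismatched across all samples.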
fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval inline
    // so that all subsequent runs hit the cache.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    let executor = gpui::background_executor();
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        executor.spawn(async move { run_eval(eval, tx) }).detach();
    }
    drop(tx);

    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > 0.05 {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}

fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
    let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
    let mut cx = TestAppContext::build(dispatcher, None);
    let output = cx.executor().block_test(async {
        let test = EditAgentTest::new(&mut cx).await;
        test.eval(eval, &mut cx).await
    });
    tx.send(output).unwrap();
}

#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}

impl Display for EvalOutput {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Score: {:?}", self.assertion.score)?;
        if let Some(message) = self.assertion.message.as_ref() {
            writeln!(f, "Message: {}", message)?;
        }

        writeln!(f, "Diff:\n{}", self.sample.diff)?;

        writeln!(
            f,
            "Parser Metrics:\n{:#?}",
            self.sample.edit_output.parser_metrics
        )?;
        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
        Ok(())
    }
}

fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_count = evaluated_count - failed_count;
    let passed_ratio = if evaluated_count == 0 {
        0.0
    } else {
        passed_count as f64 / evaluated_count as f64
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}

struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}

impl EditAgentTest {
    async fn new(cx: &mut TestAppContext) -> Self {
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor().clone());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);

            client::init_settings(cx);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

            settings::init(cx);
            Project::init_settings(cx);
            language::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
            crate::init(client.http_client(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        Self {
            agent: EditAgent::new(agent_model, project.clone(), action_log, Templates::new()),
            project,
            judge_model,
        }
    }

    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        let (provider, model) = cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .unwrap();
            let provider = models.provider(&model.provider_id()).unwrap();
            (provider, model)
        })?;
        cx.update(|cx| provider.authenticate(cx))?.await?;
        Ok(model)
    }

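    // Replays a recorded conversation against the live agent model: the
    // target buffer is opened (and seeded with `input_content` when editing
    // in place), `EditAgent::edit` or `EditAgent::overwrite` is invoked
    // according to the recorded tool call's mode, and the resulting text and
    // diff are handed to the eval's assertion.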
    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();
        let conversation = LanguageModelRequest {
            messages: eval.conversation,
            tools: cx.update(|cx| {
                ToolRegistry::default_global(cx)
                    .tools()
                    .into_iter()
                    .filter_map(|tool| {
                        let input_schema = tool
                            .input_schema(self.agent.model.tool_input_format())
                            .ok()?;
                        Some(LanguageModelRequestTool {
                            name: tool.name(),
                            description: tool.description(),
                            input_schema,
                        })
                    })
                    .collect()
            }),
            ..Default::default()
        };
        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            let (edit_output, _) = self.agent.edit(
                buffer.clone(),
                eval.edit_file_input.display_description,
                &conversation,
                &mut cx.to_async(),
            );
            edit_output.await?
        } else {
            let (edit_output, _) = self.agent.overwrite(
                buffer.clone(),
                eval.edit_file_input.display_description,
                &conversation,
                &mut cx.to_async(),
            );
            edit_output.await?
        };

        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
}

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}

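// Rendered through the `diff_judge.hbs` template. The resulting prompt is
// expected to make the judge reply with a `<score>N</score>` tag (presumably
// 0-100), which `EvalAssertion::judge_diff` parses back out above.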
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}

impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}

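/// Normalizes text for `EvalAssertion::assert_eq` by dropping blank lines,
/// so e.g. "a\n\nb" and "a\nb" compare equal.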
fn strip_empty_lines(text: &str) -> String {
    text.lines()
        .filter(|line| !line.trim().is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}