1use super::*;
2use crate::{
3 ReadFileToolInput,
4 edit_file_tool::{EditFileMode, EditFileToolInput},
5 grep_tool::GrepToolInput,
6 list_directory_tool::ListDirectoryToolInput,
7};
8use Role::*;
9use assistant_tool::ToolRegistry;
10use client::{Client, UserStore};
11use collections::HashMap;
12use fs::FakeFs;
13use futures::{FutureExt, future::LocalBoxFuture};
14use gpui::{AppContext, TestAppContext};
15use indoc::{formatdoc, indoc};
16use language_model::{
17 LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
18 LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
19};
20use project::Project;
21use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
22use rand::prelude::*;
23use reqwest_client::ReqwestClient;
24use serde_json::json;
25use std::{
26 cmp::Reverse,
27 fmt::{self, Display},
28 io::Write as _,
29 str::FromStr,
30 sync::mpsc,
31};
32use util::path;
33
/// Eval: extract a `handle_command_output` method out of `run_git_blame`.
///
/// Replays a recorded read-then-edit conversation and requires 95% of 100
/// iterations to produce output exactly equal to the `after.rs` fixture
/// (modulo empty lines — see `EvalAssertion::assert_eq`).
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    let output_file_content = include_str!("evals/fixtures/extract_handle_command_output/after.rs");
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                // The assistant reads the whole file...
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // ...then issues the edit that the edit agent will replay.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}
90
/// Eval: delete the `run_git_blame` function (and nothing else).
///
/// Requires 95% of 100 iterations to exactly match the `after.rs` fixture.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}
144
/// Eval: translate all doc comments in a file to Italian.
///
/// Graded by an LLM judge on the resulting diff rather than exact output;
/// requires a 100% pass rate over 200 iterations.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200,
        1.,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}
197
/// Eval: rewrite `compile_parser_to_wasm` to use wasi-sdk instead of
/// emscripten, replaying three consecutive partial reads of the file before
/// the edit. Graded by an LLM judge on the diff.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                // NOTE(review): the x86_64-linux and arm64-linux assets appear
                // twice in this prompt — presumably unintentional; confirm
                // whether the duplication is part of the recorded conversation.
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                // NOTE(review): `lines()` is 0-indexed and half-open while
                // `ReadFileToolInput` line numbers look 1-based — the recorded
                // tool results may be off by one relative to the tool inputs;
                // confirm this matches the original recording.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}
313
/// Eval: comment out all interactions with `BlinkManager` in a large
/// editor file, given only scattered grep context instead of a full read.
/// Graded by an LLM judge on the diff.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                // Simulated grep output: disjoint slices of the fixture file,
                // separated by "Match found:" markers.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                // A second user turn with the actual edit instructions.
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                // NOTE(review): the tool id jumps from "tool_1" to "tool_4" —
                // presumably an artifact of the recorded conversation.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
        ),
    );
}
387
/// Eval: add a `from_pixels` constructor to `Canvas` plus tests, after the
/// assistant probed for an existing test module via several grep calls.
/// Graded by an LLM judge on the diff.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // Two grep probes for an existing `mod tests` come up empty...
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                // ...so the assistant greps for `#[test]` to learn the
                // project's test conventions from sibling files.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}
578
/// Eval: create a brand-new file (`EditFileMode::Create`) from a prompt.
///
/// The custom assertion only checks that the created file does not begin
/// with stray whitespace, a backtick, or a newline — i.e. the agent didn't
/// leak chatter or markdown fences into the file. Requires 100% over 200
/// iterations.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    let input_file_path = "root/zode.py";
    // The file doesn't exist yet — this eval exercises file creation.
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        200,
        1.,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                // The assistant reads two reference files in a single turn.
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                // Any of these first characters means the model leaked
                // formatting (indentation, a markdown fence, or a blank line).
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}
675
/// Eval: add a new test to `action_log.rs` that exercises overwriting an
/// existing file via `buffer_created`, given an outline plus three targeted
/// reads. Graded by an LLM judge; only requires a 50% pass rate for now.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                // The whole-file read returns an outline (the file is too big
                // to return verbatim), prompting follow-up ranged reads.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                            struct HunkStatus [L1538-1542]
                             range [L1539]
                             diff_status [L1540]
                             old_text [L1541]
                            fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
897
/// Eval: create an empty file without leaking the model's "thoughts" into
/// it — the expected output is the empty string. Requires 99% over 100
/// iterations.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    //
    // Model                            | Pass rate
    // ============================================
    //
    // --------------------------------------------
    // Prompt version: 2025-05-21
    // --------------------------------------------
    //
    // claude-3.7-sonnet                | 1.00
    // gemini-2.5-pro-preview-03-25     | 1.00
    // gemini-2.5-flash-preview-04-17   | 1.00
    // gpt-4.1                          | 1.00
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        100,
        0.99,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
979
980fn message(
981 role: Role,
982 contents: impl IntoIterator<Item = MessageContent>,
983) -> LanguageModelRequestMessage {
984 LanguageModelRequestMessage {
985 role,
986 content: contents.into_iter().collect(),
987 cache: false,
988 }
989}
990
991fn text(text: impl Into<String>) -> MessageContent {
992 MessageContent::Text(text.into())
993}
994
/// Returns the 0-indexed, half-open line `range` of `input`, joined with
/// newlines (no trailing newline). Out-of-bounds ranges yield an empty
/// string rather than panicking.
fn lines(input: &str, range: Range<usize>) -> String {
    let selected: Vec<&str> = input.lines().take(range.end).skip(range.start).collect();
    selected.join("\n")
}
1003
1004fn tool_use(
1005 id: impl Into<Arc<str>>,
1006 name: impl Into<Arc<str>>,
1007 input: impl Serialize,
1008) -> MessageContent {
1009 MessageContent::ToolUse(LanguageModelToolUse {
1010 id: LanguageModelToolUseId::from(id.into()),
1011 name: name.into(),
1012 raw_input: serde_json::to_string_pretty(&input).unwrap(),
1013 input: serde_json::to_value(input).unwrap(),
1014 is_input_complete: true,
1015 })
1016}
1017
1018fn tool_result(
1019 id: impl Into<Arc<str>>,
1020 name: impl Into<Arc<str>>,
1021 result: impl Into<Arc<str>>,
1022) -> MessageContent {
1023 MessageContent::ToolResult(LanguageModelToolResult {
1024 tool_use_id: LanguageModelToolUseId::from(id.into()),
1025 tool_name: name.into(),
1026 is_error: false,
1027 content: LanguageModelToolResultContent::Text(result.into()),
1028 output: None,
1029 })
1030}
1031
/// A fully-specified eval case: a recorded conversation ending in an
/// `edit_file` tool call, plus the assertion used to grade the result.
#[derive(Clone)]
struct EvalInput {
    // The conversation to replay; must end with an assistant `edit_file` tool use.
    conversation: Vec<LanguageModelRequestMessage>,
    // The parsed input of that final `edit_file` tool call.
    edit_file_input: EditFileToolInput,
    // Initial content of the file being edited; `None` when creating a new file.
    input_content: Option<String>,
    // How to score the resulting buffer.
    assertion: EvalAssertion,
}
1039
1040impl EvalInput {
1041 fn from_conversation(
1042 conversation: Vec<LanguageModelRequestMessage>,
1043 input_content: Option<String>,
1044 assertion: EvalAssertion,
1045 ) -> Self {
1046 let msg = conversation.last().expect("Conversation must not be empty");
1047 if msg.role != Role::Assistant {
1048 panic!("Conversation must end with an assistant message");
1049 }
1050 let tool_use = msg
1051 .content
1052 .iter()
1053 .flat_map(|content| match content {
1054 MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1055 Some(tool_use)
1056 }
1057 _ => None,
1058 })
1059 .next()
1060 .expect("Conversation must end with an edit_file tool use")
1061 .clone();
1062
1063 let edit_file_input: EditFileToolInput =
1064 serde_json::from_value(tool_use.input.clone()).unwrap();
1065
1066 EvalInput {
1067 conversation,
1068 edit_file_input,
1069 input_content,
1070 assertion,
1071 }
1072 }
1073}
1074
/// The outcome of replaying a single eval iteration.
#[derive(Clone)]
struct EvalSample {
    // The final text of the edited buffer.
    text: String,
    // The edit agent's output, including parser metrics and the raw edits.
    edit_output: EditAgentOutput,
    // A textual diff of the edit — presumably input content vs. `text`;
    // produced outside this chunk (TODO confirm in EditAgentTest::eval).
    diff: String,
}
1081
/// Object-safe adapter for async assertion functions, letting
/// `EvalAssertion` store them behind an `Arc<dyn AssertionFn>`.
trait AssertionFn: 'static + Send + Sync {
    // Grades `sample` (optionally consulting `judge_model`), returning a
    // boxed local future since assertions run on the test executor.
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1090
/// Blanket implementation so any matching async closure can be used as an
/// assertion without writing a wrapper type by hand.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        // Box locally (not `Send`): assertions run on the single-threaded
        // test executor.
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1111
/// A cloneable, shareable assertion used to grade eval samples.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1114
1115impl EvalAssertion {
1116 fn new<F>(f: F) -> Self
1117 where
1118 F: 'static
1119 + Send
1120 + Sync
1121 + AsyncFn(
1122 &EvalSample,
1123 Arc<dyn LanguageModel>,
1124 &mut TestAppContext,
1125 ) -> Result<EvalAssertionOutcome>,
1126 {
1127 EvalAssertion(Arc::new(f))
1128 }
1129
1130 fn assert_eq(expected: impl Into<String>) -> Self {
1131 let expected = expected.into();
1132 Self::new(async move |sample, _judge, _cx| {
1133 Ok(EvalAssertionOutcome {
1134 score: if strip_empty_lines(&sample.text) == strip_empty_lines(&expected) {
1135 100
1136 } else {
1137 0
1138 },
1139 message: None,
1140 })
1141 })
1142 }
1143
1144 fn judge_diff(assertions: &'static str) -> Self {
1145 Self::new(async move |sample, judge, cx| {
1146 let prompt = DiffJudgeTemplate {
1147 diff: sample.diff.clone(),
1148 assertions,
1149 }
1150 .render(&Templates::new())
1151 .unwrap();
1152
1153 let request = LanguageModelRequest {
1154 messages: vec![LanguageModelRequestMessage {
1155 role: Role::User,
1156 content: vec![prompt.into()],
1157 cache: false,
1158 }],
1159 ..Default::default()
1160 };
1161 let mut response = judge
1162 .stream_completion_text(request, &cx.to_async())
1163 .await?;
1164 let mut output = String::new();
1165 while let Some(chunk) = response.stream.next().await {
1166 let chunk = chunk?;
1167 output.push_str(&chunk);
1168 }
1169
1170 // Parse the score from the response
1171 let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
1172 if let Some(captures) = re.captures(&output) {
1173 if let Some(score_match) = captures.get(1) {
1174 let score = score_match.as_str().parse().unwrap_or(0);
1175 return Ok(EvalAssertionOutcome {
1176 score,
1177 message: Some(output),
1178 });
1179 }
1180 }
1181
1182 anyhow::bail!("No score found in response. Raw output: {output}");
1183 })
1184 }
1185
1186 async fn run(
1187 &self,
1188 input: &EvalSample,
1189 judge_model: Arc<dyn LanguageModel>,
1190 cx: &mut TestAppContext,
1191 ) -> Result<EvalAssertionOutcome> {
1192 self.0.assert(input, judge_model, cx).await
1193 }
1194}
1195
1196fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
1197 let mut evaluated_count = 0;
1198 let mut failed_count = 0;
1199 report_progress(evaluated_count, failed_count, iterations);
1200
1201 let (tx, rx) = mpsc::channel();
1202
1203 // Cache the last message in the conversation, and run one instance of the eval so that
1204 // all the next ones are cached.
1205 eval.conversation.last_mut().unwrap().cache = true;
1206 run_eval(eval.clone(), tx.clone());
1207
1208 let executor = gpui::background_executor();
1209 for _ in 1..iterations {
1210 let eval = eval.clone();
1211 let tx = tx.clone();
1212 executor.spawn(async move { run_eval(eval, tx) }).detach();
1213 }
1214 drop(tx);
1215
1216 let mut failed_evals = HashMap::default();
1217 let mut errored_evals = HashMap::default();
1218 let mut eval_outputs = Vec::new();
1219 let mut cumulative_parser_metrics = EditParserMetrics::default();
1220 while let Ok(output) = rx.recv() {
1221 match output {
1222 Ok(output) => {
1223 cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
1224 eval_outputs.push(output.clone());
1225 if output.assertion.score < 80 {
1226 failed_count += 1;
1227 failed_evals
1228 .entry(output.sample.text.clone())
1229 .or_insert(Vec::new())
1230 .push(output);
1231 }
1232 }
1233 Err(error) => {
1234 failed_count += 1;
1235 *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
1236 }
1237 }
1238
1239 evaluated_count += 1;
1240 report_progress(evaluated_count, failed_count, iterations);
1241 }
1242
1243 let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
1244 println!("Actual pass ratio: {}\n", actual_pass_ratio);
1245 if actual_pass_ratio < expected_pass_ratio {
1246 let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
1247 errored_evals.sort_by_key(|(_, count)| Reverse(*count));
1248 for (error, count) in errored_evals {
1249 println!("Eval errored {} times. Error: {}", count, error);
1250 }
1251
1252 let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
1253 failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
1254 for (_buffer_output, failed_evals) in failed_evals {
1255 let eval_output = failed_evals.first().unwrap();
1256 println!("Eval failed {} times", failed_evals.len());
1257 println!("{}", eval_output);
1258 }
1259
1260 panic!(
1261 "Actual pass ratio: {}\nExpected pass ratio: {}",
1262 actual_pass_ratio, expected_pass_ratio
1263 );
1264 }
1265
1266 let mismatched_tag_ratio =
1267 cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
1268 if mismatched_tag_ratio > 0.05 {
1269 for eval_output in eval_outputs {
1270 println!("{}", eval_output);
1271 }
1272 panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
1273 }
1274}
1275
1276fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
1277 let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
1278 let mut cx = TestAppContext::build(dispatcher, None);
1279 let output = cx.executor().block_test(async {
1280 let test = EditAgentTest::new(&mut cx).await;
1281 test.eval(eval, &mut cx).await
1282 });
1283 tx.send(output).unwrap();
1284}
1285
/// The result of one successful eval run: the produced sample plus the
/// judge's verdict on it.
#[derive(Clone)]
struct EvalOutput {
    // Edited buffer text, unified diff, and edit/parser output for this run.
    sample: EvalSample,
    // Judge score and optional explanation message.
    assertion: EvalAssertionOutcome,
}
1291
1292impl Display for EvalOutput {
1293 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1294 writeln!(f, "Score: {:?}", self.assertion.score)?;
1295 if let Some(message) = self.assertion.message.as_ref() {
1296 writeln!(f, "Message: {}", message)?;
1297 }
1298
1299 writeln!(f, "Diff:\n{}", self.sample.diff)?;
1300
1301 writeln!(
1302 f,
1303 "Parser Metrics:\n{:#?}",
1304 self.sample.edit_output.parser_metrics
1305 )?;
1306 writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1307 Ok(())
1308 }
1309}
1310
/// Overwrites the current terminal line (`\r` plus the clear-to-end-of-line
/// escape) with a progress summary and flushes stdout so it shows immediately.
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    // Guard against division by zero before the first result arrives.
    let passed_ratio = match evaluated_count {
        0 => 0.0,
        evaluated => (evaluated - failed_count) as f64 / evaluated as f64,
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}
1326
/// Harness holding everything one eval needs: the agent under test, the
/// project whose buffers it edits, and the model used to judge the result.
struct EditAgentTest {
    // Agent whose edit/overwrite behavior is being evaluated.
    agent: EditAgent,
    // In-memory test project containing the buffers to edit.
    project: Entity<Project>,
    // Model that scores the produced sample against the eval's assertion.
    judge_model: Arc<dyn LanguageModel>,
}
1332
1333impl EditAgentTest {
1334 async fn new(cx: &mut TestAppContext) -> Self {
1335 cx.executor().allow_parking();
1336
1337 let fs = FakeFs::new(cx.executor().clone());
1338 cx.update(|cx| {
1339 settings::init(cx);
1340 gpui_tokio::init(cx);
1341 let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1342 cx.set_http_client(http_client);
1343
1344 client::init_settings(cx);
1345 let client = Client::production(cx);
1346 let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1347
1348 settings::init(cx);
1349 Project::init_settings(cx);
1350 language::init(cx);
1351 language_model::init(client.clone(), cx);
1352 language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
1353 crate::init(client.http_client(), cx);
1354 });
1355
1356 fs.insert_tree("/root", json!({})).await;
1357 let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1358 let agent_model = SelectedModel::from_str(
1359 &std::env::var("ZED_AGENT_MODEL")
1360 .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1361 )
1362 .unwrap();
1363 let judge_model = SelectedModel::from_str(
1364 &std::env::var("ZED_JUDGE_MODEL")
1365 .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1366 )
1367 .unwrap();
1368 let (agent_model, judge_model) = cx
1369 .update(|cx| {
1370 cx.spawn(async move |cx| {
1371 let agent_model = Self::load_model(&agent_model, cx).await;
1372 let judge_model = Self::load_model(&judge_model, cx).await;
1373 (agent_model.unwrap(), judge_model.unwrap())
1374 })
1375 })
1376 .await;
1377 let action_log = cx.new(|_| ActionLog::new(project.clone()));
1378
1379 Self {
1380 agent: EditAgent::new(agent_model, project.clone(), action_log, Templates::new()),
1381 project,
1382 judge_model,
1383 }
1384 }
1385
1386 async fn load_model(
1387 selected_model: &SelectedModel,
1388 cx: &mut AsyncApp,
1389 ) -> Result<Arc<dyn LanguageModel>> {
1390 let (provider, model) = cx.update(|cx| {
1391 let models = LanguageModelRegistry::read_global(cx);
1392 let model = models
1393 .available_models(cx)
1394 .find(|model| {
1395 model.provider_id() == selected_model.provider
1396 && model.id() == selected_model.model
1397 })
1398 .unwrap();
1399 let provider = models.provider(&model.provider_id()).unwrap();
1400 (provider, model)
1401 })?;
1402 cx.update(|cx| provider.authenticate(cx))?.await?;
1403 Ok(model)
1404 }
1405
1406 async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
1407 let path = self
1408 .project
1409 .read_with(cx, |project, cx| {
1410 project.find_project_path(eval.edit_file_input.path, cx)
1411 })
1412 .unwrap();
1413 let buffer = self
1414 .project
1415 .update(cx, |project, cx| project.open_buffer(path, cx))
1416 .await
1417 .unwrap();
1418 let tools = cx.update(|cx| {
1419 ToolRegistry::default_global(cx)
1420 .tools()
1421 .into_iter()
1422 .filter_map(|tool| {
1423 let input_schema = tool
1424 .input_schema(self.agent.model.tool_input_format())
1425 .ok()?;
1426 Some(LanguageModelRequestTool {
1427 name: tool.name(),
1428 description: tool.description(),
1429 input_schema,
1430 })
1431 })
1432 .collect::<Vec<_>>()
1433 });
1434 let tool_names = tools
1435 .iter()
1436 .map(|tool| tool.name.clone())
1437 .collect::<Vec<_>>();
1438 let worktrees = vec![WorktreeContext {
1439 root_name: "root".to_string(),
1440 rules_file: None,
1441 }];
1442 let prompt_builder = PromptBuilder::new(None)?;
1443 let project_context = ProjectContext::new(worktrees, Vec::default());
1444 let system_prompt = prompt_builder.generate_assistant_system_prompt(
1445 &project_context,
1446 &ModelContext {
1447 available_tools: tool_names,
1448 },
1449 )?;
1450
1451 let has_system_prompt = eval
1452 .conversation
1453 .first()
1454 .map_or(false, |msg| msg.role == Role::System);
1455 let messages = if has_system_prompt {
1456 eval.conversation
1457 } else {
1458 [LanguageModelRequestMessage {
1459 role: Role::System,
1460 content: vec![MessageContent::Text(system_prompt)],
1461 cache: true,
1462 }]
1463 .into_iter()
1464 .chain(eval.conversation)
1465 .collect::<Vec<_>>()
1466 };
1467
1468 let conversation = LanguageModelRequest {
1469 messages,
1470 tools,
1471 ..Default::default()
1472 };
1473 let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1474 if let Some(input_content) = eval.input_content.as_deref() {
1475 buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1476 }
1477 let (edit_output, _) = self.agent.edit(
1478 buffer.clone(),
1479 eval.edit_file_input.display_description,
1480 &conversation,
1481 &mut cx.to_async(),
1482 );
1483 edit_output.await?
1484 } else {
1485 let (edit_output, _) = self.agent.overwrite(
1486 buffer.clone(),
1487 eval.edit_file_input.display_description,
1488 &conversation,
1489 &mut cx.to_async(),
1490 );
1491 edit_output.await?
1492 };
1493
1494 let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1495 let sample = EvalSample {
1496 edit_output,
1497 diff: language::unified_diff(
1498 eval.input_content.as_deref().unwrap_or_default(),
1499 &buffer_text,
1500 ),
1501 text: buffer_text,
1502 };
1503 let assertion = eval
1504 .assertion
1505 .run(&sample, self.judge_model.clone(), cx)
1506 .await?;
1507
1508 Ok(EvalOutput { assertion, sample })
1509 }
1510}
1511
/// The judge's verdict for a single eval sample.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    // Score out of 100; runs scoring below 80 are counted as failures.
    score: usize,
    // Optional explanation from the judge model.
    message: Option<String>,
}
1517
/// Data rendered into the judge prompt template: the diff the agent produced
/// and the assertions it should be scored against.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}
1523
impl Template for DiffJudgeTemplate {
    // Handlebars template file used to render the judge prompt.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1527
/// Returns `text` with every blank or whitespace-only line removed, joining
/// the remaining lines with `\n` (no trailing newline).
fn strip_empty_lines(text: &str) -> String {
    let mut kept_lines = Vec::new();
    for line in text.lines() {
        if !line.trim().is_empty() {
            kept_lines.push(line);
        }
    }
    kept_lines.join("\n")
}