evals.rs

   1use super::*;
   2use crate::{
   3    EditFileMode, EditFileToolInput, GrepToolInput, ListDirectoryToolInput, ReadFileToolInput,
   4};
   5use Role::*;
   6use client::{Client, UserStore};
   7use eval_utils::{EvalOutput, EvalOutputProcessor, OutcomeKind};
   8use fs::FakeFs;
   9use futures::{FutureExt, future::LocalBoxFuture};
  10use gpui::{AppContext, TestAppContext, Timer};
  11use http_client::StatusCode;
  12use indoc::{formatdoc, indoc};
  13use language_model::{
  14    LanguageModelRegistry, LanguageModelToolResult, LanguageModelToolResultContent,
  15    LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
  16};
  17use project::Project;
  18use prompt_store::{ProjectContext, WorktreeContext};
  19use rand::prelude::*;
  20use reqwest_client::ReqwestClient;
  21use serde_json::json;
  22use std::{
  23    fmt::{self, Display},
  24    path::Path,
  25    str::FromStr,
  26    time::Duration,
  27};
  28use util::path;
  29
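/// Aggregates `EditEvalMetadata` across runs and fails the suite when the
/// overall ratio of mismatched edit-parser tags exceeds the configured
/// threshold.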
  30#[derive(Default, Clone, Debug)]
  31struct EditAgentOutputProcessor {
  32    mismatched_tag_threshold: f32,
  33    cumulative_tags: usize,
  34    cumulative_mismatched_tags: usize,
  35    eval_outputs: Vec<EvalOutput<EditEvalMetadata>>,
  36}
  37
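/// Creates an `EditAgentOutputProcessor` that tolerates up to the given ratio
/// of mismatched tags across all evaluated runs.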
  38fn mismatched_tag_threshold(mismatched_tag_threshold: f32) -> EditAgentOutputProcessor {
  39    EditAgentOutputProcessor {
  40        mismatched_tag_threshold,
  41        cumulative_tags: 0,
  42        cumulative_mismatched_tags: 0,
  43        eval_outputs: Vec::new(),
  44    }
  45}
  46
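/// Edit-parser metrics for a single eval run: the number of tags seen and how
/// many of them were mismatched.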
  47#[derive(Clone, Debug)]
  48struct EditEvalMetadata {
  49    tags: usize,
  50    mismatched_tags: usize,
  51}
  52
  53impl EvalOutputProcessor for EditAgentOutputProcessor {
  54    type Metadata = EditEvalMetadata;
  55
  56    fn process(&mut self, output: &EvalOutput<Self::Metadata>) {
  57        if matches!(output.outcome, OutcomeKind::Passed | OutcomeKind::Failed) {
  58            self.cumulative_mismatched_tags += output.metadata.mismatched_tags;
  59            self.cumulative_tags += output.metadata.tags;
  60            self.eval_outputs.push(output.clone());
  61        }
  62    }
  63
  64    fn assert(&mut self) {
  65        let mismatched_tag_ratio =
  66            self.cumulative_mismatched_tags as f32 / self.cumulative_tags as f32;
  67        if mismatched_tag_ratio > self.mismatched_tag_threshold {
  68            for eval_output in &self.eval_outputs {
  69                println!("{}", eval_output.data);
  70            }
  71            panic!(
  72                "Too many mismatched tags: {:?}",
  73                self.cumulative_mismatched_tags
  74            );
  75        }
  76    }
  77}
  78
  79#[test]
  80#[cfg_attr(not(feature = "unit-eval"), ignore)]
  81fn eval_extract_handle_command_output() {
  82    // Test how well the agent generates multiple edit hunks.
  83    //
  84    // Model                       | Pass rate
  85    // ----------------------------|----------
  86    // claude-3.7-sonnet           |  0.99 (2025-06-14)
  87    // claude-sonnet-4             |  0.97 (2025-06-14)
  88    // gemini-2.5-pro-06-05        |  0.98 (2025-06-16)
  89    // gemini-2.5-flash            |  0.11 (2025-05-22)
  90    // gpt-4.1                     |  1.00 (2025-05-22)
  91
  92    let input_file_path = "root/blame.rs";
  93    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
  94    let possible_diffs = vec![
  95        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
  96        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
  97        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
  98        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
  99        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
 100        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
 101        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
 102    ];
 103    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
 104    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
 105        run_eval(EvalInput::from_conversation(
 106            vec![
 107                message(
 108                    User,
 109                    [text(formatdoc! {"
 110                            Read the `{input_file_path}` file and extract a method in
 111                            the final stanza of `run_git_blame` to deal with command failures,
 112                            call it `handle_command_output` and take the std::process::Output as the only parameter.
 113                            Do not document the method and do not add any comments.
 114
 115                            Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
 116                        "})],
 117                ),
 118                message(
 119                    Assistant,
 120                    [tool_use(
 121                        "tool_1",
 122                        "read_file",
 123                        ReadFileToolInput {
 124                            path: input_file_path.into(),
 125                            start_line: None,
 126                            end_line: None,
 127                            start_byte: None,
 128                            max_bytes: None,
 129                        },
 130                    )],
 131                ),
 132                message(
 133                    User,
 134                    [tool_result("tool_1", "read_file", input_file_content)],
 135                ),
 136                message(
 137                    Assistant,
 138                    [tool_use(
 139                        "tool_2",
 140                        "edit_file",
 141                        EditFileToolInput {
 142                            display_description: edit_description.into(),
 143                            path: input_file_path.into(),
 144                            mode: EditFileMode::Edit,
 145                        },
 146                    )],
 147                ),
 148            ],
 149            Some(input_file_content.into()),
 150            EvalAssertion::assert_diff_any(possible_diffs.clone()),
 151        ))
 152    });
 153}
 154
 155#[test]
 156#[cfg_attr(not(feature = "unit-eval"), ignore)]
 157fn eval_delete_run_git_blame() {
 158    // Model                       | Pass rate
 159    // ----------------------------|----------
 160    // claude-3.7-sonnet           | 1.0  (2025-06-14)
 161    // claude-sonnet-4             | 0.96 (2025-06-14)
 162    // gemini-2.5-pro-06-05        | 1.0  (2025-06-16)
 163    // gemini-2.5-flash            |
 164    // gpt-4.1                     |
 165
 166    let input_file_path = "root/blame.rs";
 167    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
 168    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
 169    let edit_description = "Delete the `run_git_blame` function.";
 170
 171    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
 172        run_eval(EvalInput::from_conversation(
 173            vec![
 174                message(
 175                    User,
 176                    [text(formatdoc! {"
 177                            Read the `{input_file_path}` file and delete `run_git_blame`. Just that
 178                            one function, not its usages.
 179                        "})],
 180                ),
 181                message(
 182                    Assistant,
 183                    [tool_use(
 184                        "tool_1",
 185                        "read_file",
 186                        ReadFileToolInput {
 187                            path: input_file_path.into(),
 188                            start_line: None,
 189                            end_line: None,
 190                            start_byte: None,
 191                            max_bytes: None,
 192                        },
 193                    )],
 194                ),
 195                message(
 196                    User,
 197                    [tool_result("tool_1", "read_file", input_file_content)],
 198                ),
 199                message(
 200                    Assistant,
 201                    [tool_use(
 202                        "tool_2",
 203                        "edit_file",
 204                        EditFileToolInput {
 205                            display_description: edit_description.into(),
 206                            path: input_file_path.into(),
 207                            mode: EditFileMode::Edit,
 208                        },
 209                    )],
 210                ),
 211            ],
 212            Some(input_file_content.into()),
 213            EvalAssertion::assert_eq(output_file_content),
 214        ))
 215    });
 216}
 217
 218#[test]
 219#[cfg_attr(not(feature = "unit-eval"), ignore)]
 220fn eval_translate_doc_comments() {
 221    //  Model                          | Pass rate
 222    // ============================================
 223    //
 224    //  claude-3.7-sonnet              |  1.0  (2025-06-14)
 225    //  claude-sonnet-4                |  1.0  (2025-06-14)
 226    //  gemini-2.5-pro-preview-03-25   |  1.0  (2025-05-22)
 227    //  gemini-2.5-flash-preview-04-17 |
 228    //  gpt-4.1                        |
 229
 230    let input_file_path = "root/canvas.rs";
 231    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
 232    let edit_description = "Translate all doc comments to Italian";
 233
 234    eval_utils::eval(200, 1., mismatched_tag_threshold(0.05), move || {
 235        run_eval(EvalInput::from_conversation(
 236            vec![
 237                message(
 238                    User,
 239                    [text(formatdoc! {"
 240                            Read the {input_file_path} file and edit it (without overwriting it),
 241                            translating all the doc comments to Italian.
 242                        "})],
 243                ),
 244                message(
 245                    Assistant,
 246                    [tool_use(
 247                        "tool_1",
 248                        "read_file",
 249                        ReadFileToolInput {
 250                            path: input_file_path.into(),
 251                            start_line: None,
 252                            end_line: None,
 253                            start_byte: None,
 254                            max_bytes: None,
 255                        },
 256                    )],
 257                ),
 258                message(
 259                    User,
 260                    [tool_result("tool_1", "read_file", input_file_content)],
 261                ),
 262                message(
 263                    Assistant,
 264                    [tool_use(
 265                        "tool_2",
 266                        "edit_file",
 267                        EditFileToolInput {
 268                            display_description: edit_description.into(),
 269                            path: input_file_path.into(),
 270                            mode: EditFileMode::Edit,
 271                        },
 272                    )],
 273                ),
 274            ],
 275            Some(input_file_content.into()),
 276            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
 277        ))
 278    });
 279}
 280
 281#[test]
 282#[cfg_attr(not(feature = "unit-eval"), ignore)]
 283fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
 284    //  Model                          | Pass rate
 285    // ============================================
 286    //
 287    //  claude-3.7-sonnet              |  0.96 (2025-06-14)
 288    //  claude-sonnet-4                |  0.11 (2025-06-14)
 289    //  gemini-2.5-pro-preview-latest  |  0.99 (2025-06-16)
 290    //  gemini-2.5-flash-preview-04-17 |
 291    //  gpt-4.1                        |
 292
 293    let input_file_path = "root/lib.rs";
 294    let input_file_content =
 295        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
 296    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
 297
 298    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
 299        run_eval(EvalInput::from_conversation(
 300            vec![
 301                message(
 302                    User,
 303                    [text(formatdoc! {"
 304                            Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
 305                            Use `ureq` to download the SDK for the current platform and architecture.
 306                            Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
 307                            Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
 308                            that's inside of the archive.
 309                            Don't re-download the SDK if that executable already exists.
 310
 311                            Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}
 312
 313                            Here are the available wasi-sdk assets:
 314                            - wasi-sdk-25.0-x86_64-macos.tar.gz
 315                            - wasi-sdk-25.0-arm64-macos.tar.gz
 316                            - wasi-sdk-25.0-x86_64-linux.tar.gz
 317                            - wasi-sdk-25.0-arm64-linux.tar.gz
 320                            - wasi-sdk-25.0-x86_64-windows.tar.gz
 321                        "})],
 322                ),
 323                message(
 324                    Assistant,
 325                    [tool_use(
 326                        "tool_1",
 327                        "read_file",
 328                        ReadFileToolInput {
 329                            path: input_file_path.into(),
 330                            start_line: Some(971),
 331                            end_line: Some(1050),
 332                            start_byte: None,
 333                            max_bytes: None,
 334                        },
 335                    )],
 336                ),
 337                message(
 338                    User,
 339                    [tool_result(
 340                        "tool_1",
 341                        "read_file",
 342                        lines(input_file_content, 971..1050),
 343                    )],
 344                ),
 345                message(
 346                    Assistant,
 347                    [tool_use(
 348                        "tool_2",
 349                        "read_file",
 350                        ReadFileToolInput {
 351                            path: input_file_path.into(),
 352                            start_line: Some(1050),
 353                            end_line: Some(1100),
 354                            start_byte: None,
 355                            max_bytes: None,
 356                        },
 357                    )],
 358                ),
 359                message(
 360                    User,
 361                    [tool_result(
 362                        "tool_2",
 363                        "read_file",
 364                        lines(input_file_content, 1050..1100),
 365                    )],
 366                ),
 367                message(
 368                    Assistant,
 369                    [tool_use(
 370                        "tool_3",
 371                        "read_file",
 372                        ReadFileToolInput {
 373                            path: input_file_path.into(),
 374                            start_line: Some(1100),
 375                            end_line: Some(1150),
 376                            start_byte: None,
 377                            max_bytes: None,
 378                        },
 379                    )],
 380                ),
 381                message(
 382                    User,
 383                    [tool_result(
 384                        "tool_3",
 385                        "read_file",
 386                        lines(input_file_content, 1100..1150),
 387                    )],
 388                ),
 389                message(
 390                    Assistant,
 391                    [tool_use(
 392                        "tool_4",
 393                        "edit_file",
 394                        EditFileToolInput {
 395                            display_description: edit_description.into(),
 396                            path: input_file_path.into(),
 397                            mode: EditFileMode::Edit,
 398                        },
 399                    )],
 400                ),
 401            ],
 402            Some(input_file_content.into()),
 403            EvalAssertion::judge_diff(indoc! {"
 404                    - The compile_parser_to_wasm method has been changed to use wasi-sdk
 405                    - ureq is used to download the SDK for current platform and architecture
 406                "}),
 407        ))
 408    });
 409}
 410
 411#[test]
 412#[cfg_attr(not(feature = "unit-eval"), ignore)]
 413fn eval_disable_cursor_blinking() {
 414    //  Model                          | Pass rate
 415    // ============================================
 416    //
 417    //  claude-3.7-sonnet              |  0.59 (2025-07-14)
 418    //  claude-sonnet-4                |  0.81 (2025-07-14)
 419    //  gemini-2.5-pro                 |  0.95 (2025-07-14)
 420    //  gemini-2.5-flash-preview-04-17 |  0.78 (2025-07-14)
 421    //  gpt-4.1                        |  0.00 (2025-07-14) (follows edit_description too literally)
 422
 423    let input_file_path = "root/editor.rs";
 424    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
 425    let edit_description = "Comment out the call to `BlinkManager::enable`";
 426    let possible_diffs = vec![
 427        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
 428        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
 429        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
 430        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
 431    ];
 432    eval_utils::eval(100, 0.51, mismatched_tag_threshold(0.05), move || {
 433        run_eval(EvalInput::from_conversation(
 434            vec![
 435                message(User, [text("Let's research how cursor blinking works.")]),
 436                message(
 437                    Assistant,
 438                    [tool_use(
 439                        "tool_1",
 440                        "grep",
 441                        GrepToolInput {
 442                            regex: "blink".into(),
 443                            include_pattern: None,
 444                            offset: 0,
 445                            case_sensitive: false,
 446                        },
 447                    )],
 448                ),
 449                message(
 450                    User,
 451                    [tool_result(
 452                        "tool_1",
 453                        "grep",
 454                        [
 455                            lines(input_file_content, 100..400),
 456                            lines(input_file_content, 800..1300),
 457                            lines(input_file_content, 1600..2000),
 458                            lines(input_file_content, 5000..5500),
 459                            lines(input_file_content, 8000..9000),
 460                            lines(input_file_content, 18455..18470),
 461                            lines(input_file_content, 20000..20500),
 462                            lines(input_file_content, 21000..21300),
 463                        ]
 464                        .join("Match found:\n\n"),
 465                    )],
 466                ),
 467                message(
 468                    User,
 469                    [text(indoc! {"
 470                            Comment out the lines that interact with the BlinkManager.
 471                            Keep the outer `update` blocks, but comment out everything that's inside (including if statements).
 472                            Don't add additional comments.
 473                        "})],
 474                ),
 475                message(
 476                    Assistant,
 477                    [tool_use(
 478                        "tool_4",
 479                        "edit_file",
 480                        EditFileToolInput {
 481                            display_description: edit_description.into(),
 482                            path: input_file_path.into(),
 483                            mode: EditFileMode::Edit,
 484                        },
 485                    )],
 486                ),
 487            ],
 488            Some(input_file_content.into()),
 489            EvalAssertion::assert_diff_any(possible_diffs.clone()),
 490        ))
 491    });
 492}
 493
 494#[test]
 495#[cfg_attr(not(feature = "unit-eval"), ignore)]
 496fn eval_from_pixels_constructor() {
 497    // Results for 2025-06-13
 498    //
 499    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
 500    // value. Higher values improve the pass rate but may sometimes cause
 501    // edits to be misapplied. In the context of this eval, this means
 502    // the agent might add from_pixels tests in incorrect locations
 503    // (e.g., at the beginning of the file), yet the evaluation may still
 504    // rate it highly.
 505    //
 506    //  Model                          | Date        | Pass rate
 507    // =========================================================
 508    //  claude-4.0-sonnet              | 2025-06-14  | 0.99
 509    //  claude-3.7-sonnet              | 2025-06-14  | 0.88
 510    //  gemini-2.5-pro-preview-06-05   | 2025-06-16  | 0.98
 511    //  gpt-4.1                        |
 512
 513    let input_file_path = "root/canvas.rs";
 514    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
 515    let edit_description = "Implement from_pixels constructor and add tests.";
 516
 517    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.25), move || {
 518        run_eval(EvalInput::from_conversation(
 519            vec![
 520                message(
 521                    User,
 522                    [text(indoc! {"
 523                            Introduce a new `from_pixels` constructor in Canvas and
 524                            also add tests for it in the same file.
 525                        "})],
 526                ),
 527                message(
 528                    Assistant,
 529                    [tool_use(
 530                        "tool_1",
 531                        "read_file",
 532                        ReadFileToolInput {
 533                            path: input_file_path.into(),
 534                            start_line: None,
 535                            end_line: None,
 536                            start_byte: None,
 537                            max_bytes: None,
 538                        },
 539                    )],
 540                ),
 541                message(
 542                    User,
 543                    [tool_result("tool_1", "read_file", input_file_content)],
 544                ),
 545                message(
 546                    Assistant,
 547                    [tool_use(
 548                        "tool_2",
 549                        "grep",
 550                        GrepToolInput {
 551                            regex: "mod\\s+tests".into(),
 552                            include_pattern: Some("font-kit/src/canvas.rs".into()),
 553                            offset: 0,
 554                            case_sensitive: false,
 555                        },
 556                    )],
 557                ),
 558                message(User, [tool_result("tool_2", "grep", "No matches found")]),
 559                message(
 560                    Assistant,
 561                    [tool_use(
 562                        "tool_3",
 563                        "grep",
 564                        GrepToolInput {
 565                            regex: "mod\\s+tests".into(),
 566                            include_pattern: Some("font-kit/src/**/*.rs".into()),
 567                            offset: 0,
 568                            case_sensitive: false,
 569                        },
 570                    )],
 571                ),
 572                message(User, [tool_result("tool_3", "grep", "No matches found")]),
 573                message(
 574                    Assistant,
 575                    [tool_use(
 576                        "tool_4",
 577                        "grep",
 578                        GrepToolInput {
 579                            regex: "#\\[test\\]".into(),
 580                            include_pattern: Some("font-kit/src/**/*.rs".into()),
 581                            offset: 0,
 582                            case_sensitive: false,
 583                        },
 584                    )],
 585                ),
 586                message(
 587                    User,
 588                    [tool_result(
 589                        "tool_4",
 590                        "grep",
 591                        indoc! {"
 592                                Found 6 matches:
 593
 594                                ## Matches in font-kit/src/loaders/core_text.rs
 595
 596                                ### mod test › L926-936
 597                                ```
 598                                mod test {
 599                                    use super::Font;
 600                                    use crate::properties::{Stretch, Weight};
 601
 602                                    #[cfg(feature = \"source\")]
 603                                    use crate::source::SystemSource;
 604
 605                                    static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";
 606
 607                                    #[cfg(feature = \"source\")]
 608                                    #[test]
 609                                ```
 610
 611                                55 lines remaining in ancestor node. Read the file to see all.
 612
 613                                ### mod test › L947-951
 614                                ```
 615                                    }
 616
 617                                    #[test]
 618                                    fn test_core_text_to_css_font_weight() {
 619                                        // Exact matches
 620                                ```
 621
 622                                ### mod test › L959-963
 623                                ```
 624                                    }
 625
 626                                    #[test]
 627                                    fn test_core_text_to_css_font_stretch() {
 628                                        // Exact matches
 629                                ```
 630
 631                                ## Matches in font-kit/src/loaders/freetype.rs
 632
 633                                ### mod test › L1238-1248
 634                                ```
 635                                mod test {
 636                                    use crate::loaders::freetype::Font;
 637
 638                                    static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
 639                                    static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";
 640
 641                                    #[test]
 642                                    fn get_pcf_postscript_name() {
 643                                        let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
 644                                        assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
 645                                    }
 646                                ```
 647
 648                                1 lines remaining in ancestor node. Read the file to see all.
 649
 650                                ## Matches in font-kit/src/sources/core_text.rs
 651
 652                                ### mod test › L265-275
 653                                ```
 654                                mod test {
 655                                    use crate::properties::{Stretch, Weight};
 656
 657                                    #[test]
 658                                    fn test_css_to_core_text_font_weight() {
 659                                        // Exact matches
 660                                        assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
 661                                        assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
 662                                        assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
 663                                        assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);
 664
 665                                ```
 666
 667                                27 lines remaining in ancestor node. Read the file to see all.
 668
 669                                ### mod test › L278-282
 670                                ```
 671                                    }
 672
 673                                    #[test]
 674                                    fn test_css_to_core_text_font_stretch() {
 675                                        // Exact matches
 676                                ```
 677                            "},
 678                    )],
 679                ),
 680                message(
 681                    Assistant,
 682                    [tool_use(
 683                        "tool_5",
 684                        "edit_file",
 685                        EditFileToolInput {
 686                            display_description: edit_description.into(),
 687                            path: input_file_path.into(),
 688                            mode: EditFileMode::Edit,
 689                        },
 690                    )],
 691                ),
 692            ],
 693            Some(input_file_content.into()),
 694            EvalAssertion::judge_diff(indoc! {"
 695                        - The diff contains a new `from_pixels` constructor
 696                        - The diff contains new tests for the `from_pixels` constructor
 697                    "}),
 698        ))
 699    });
 700}
 701
 702#[test]
 703#[cfg_attr(not(feature = "unit-eval"), ignore)]
 704fn eval_zode() {
 705    //  Model                          | Pass rate
 706    // ============================================
 707    //
 708    //  claude-3.7-sonnet              |  1.0 (2025-06-14)
 709    //  claude-sonnet-4                |  1.0 (2025-06-14)
 710    //  gemini-2.5-pro-preview-03-25   |  1.0 (2025-05-22)
 711    //  gemini-2.5-flash-preview-04-17 |  1.0 (2025-05-22)
 712    //  gpt-4.1                        |  1.0 (2025-05-22)
 713
 714    let input_file_path = "root/zode.py";
 715    let input_content = None;
 716    let edit_description = "Create the main Zode CLI script";
 717
 718    eval_utils::eval(50, 1., mismatched_tag_threshold(0.05), move || {
 719        run_eval(EvalInput::from_conversation(
 720            vec![
 721                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
 722                message(
 723                    Assistant,
 724                    [
 725                        tool_use(
 726                            "tool_1",
 727                            "read_file",
 728                            ReadFileToolInput {
 729                                path: "root/eval/react.py".into(),
 730                                start_line: None,
 731                                end_line: None,
 732                                start_byte: None,
 733                                max_bytes: None,
 734                            },
 735                        ),
 736                        tool_use(
 737                            "tool_2",
 738                            "read_file",
 739                            ReadFileToolInput {
 740                                path: "root/eval/react_test.py".into(),
 741                                start_line: None,
 742                                end_line: None,
 743                                start_byte: None,
 744                                max_bytes: None,
 745                            },
 746                        ),
 747                    ],
 748                ),
 749                message(
 750                    User,
 751                    [
 752                        tool_result(
 753                            "tool_1",
 754                            "read_file",
 755                            include_str!("evals/fixtures/zode/react.py"),
 756                        ),
 757                        tool_result(
 758                            "tool_2",
 759                            "read_file",
 760                            include_str!("evals/fixtures/zode/react_test.py"),
 761                        ),
 762                    ],
 763                ),
 764                message(
 765                    Assistant,
 766                    [
 767                        text(
 768                            "Now that I understand what we need to build, I'll create the main Python script:",
 769                        ),
 770                        tool_use(
 771                            "tool_3",
 772                            "edit_file",
 773                            EditFileToolInput {
 774                                display_description: edit_description.into(),
 775                                path: input_file_path.into(),
 776                                mode: EditFileMode::Create,
 777                            },
 778                        ),
 779                    ],
 780                ),
 781            ],
 782            input_content.clone(),
 783            EvalAssertion::new(async move |sample, _, _cx| {
 784                let invalid_starts = [' ', '`', '\n'];
 785                let mut message = String::new();
 786                for start in invalid_starts {
 787                    if sample.text_after.starts_with(start) {
 788                        message.push_str(&format!("The sample starts with a {:?}\n", start));
 789                        break;
 790                    }
 791                }
 792                // Remove trailing newline.
 793                message.pop();
 794
 795                if message.is_empty() {
 796                    Ok(EvalAssertionOutcome {
 797                        score: 100,
 798                        message: None,
 799                    })
 800                } else {
 801                    Ok(EvalAssertionOutcome {
 802                        score: 0,
 803                        message: Some(message),
 804                    })
 805                }
 806            }),
 807        ))
 808    });
 809}
 810
 811#[test]
 812#[cfg_attr(not(feature = "unit-eval"), ignore)]
 813fn eval_add_overwrite_test() {
 814    //  Model                          | Pass rate
 815    // ============================================
 816    //
 817    //  claude-3.7-sonnet              |  0.65 (2025-06-14)
 818    //  claude-sonnet-4                |  0.07 (2025-06-14)
 819    //  gemini-2.5-pro-preview-03-25   |  0.35 (2025-05-22)
 820    //  gemini-2.5-flash-preview-04-17 |
 821    //  gpt-4.1                        |
 822
 823    let input_file_path = "root/action_log.rs";
 824    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
 825    let edit_description = "Add a new test for overwriting a file in action_log.rs";
 826
 827    eval_utils::eval(200, 0.5, mismatched_tag_threshold(0.05), move || {
 828        run_eval(EvalInput::from_conversation(
 829            vec![
 830                message(
 831                    User,
 832                    [text(indoc! {"
 833                            Introduce a new test in `action_log.rs` to test overwriting a file.
 834                            That is, a file already exists, but we call `buffer_created` as if the file were new.
 835                            Take inspiration from all the other tests in the file.
 836                        "})],
 837                ),
 838                message(
 839                    Assistant,
 840                    [tool_use(
 841                        "tool_1",
 842                        "read_file",
 843                        ReadFileToolInput {
 844                            path: input_file_path.into(),
 845                            start_line: None,
 846                            end_line: None,
 847                            start_byte: None,
 848                            max_bytes: None,
 849                        },
 850                    )],
 851                ),
 852                message(
 853                    User,
 854                    [tool_result(
 855                        "tool_1",
 856                        "read_file",
 857                        indoc! {"
 858                                pub struct ActionLog [L13-20]
 859                                 tracked_buffers [L15]
 860                                 edited_since_project_diagnostics_check [L17]
 861                                 project [L19]
 862                                impl ActionLog [L22-498]
 863                                 pub fn new [L24-30]
 864                                 pub fn project [L32-34]
 865                                 pub fn checked_project_diagnostics [L37-39]
 866                                 pub fn has_edited_files_since_project_diagnostics_check [L42-44]
 867                                 fn track_buffer_internal [L46-101]
 868                                 fn handle_buffer_event [L103-116]
 869                                 fn handle_buffer_edited [L118-123]
 870                                 fn handle_buffer_file_changed [L125-158]
 871                                 async fn maintain_diff [L160-264]
 872                                 pub fn buffer_read [L267-269]
 873                                 pub fn buffer_created [L272-276]
 874                                 pub fn buffer_edited [L279-287]
 875                                 pub fn will_delete_buffer [L289-304]
 876                                 pub fn keep_edits_in_range [L306-364]
 877                                 pub fn reject_edits_in_ranges [L366-459]
 878                                 pub fn keep_all_edits [L461-473]
 879                                 pub fn changed_buffers [L476-482]
 880                                 pub fn stale_buffers [L485-497]
 881                                fn apply_non_conflicting_edits [L500-561]
 882                                fn diff_snapshots [L563-585]
 883                                fn point_to_row_edit [L587-614]
 884                                enum ChangeAuthor [L617-620]
 885                                 User [L618]
 886                                 Agent [L619]
 887                                enum TrackedBufferStatus [L623-627]
 888                                 Created [L624]
 889                                 Modified [L625]
 890                                 Deleted [L626]
 891                                struct TrackedBuffer [L629-641]
 892                                 buffer [L630]
 893                                 base_text [L631]
 894                                 unreviewed_changes [L632]
 895                                 status [L633]
 896                                 version [L634]
 897                                 diff [L635]
 898                                 snapshot [L636]
 899                                 diff_update [L637]
 900                                 _open_lsp_handle [L638]
 901                                 _maintain_diff [L639]
 902                                 _subscription [L640]
 903                                impl TrackedBuffer [L643-657]
 904                                 fn has_changes [L644-650]
 905                                 fn schedule_diff_update [L652-656]
 906                                pub struct ChangedBuffer [L659-661]
 907                                 pub diff [L660]
 908                                mod tests [L664-1574]
 909                                 fn init_logger [L678-682]
 910                                 fn init_test [L684-691]
 911                                 async fn test_keep_edits [L694-769]
 912                                 async fn test_deletions [L772-854]
 913                                 async fn test_overlapping_user_edits [L857-951]
 914                                 async fn test_creating_files [L954-1010]
 915                                 async fn test_deleting_files [L1013-1120]
 916                                 async fn test_reject_edits [L1123-1255]
 917                                 async fn test_reject_multiple_edits [L1258-1331]
 918                                 async fn test_reject_deleted_file [L1334-1388]
 919                                 async fn test_reject_created_file [L1391-1443]
 920                                 async fn test_random_diffs [L1446-1535]
 921                                  fn quiesce [L1510-1534]
 922                                 struct HunkStatus [L1538-1542]
 923                                  range [L1539]
 924                                  diff_status [L1540]
 925                                  old_text [L1541]
 926                                 fn unreviewed_hunks [L1544-1573]
 927
 928                                Showing symbols 1-69 (total symbols: 69)
 929
 930                                Using the line numbers in this outline, you can call this tool again while specifying
 931                                the start_line and end_line fields to see the implementations of symbols in the outline.
 932                            "},
 933                    )],
 934                ),
 935                message(
 936                    Assistant,
 937                    [
 938                        text(
 939                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
 940                        ),
 941                        tool_use(
 942                            "tool_2",
 943                            "read_file",
 944                            ReadFileToolInput {
 945                                path: input_file_path.into(),
 946                                start_line: Some(953),
 947                                end_line: Some(1010),
 948                                start_byte: None,
 949                                max_bytes: None,
 950                            },
 951                        ),
 952                    ],
 953                ),
 954                message(
 955                    User,
 956                    [tool_result(
 957                        "tool_2",
 958                        "read_file",
 959                        lines(input_file_content, 953..1010),
 960                    )],
 961                ),
 962                message(
 963                    Assistant,
 964                    [
 965                        text(
 966                            "Now let's look at another relevant test to understand how they're structured:",
 967                        ),
 968                        tool_use(
 969                            "tool_3",
 970                            "read_file",
 971                            ReadFileToolInput {
 972                                path: input_file_path.into(),
 973                                start_line: Some(1012),
 974                                end_line: Some(1120),
 975                                start_byte: None,
 976                                max_bytes: None,
 977                            },
 978                        ),
 979                    ],
 980                ),
 981                message(
 982                    User,
 983                    [tool_result(
 984                        "tool_3",
 985                        "read_file",
 986                        lines(input_file_content, 1012..1120),
 987                    )],
 988                ),
 989                message(
 990                    Assistant,
 991                    [
 992                        text("Now let's look at how `buffer_created` is implemented:"),
 993                        tool_use(
 994                            "tool_4",
 995                            "read_file",
 996                            ReadFileToolInput {
 997                                path: input_file_path.into(),
 998                                start_line: Some(271),
 999                                end_line: Some(276),
1000                                start_byte: None,
1001                                max_bytes: None,
1002                            },
1003                        ),
1004                    ],
1005                ),
1006                message(
1007                    User,
1008                    [tool_result(
1009                        "tool_4",
1010                        "read_file",
1011                        lines(input_file_content, 271..276),
1012                    )],
1013                ),
1014                message(
1015                    Assistant,
1016                    [
1017                        text(indoc! {"
1018                                Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:
1019
1020                                1. Create a file
1021                                2. Open a buffer for it
1022                                3. Track it in the action log
1023                                4. Modify the buffer
1024                                5. Call `buffer_created` again as if the file were new
1025                                6. Check that the changes are properly tracked
1026
1027                                Let's write the test:
1028                            "}),
1029                        tool_use(
1030                            "tool_5",
1031                            "edit_file",
1032                            EditFileToolInput {
1033                                display_description: edit_description.into(),
1034                                path: input_file_path.into(),
1035                                mode: EditFileMode::Edit,
1036                            },
1037                        ),
1038                    ],
1039                ),
1040            ],
1041            Some(input_file_content.into()),
1042            EvalAssertion::judge_diff(
1043                "A new test for overwritten files was created, without changing any previous test",
1044            ),
1045        ))
1046    });
1047}
1048
1049#[test]
1050#[cfg_attr(not(feature = "unit-eval"), ignore)]
1051fn eval_create_empty_file() {
1052    // Check that the Edit Agent can create a file without writing its
1053    // thoughts into it. This issue is not specific to empty files, but
1054    // it's easier to reproduce with them.
1055    //
1056    //  Model                          | Pass rate
1057    // ============================================
1058    //
1059    //  claude-3.7-sonnet              |  1.00 (2025-06-14)
1060    //  claude-sonnet-4                |  1.00 (2025-06-14)
1061    //  gemini-2.5-pro-preview-03-25   |  1.00 (2025-05-21)
1062    //  gemini-2.5-flash-preview-04-17 |  1.00 (2025-05-21)
1063    //  gpt-4.1                        |  1.00 (2025-05-21)
1064    //
1065    //
1066    // TODO: gpt-4.1-mini errored 38 times:
1067    // "data did not match any variant of untagged enum ResponseStreamResult"
1068
1069    let input_file_content = None;
1070    let expected_output_content = String::new();
1071
1072    eval_utils::eval(100, 0.99, mismatched_tag_threshold(0.05), move || {
1073        run_eval(EvalInput::from_conversation(
1074            vec![
1075                message(User, [text("Create a second empty todo file ")]),
1076                message(
1077                    Assistant,
1078                    [
1079                        text(formatdoc! {"
1080                            I'll help you create a second empty todo file.
1081                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
1082                            "}),
1083                        tool_use(
1084                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
1085                            "list_directory",
1086                            ListDirectoryToolInput {
1087                                path: "root".to_string(),
1088                            },
1089                        ),
1090                    ],
1091                ),
1092                message(
1093                    User,
1094                    [tool_result(
1095                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
1096                        "list_directory",
1097                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
1098                    )],
1099                ),
1100                message(
1101                    Assistant,
1102                    [
1103                        text(formatdoc! {"
1104                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
1105                        "}),
1106                        tool_use(
1107                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
1108                            "edit_file",
1109                            EditFileToolInput {
1110                                display_description: "Create empty TODO3 file".to_string(),
1111                                mode: EditFileMode::Create,
1112                                path: "root/TODO3".into(),
1113                            },
1114                        ),
1115                    ],
1116                ),
1117            ],
1118            input_file_content.clone(),
1119            // Bad behavior is to write something like
1120            // "I'll create an empty TODO3 file as requested."
1121            EvalAssertion::assert_eq(expected_output_content.clone()),
1122        ))
1123    });
1124}
1125
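// Helpers for building the synthetic conversations used by the evals above.

/// Builds an uncached `LanguageModelRequestMessage` with the given role and
/// contents.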
1126fn message(
1127    role: Role,
1128    contents: impl IntoIterator<Item = MessageContent>,
1129) -> LanguageModelRequestMessage {
1130    LanguageModelRequestMessage {
1131        role,
1132        content: contents.into_iter().collect(),
1133        cache: false,
1134        reasoning_details: None,
1135    }
1136}
1137
1138fn text(text: impl Into<String>) -> MessageContent {
1139    MessageContent::Text(text.into())
1140}
1141
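/// Returns the lines of `input` at the zero-based indices covered by `range`,
/// joined with newlines, e.g. `lines("a\nb\nc", 1..3)` yields `"b\nc"`.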
1142fn lines(input: &str, range: Range<usize>) -> String {
1143    input
1144        .lines()
1145        .skip(range.start)
1146        .take(range.len())
1147        .collect::<Vec<_>>()
1148        .join("\n")
1149}
1150
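/// Builds a completed `MessageContent::ToolUse` for the given tool-use id,
/// tool name, and serializable input.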
1151fn tool_use(
1152    id: impl Into<Arc<str>>,
1153    name: impl Into<Arc<str>>,
1154    input: impl Serialize,
1155) -> MessageContent {
1156    MessageContent::ToolUse(LanguageModelToolUse {
1157        id: LanguageModelToolUseId::from(id.into()),
1158        name: name.into(),
1159        raw_input: serde_json::to_string_pretty(&input).unwrap(),
1160        input: serde_json::to_value(input).unwrap(),
1161        is_input_complete: true,
1162        thought_signature: None,
1163    })
1164}
1165
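/// Builds a successful (non-error) `MessageContent::ToolResult` containing
/// plain-text content.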
1166fn tool_result(
1167    id: impl Into<Arc<str>>,
1168    name: impl Into<Arc<str>>,
1169    result: impl Into<Arc<str>>,
1170) -> MessageContent {
1171    MessageContent::ToolResult(LanguageModelToolResult {
1172        tool_use_id: LanguageModelToolUseId::from(id.into()),
1173        tool_name: name.into(),
1174        is_error: false,
1175        content: LanguageModelToolResultContent::Text(result.into()),
1176        output: None,
1177    })
1178}
1179
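/// A single eval case: the conversation to feed the edit agent, the
/// `edit_file` call extracted from its final assistant message, the initial
/// file content (if any), and the assertion to run against the result.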
1180#[derive(Clone)]
1181struct EvalInput {
1182    conversation: Vec<LanguageModelRequestMessage>,
1183    edit_file_input: EditFileToolInput,
1184    input_content: Option<String>,
1185    assertion: EvalAssertion,
1186}
1187
1188impl EvalInput {
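    /// Builds an `EvalInput` from a recorded conversation, extracting the
    /// `edit_file` tool use from the final assistant message. Panics if the
    /// conversation does not end with such a message.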
1189    fn from_conversation(
1190        conversation: Vec<LanguageModelRequestMessage>,
1191        input_content: Option<String>,
1192        assertion: EvalAssertion,
1193    ) -> Self {
1194        let msg = conversation.last().expect("Conversation must not be empty");
1195        if msg.role != Role::Assistant {
1196            panic!("Conversation must end with an assistant message");
1197        }
1198        let tool_use = msg
1199            .content
1200            .iter()
1201            .flat_map(|content| match content {
1202                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1203                    Some(tool_use)
1204                }
1205                _ => None,
1206            })
1207            .next()
1208            .expect("Conversation must end with an edit_file tool use")
1209            .clone();
1210
1211        let edit_file_input: EditFileToolInput = serde_json::from_value(tool_use.input).unwrap();
1212
1213        EvalInput {
1214            conversation,
1215            edit_file_input,
1216            input_content,
1217            assertion,
1218        }
1219    }
1220}
1221
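/// The result of running one eval: the file text before and after the edit,
/// the raw `EditAgentOutput`, and a textual diff of the change.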
1222#[derive(Clone)]
1223struct EvalSample {
1224    text_before: String,
1225    text_after: String,
1226    edit_output: EditAgentOutput,
1227    diff: String,
1228}
1229
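/// Object-safe adapter allowing async assertion closures to be stored behind
/// `Arc<dyn AssertionFn>`.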
1230trait AssertionFn: 'static + Send + Sync {
1231    fn assert<'a>(
1232        &'a self,
1233        sample: &'a EvalSample,
1234        judge_model: Arc<dyn LanguageModel>,
1235        cx: &'a mut TestAppContext,
1236    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
1237}
1238
1239impl<F> AssertionFn for F
1240where
1241    F: 'static
1242        + Send
1243        + Sync
1244        + AsyncFn(
1245            &EvalSample,
1246            Arc<dyn LanguageModel>,
1247            &mut TestAppContext,
1248        ) -> Result<EvalAssertionOutcome>,
1249{
1250    fn assert<'a>(
1251        &'a self,
1252        sample: &'a EvalSample,
1253        judge_model: Arc<dyn LanguageModel>,
1254        cx: &'a mut TestAppContext,
1255    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
1256        (self)(sample, judge_model, cx).boxed_local()
1257    }
1258}
1259
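/// A cloneable assertion over an `EvalSample`, scored either programmatically
/// or by a judge model.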
1260#[derive(Clone)]
1261struct EvalAssertion(Arc<dyn AssertionFn>);
1262
1263impl EvalAssertion {
1264    fn new<F>(f: F) -> Self
1265    where
1266        F: 'static
1267            + Send
1268            + Sync
1269            + AsyncFn(
1270                &EvalSample,
1271                Arc<dyn LanguageModel>,
1272                &mut TestAppContext,
1273            ) -> Result<EvalAssertionOutcome>,
1274    {
1275        EvalAssertion(Arc::new(f))
1276    }
1277
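    /// Scores 100 when the edited text equals `expected`, ignoring empty lines.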
1278    fn assert_eq(expected: impl Into<String>) -> Self {
1279        let expected = expected.into();
1280        Self::new(async move |sample, _judge, _cx| {
1281            Ok(EvalAssertionOutcome {
1282                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
1283                    100
1284                } else {
1285                    0
1286                },
1287                message: None,
1288            })
1289        })
1290    }
1291
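    /// Scores 100 when applying any of `expected_diffs` to the original text
    /// reproduces the edited text, ignoring empty lines.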
1292    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
1293        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
1294        Self::new(async move |sample, _judge, _cx| {
1295            let matches = expected_diffs.iter().any(|possible_diff| {
1296                let expected =
1297                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
1298                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
1299            });
1300
1301            Ok(EvalAssertionOutcome {
1302                score: if matches { 100 } else { 0 },
1303                message: None,
1304            })
1305        })
1306    }
1307
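    /// Asks the judge model to grade the produced diff against the given
    /// natural-language `assertions`, then parses the score out of a
    /// `<score>…</score>` tag in the judge's response.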
1308    fn judge_diff(assertions: &'static str) -> Self {
1309        Self::new(async move |sample, judge, cx| {
1310            let prompt = DiffJudgeTemplate {
1311                diff: sample.diff.clone(),
1312                assertions,
1313            }
1314            .render(&Templates::new())
1315            .unwrap();
1316
1317            let request = LanguageModelRequest {
1318                messages: vec![LanguageModelRequestMessage {
1319                    role: Role::User,
1320                    content: vec![prompt.into()],
1321                    cache: false,
1322                    reasoning_details: None,
1323                }],
1324                thinking_allowed: true,
1325                ..Default::default()
1326            };
1327            let mut response = retry_on_rate_limit(async || {
1328                Ok(judge
1329                    .stream_completion_text(request.clone(), &cx.to_async())
1330                    .await?)
1331            })
1332            .await?;
1333            let mut output = String::new();
1334            while let Some(chunk) = response.stream.next().await {
1335                let chunk = chunk?;
1336                output.push_str(&chunk);
1337            }
1338
1339            // Parse the score from the response
1340            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
1341            if let Some(captures) = re.captures(&output)
1342                && let Some(score_match) = captures.get(1)
1343            {
1344                let score = score_match.as_str().parse().unwrap_or(0);
1345                return Ok(EvalAssertionOutcome {
1346                    score,
1347                    message: Some(output),
1348                });
1349            }
1350
1351            anyhow::bail!("No score found in response. Raw output: {output}");
1352        })
1353    }
1354
1355    async fn run(
1356        &self,
1357        input: &EvalSample,
1358        judge_model: Arc<dyn LanguageModel>,
1359        cx: &mut TestAppContext,
1360    ) -> Result<EvalAssertionOutcome> {
1361        self.0.assert(input, judge_model, cx).await
1362    }
1363}
1364
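/// Runs a single eval on a fresh [`TestAppContext`] and converts the result
/// into an [`eval_utils::EvalOutput`]: assertion scores below 80 count as
/// failures, and any error is reported as [`eval_utils::OutcomeKind::Error`].
///
/// A hypothetical call, with placeholder inputs:
///
/// ```ignore
/// let output = run_eval(EvalInput::from_conversation(
///     conversation,
///     Some(file_content_before.into()),
///     EvalAssertion::assert_diff_any(vec![expected_diff]),
/// ));
/// ```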
1365fn run_eval(eval: EvalInput) -> eval_utils::EvalOutput<EditEvalMetadata> {
1366    let dispatcher = gpui::TestDispatcher::new(StdRng::from_os_rng());
1367    let mut cx = TestAppContext::build(dispatcher, None);
1368    let result = cx.executor().block_test(async {
1369        let test = EditAgentTest::new(&mut cx).await;
1370        test.eval(eval, &mut cx).await
1371    });
1372    cx.quit();
1373    match result {
1374        Ok(output) => eval_utils::EvalOutput {
1375            data: output.to_string(),
1376            outcome: if output.assertion.score < 80 {
1377                eval_utils::OutcomeKind::Failed
1378            } else {
1379                eval_utils::OutcomeKind::Passed
1380            },
1381            metadata: EditEvalMetadata {
1382                tags: output.sample.edit_output.parser_metrics.tags,
1383                mismatched_tags: output.sample.edit_output.parser_metrics.mismatched_tags,
1384            },
1385        },
1386        Err(e) => eval_utils::EvalOutput {
1387            data: format!("{e:?}"),
1388            outcome: eval_utils::OutcomeKind::Error,
1389            metadata: EditEvalMetadata {
1390                tags: 0,
1391                mismatched_tags: 0,
1392            },
1393        },
1394    }
1395}
1396
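/// The sample produced by the agent paired with the assertion outcome. The
/// `Display` impl renders the score, any judge message, the diff, the parser
/// metrics, and the raw edits; [`run_eval`] stores that rendering as the
/// eval's data.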
1397#[derive(Clone)]
1398struct EditEvalOutput {
1399    sample: EvalSample,
1400    assertion: EvalAssertionOutcome,
1401}
1402
1403impl Display for EditEvalOutput {
1404    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1405        writeln!(f, "Score: {:?}", self.assertion.score)?;
1406        if let Some(message) = self.assertion.message.as_ref() {
1407            writeln!(f, "Message: {}", message)?;
1408        }
1409
1410        writeln!(f, "Diff:\n{}", self.sample.diff)?;
1411
1412        writeln!(
1413            f,
1414            "Parser Metrics:\n{:#?}",
1415            self.sample.edit_output.parser_metrics
1416        )?;
1417        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1418        Ok(())
1419    }
1420}
1421
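/// Test harness that wires a real [`EditAgent`] to a `FakeFs`-backed project
/// and to live agent and judge models. The models are selected via the
/// `ZED_AGENT_MODEL` and `ZED_JUDGE_MODEL` environment variables and default
/// to `anthropic/claude-sonnet-4-latest`.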
1422struct EditAgentTest {
1423    agent: EditAgent,
1424    project: Entity<Project>,
1425    judge_model: Arc<dyn LanguageModel>,
1426}
1427
1428impl EditAgentTest {
1429    async fn new(cx: &mut TestAppContext) -> Self {
1430        cx.executor().allow_parking();
1431
1432        let fs = FakeFs::new(cx.executor());
1433        cx.update(|cx| {
1434            settings::init(cx);
1435            gpui_tokio::init(cx);
1436            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1437            cx.set_http_client(http_client);
1438            let client = Client::production(cx);
1439            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1441            language_model::init(client.clone(), cx);
1442            language_models::init(user_store, client.clone(), cx);
1443        });
1444
1445        fs.insert_tree("/root", json!({})).await;
1446        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1447        let agent_model = SelectedModel::from_str(
1448            &std::env::var("ZED_AGENT_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
1449        )
1450        .unwrap();
1451        let judge_model = SelectedModel::from_str(
1452            &std::env::var("ZED_JUDGE_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
1453        )
1454        .unwrap();
1455
1456        let authenticate_provider_tasks = cx.update(|cx| {
1457            LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
1458                registry
1459                    .providers()
1460                    .iter()
1461                    .map(|p| p.authenticate(cx))
1462                    .collect::<Vec<_>>()
1463            })
1464        });
1465        let (agent_model, judge_model) = cx
1466            .update(|cx| {
1467                cx.spawn(async move |cx| {
1468                    futures::future::join_all(authenticate_provider_tasks).await;
1469                    let agent_model = Self::load_model(&agent_model, cx).await;
1470                    let judge_model = Self::load_model(&judge_model, cx).await;
1471                    (agent_model.unwrap(), judge_model.unwrap())
1472                })
1473            })
1474            .await;
1475        let action_log = cx.new(|_| ActionLog::new(project.clone()));
1476
1477        let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();
1478
1479        Self {
1480            agent: EditAgent::new(
1481                agent_model,
1482                project.clone(),
1483                action_log,
1484                Templates::new(),
1485                edit_format,
1486            ),
1487            project,
1488            judge_model,
1489        }
1490    }
1491
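    /// Authenticates the selected model's provider and looks the model up in
    /// the global [`LanguageModelRegistry`], panicking if it isn't available.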
1492    async fn load_model(
1493        selected_model: &SelectedModel,
1494        cx: &mut AsyncApp,
1495    ) -> Result<Arc<dyn LanguageModel>> {
1496        cx.update(|cx| {
1497            let registry = LanguageModelRegistry::read_global(cx);
1498            let provider = registry
1499                .provider(&selected_model.provider)
1500                .expect("Provider not found");
1501            provider.authenticate(cx)
1502        })?
1503        .await?;
1504        cx.update(|cx| {
1505            let models = LanguageModelRegistry::read_global(cx);
1506            let model = models
1507                .available_models(cx)
1508                .find(|model| {
1509                    model.provider_id() == selected_model.provider
1510                        && model.id() == selected_model.model
1511                })
1512                .unwrap_or_else(|| panic!("Model {} not found", selected_model.model.0));
1513            model
1514        })
1515    }
1516
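    /// Runs one eval end to end: opens the target buffer, builds the request
    /// (prepending a rendered system prompt unless the conversation already
    /// starts with one), invokes the agent's `edit` or `overwrite` path with
    /// rate-limit retries, and grades the resulting buffer text with the
    /// eval's assertion.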
1517    async fn eval(&self, mut eval: EvalInput, cx: &mut TestAppContext) -> Result<EditEvalOutput> {
1518        // Make sure the last message in the conversation is cached.
1519        eval.conversation.last_mut().unwrap().cache = true;
1520
1521        let path = self
1522            .project
1523            .read_with(cx, |project, cx| {
1524                project.find_project_path(eval.edit_file_input.path, cx)
1525            })
1526            .unwrap();
1527        let buffer = self
1528            .project
1529            .update(cx, |project, cx| project.open_buffer(path, cx))
1530            .await
1531            .unwrap();
1532
1533        let tools = crate::built_in_tools().collect::<Vec<_>>();
1534
1535        let system_prompt = {
1536            let worktrees = vec![WorktreeContext {
1537                root_name: "root".to_string(),
1538                abs_path: Path::new("/path/to/root").into(),
1539                rules_file: None,
1540            }];
1541            let project_context = ProjectContext::new(worktrees, Vec::default());
1542            let tool_names = tools
1543                .iter()
1544                .map(|tool| tool.name.clone().into())
1545                .collect::<Vec<_>>();
1546            let template = crate::SystemPromptTemplate {
1547                project: &project_context,
1548                available_tools: tool_names,
1549                model_name: None,
1550            };
1551            let templates = Templates::new();
1552            template.render(&templates).unwrap()
1553        };
1554
1555        let has_system_prompt = eval
1556            .conversation
1557            .first()
1558            .is_some_and(|msg| msg.role == Role::System);
1559        let messages = if has_system_prompt {
1560            eval.conversation
1561        } else {
1562            [LanguageModelRequestMessage {
1563                role: Role::System,
1564                content: vec![MessageContent::Text(system_prompt)],
1565                cache: true,
1566                reasoning_details: None,
1567            }]
1568            .into_iter()
1569            .chain(eval.conversation)
1570            .collect::<Vec<_>>()
1571        };
1572
1573        let conversation = LanguageModelRequest {
1574            messages,
1575            tools,
1576            thinking_allowed: true,
1577            ..Default::default()
1578        };
1579
1580        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1581            if let Some(input_content) = eval.input_content.as_deref() {
1582                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1583            }
1584            retry_on_rate_limit(async || {
1585                self.agent
1586                    .edit(
1587                        buffer.clone(),
1588                        eval.edit_file_input.display_description.clone(),
1589                        &conversation,
1590                        &mut cx.to_async(),
1591                    )
1592                    .0
1593                    .await
1594            })
1595            .await?
1596        } else {
1597            retry_on_rate_limit(async || {
1598                self.agent
1599                    .overwrite(
1600                        buffer.clone(),
1601                        eval.edit_file_input.display_description.clone(),
1602                        &conversation,
1603                        &mut cx.to_async(),
1604                    )
1605                    .0
1606                    .await
1607            })
1608            .await?
1609        };
1610
1611        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1612        let sample = EvalSample {
1613            edit_output,
1614            diff: language::unified_diff(
1615                eval.input_content.as_deref().unwrap_or_default(),
1616                &buffer_text,
1617            ),
1618            text_before: eval.input_content.unwrap_or_default(),
1619            text_after: buffer_text,
1620        };
1621        let assertion = eval
1622            .assertion
1623            .run(&sample, self.judge_model.clone(), cx)
1624            .await?;
1625
1626        Ok(EditEvalOutput { assertion, sample })
1627    }
1628}
1629
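/// Retries `request` up to `MAX_RETRIES` times. Rate-limit and overload
/// responses wait for the server-provided `retry_after` (or a 5 second
/// default); transient I/O and internal server errors back off exponentially,
/// capped at 30 seconds. Either delay gets random jitter added before
/// retrying; all other errors are returned immediately.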
1630async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
1631    const MAX_RETRIES: usize = 20;
1632    let mut attempt = 0;
1633
1634    loop {
1635        attempt += 1;
1636        let response = request().await;
1637
1638        if attempt >= MAX_RETRIES {
1639            return response;
1640        }
1641
1642        let retry_delay = match &response {
1643            Ok(_) => None,
1644            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
1645                Some(err) => match &err {
1646                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
1647                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
1648                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
1649                    }
1650                    LanguageModelCompletionError::UpstreamProviderError {
1651                        status,
1652                        retry_after,
1653                        ..
1654                    } => {
1655                        // Only retry 429 (Too Many Requests), 503 (Service Unavailable), and the non-standard 529 "overloaded" status some providers return
1656                        let should_retry = matches!(
1657                            *status,
1658                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
1659                        ) || status.as_u16() == 529;
1660
1661                        if should_retry {
1662                            // Use server-provided retry_after if available, otherwise use default
1663                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
1664                        } else {
1665                            None
1666                        }
1667                    }
1668                    LanguageModelCompletionError::ApiReadResponseError { .. }
1669                    | LanguageModelCompletionError::ApiInternalServerError { .. }
1670                    | LanguageModelCompletionError::HttpSend { .. } => {
1671                        // Exponential backoff for transient I/O and internal server errors
1672                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
1673                    }
1674                    _ => None,
1675                },
1676                _ => None,
1677            },
1678        };
1679
1680        if let Some(retry_after) = retry_delay {
1681            let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
1682            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
1683            Timer::after(retry_after + jitter).await;
1684        } else {
1685            return response;
1686        }
1687    }
1688}
1689
1690#[derive(Clone, Debug, Eq, PartialEq, Hash)]
1691struct EvalAssertionOutcome {
1692    score: usize,
1693    message: Option<String>,
1694}
1695
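/// Context rendered into the `diff_judge.hbs` prompt template, which asks the
/// judge model to score a diff against a set of assertions.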
1696#[derive(Serialize)]
1697pub struct DiffJudgeTemplate {
1698    diff: String,
1699    assertions: &'static str,
1700}
1701
1702impl Template for DiffJudgeTemplate {
1703    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
1704}
1705
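/// Removes empty and whitespace-only lines so comparisons ignore differences
/// in blank lines.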
1706fn strip_empty_lines(text: &str) -> String {
1707    text.lines()
1708        .filter(|line| !line.trim().is_empty())
1709        .collect::<Vec<_>>()
1710        .join("\n")
1711}