evals.rs

   1use super::*;
   2use crate::{
   3    AgentTool, EditFileMode, EditFileTool, EditFileToolInput, GrepTool, GrepToolInput,
   4    ListDirectoryTool, ListDirectoryToolInput, ReadFileTool, ReadFileToolInput,
   5};
   6use Role::*;
   7use client::{Client, UserStore};
   8use eval_utils::{EvalOutput, EvalOutputProcessor, OutcomeKind};
   9use fs::FakeFs;
  10use futures::{FutureExt, future::LocalBoxFuture};
  11use gpui::{AppContext, TestAppContext};
  12use http_client::StatusCode;
  13use indoc::{formatdoc, indoc};
  14use language_model::{
  15    LanguageModelRegistry, LanguageModelToolResult, LanguageModelToolResultContent,
  16    LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
  17};
  18use project::Project;
  19use prompt_store::{ProjectContext, WorktreeContext};
  20use rand::prelude::*;
  21use reqwest_client::ReqwestClient;
  22use serde_json::json;
  23use std::{
  24    fmt::{self, Display},
  25    path::Path,
  26    str::FromStr,
  27    time::Duration,
  28};
  29use util::path;
  30
  31#[derive(Default, Clone, Debug)]
  32struct EditAgentOutputProcessor {
  33    mismatched_tag_threshold: f32,
  34    cumulative_tags: usize,
  35    cumulative_mismatched_tags: usize,
  36    eval_outputs: Vec<EvalOutput<EditEvalMetadata>>,
  37}
  38
  39fn mismatched_tag_threshold(mismatched_tag_threshold: f32) -> EditAgentOutputProcessor {
  40    EditAgentOutputProcessor {
  41        mismatched_tag_threshold,
  42        cumulative_tags: 0,
  43        cumulative_mismatched_tags: 0,
  44        eval_outputs: Vec::new(),
  45    }
  46}
  47
  48#[derive(Clone, Debug)]
  49struct EditEvalMetadata {
  50    tags: usize,
  51    mismatched_tags: usize,
  52}
  53
  54impl EvalOutputProcessor for EditAgentOutputProcessor {
  55    type Metadata = EditEvalMetadata;
  56
  57    fn process(&mut self, output: &EvalOutput<Self::Metadata>) {
  58        if matches!(output.outcome, OutcomeKind::Passed | OutcomeKind::Failed) {
  59            self.cumulative_mismatched_tags += output.metadata.mismatched_tags;
  60            self.cumulative_tags += output.metadata.tags;
  61            self.eval_outputs.push(output.clone());
  62        }
  63    }
  64
  65    fn assert(&mut self) {
  66        let mismatched_tag_ratio =
  67            self.cumulative_mismatched_tags as f32 / self.cumulative_tags as f32;
  68        if mismatched_tag_ratio > self.mismatched_tag_threshold {
  69            for eval_output in &self.eval_outputs {
  70                println!("{}", eval_output.data);
  71            }
  72            panic!(
  73                "Too many mismatched tags: {:?}",
  74                self.cumulative_mismatched_tags
  75            );
  76        }
  77    }
  78}
  79
  80#[test]
  81#[cfg_attr(not(feature = "unit-eval"), ignore)]
  82fn eval_extract_handle_command_output() {
  83    // Test how well agent generates multiple edit hunks.
  84    //
  85    // Model                       | Pass rate
  86    // ----------------------------|----------
  87    // claude-3.7-sonnet           |  0.99 (2025-06-14)
  88    // claude-sonnet-4             |  0.97 (2025-06-14)
  89    // gemini-2.5-pro-06-05        |  0.98 (2025-06-16)
  90    // gemini-2.5-flash            |  0.11 (2025-05-22)
  91
  92    let input_file_path = "root/blame.rs";
  93    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
  94    let possible_diffs = vec![
  95        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
  96        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
  97        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
  98        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
  99        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
 100        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
 101        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
 102    ];
 103    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
 104    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
 105        run_eval(EvalInput::from_conversation(
 106            vec![
 107                message(
 108                    User,
 109                    [text(formatdoc! {"
 110                            Read the `{input_file_path}` file and extract a method in
 111                            the final stanza of `run_git_blame` to deal with command failures,
 112                            call it `handle_command_output` and take the std::process::Output as the only parameter.
 113                            Do not document the method and do not add any comments.
 114
 115                            Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
 116                        "})],
 117                ),
 118                message(
 119                    Assistant,
 120                    [tool_use(
 121                        "tool_1",
 122                        ReadFileTool::NAME,
 123                        ReadFileToolInput {
 124                            path: input_file_path.into(),
 125                            start_line: None,
 126                            end_line: None,
 127                        },
 128                    )],
 129                ),
 130                message(
 131                    User,
 132                    [tool_result(
 133                        "tool_1",
 134                        ReadFileTool::NAME,
 135                        input_file_content,
 136                    )],
 137                ),
 138                message(
 139                    Assistant,
 140                    [tool_use(
 141                        "tool_2",
 142                        EditFileTool::NAME,
 143                        EditFileToolInput {
 144                            display_description: edit_description.into(),
 145                            path: input_file_path.into(),
 146                            mode: EditFileMode::Edit,
 147                        },
 148                    )],
 149                ),
 150            ],
 151            Some(input_file_content.into()),
 152            EvalAssertion::assert_diff_any(possible_diffs.clone()),
 153        ))
 154    });
 155}
 156
 157#[test]
 158#[cfg_attr(not(feature = "unit-eval"), ignore)]
 159fn eval_delete_run_git_blame() {
 160    // Model                       | Pass rate
 161    // ----------------------------|----------
 162    // claude-3.7-sonnet           | 1.0  (2025-06-14)
 163    // claude-sonnet-4             | 0.96 (2025-06-14)
 164    // gemini-2.5-pro-06-05        | 1.0  (2025-06-16)
 165    // gemini-2.5-flash            |
 166
 167    let input_file_path = "root/blame.rs";
 168    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
 169    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
 170    let edit_description = "Delete the `run_git_blame` function.";
 171
 172    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
 173        run_eval(EvalInput::from_conversation(
 174            vec![
 175                message(
 176                    User,
 177                    [text(formatdoc! {"
 178                            Read the `{input_file_path}` file and delete `run_git_blame`. Just that
 179                            one function, not its usages.
 180                        "})],
 181                ),
 182                message(
 183                    Assistant,
 184                    [tool_use(
 185                        "tool_1",
 186                        ReadFileTool::NAME,
 187                        ReadFileToolInput {
 188                            path: input_file_path.into(),
 189                            start_line: None,
 190                            end_line: None,
 191                        },
 192                    )],
 193                ),
 194                message(
 195                    User,
 196                    [tool_result(
 197                        "tool_1",
 198                        ReadFileTool::NAME,
 199                        input_file_content,
 200                    )],
 201                ),
 202                message(
 203                    Assistant,
 204                    [tool_use(
 205                        "tool_2",
 206                        EditFileTool::NAME,
 207                        EditFileToolInput {
 208                            display_description: edit_description.into(),
 209                            path: input_file_path.into(),
 210                            mode: EditFileMode::Edit,
 211                        },
 212                    )],
 213                ),
 214            ],
 215            Some(input_file_content.into()),
 216            EvalAssertion::assert_eq(output_file_content),
 217        ))
 218    });
 219}
 220
 221#[test]
 222#[cfg_attr(not(feature = "unit-eval"), ignore)]
 223fn eval_translate_doc_comments() {
 224    //  Model                          | Pass rate
 225    // ============================================
 226    //
 227    //  claude-3.7-sonnet              |  1.0  (2025-06-14)
 228    //  claude-sonnet-4                |  1.0  (2025-06-14)
 229    //  gemini-2.5-pro-preview-03-25   |  1.0  (2025-05-22)
 230    //  gemini-2.5-flash-preview-04-17 |
 231
 232    let input_file_path = "root/canvas.rs";
 233    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
 234    let edit_description = "Translate all doc comments to Italian";
 235
 236    eval_utils::eval(200, 1., mismatched_tag_threshold(0.05), move || {
 237        run_eval(EvalInput::from_conversation(
 238            vec![
 239                message(
 240                    User,
 241                    [text(formatdoc! {"
 242                            Read the {input_file_path} file and edit it (without overwriting it),
 243                            translating all the doc comments to italian.
 244                        "})],
 245                ),
 246                message(
 247                    Assistant,
 248                    [tool_use(
 249                        "tool_1",
 250                        ReadFileTool::NAME,
 251                        ReadFileToolInput {
 252                            path: input_file_path.into(),
 253                            start_line: None,
 254                            end_line: None,
 255                        },
 256                    )],
 257                ),
 258                message(
 259                    User,
 260                    [tool_result(
 261                        "tool_1",
 262                        ReadFileTool::NAME,
 263                        input_file_content,
 264                    )],
 265                ),
 266                message(
 267                    Assistant,
 268                    [tool_use(
 269                        "tool_2",
 270                        EditFileTool::NAME,
 271                        EditFileToolInput {
 272                            display_description: edit_description.into(),
 273                            path: input_file_path.into(),
 274                            mode: EditFileMode::Edit,
 275                        },
 276                    )],
 277                ),
 278            ],
 279            Some(input_file_content.into()),
 280            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
 281        ))
 282    });
 283}
 284
 285#[test]
 286#[cfg_attr(not(feature = "unit-eval"), ignore)]
 287fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
 288    //  Model                          | Pass rate
 289    // ============================================
 290    //
 291    //  claude-3.7-sonnet              |  0.96 (2025-06-14)
 292    //  claude-sonnet-4                |  0.11 (2025-06-14)
 293    //  gemini-2.5-pro-preview-latest  |  0.99 (2025-06-16)
 294    //  gemini-2.5-flash-preview-04-17 |
 295
 296    let input_file_path = "root/lib.rs";
 297    let input_file_content =
 298        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
 299    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
 300
 301    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
 302        run_eval(EvalInput::from_conversation(
 303            vec![
 304                message(
 305                    User,
 306                    [text(formatdoc! {"
 307                            Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
 308                            Use `ureq` to download the SDK for the current platform and architecture.
 309                            Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
 310                            Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
 311                            that's inside of the archive.
 312                            Don't re-download the SDK if that executable already exists.
 313
 314                            Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}
 315
 316                            Here are the available wasi-sdk assets:
 317                            - wasi-sdk-25.0-x86_64-macos.tar.gz
 318                            - wasi-sdk-25.0-arm64-macos.tar.gz
 319                            - wasi-sdk-25.0-x86_64-linux.tar.gz
 320                            - wasi-sdk-25.0-arm64-linux.tar.gz
 321                            - wasi-sdk-25.0-x86_64-linux.tar.gz
 322                            - wasi-sdk-25.0-arm64-linux.tar.gz
 323                            - wasi-sdk-25.0-x86_64-windows.tar.gz
 324                        "})],
 325                ),
 326                message(
 327                    Assistant,
 328                    [tool_use(
 329                        "tool_1",
 330                        ReadFileTool::NAME,
 331                        ReadFileToolInput {
 332                            path: input_file_path.into(),
 333                            start_line: Some(971),
 334                            end_line: Some(1050),
 335                        },
 336                    )],
 337                ),
 338                message(
 339                    User,
 340                    [tool_result(
 341                        "tool_1",
 342                        ReadFileTool::NAME,
 343                        lines(input_file_content, 971..1050),
 344                    )],
 345                ),
 346                message(
 347                    Assistant,
 348                    [tool_use(
 349                        "tool_2",
 350                        ReadFileTool::NAME,
 351                        ReadFileToolInput {
 352                            path: input_file_path.into(),
 353                            start_line: Some(1050),
 354                            end_line: Some(1100),
 355                        },
 356                    )],
 357                ),
 358                message(
 359                    User,
 360                    [tool_result(
 361                        "tool_2",
 362                        ReadFileTool::NAME,
 363                        lines(input_file_content, 1050..1100),
 364                    )],
 365                ),
 366                message(
 367                    Assistant,
 368                    [tool_use(
 369                        "tool_3",
 370                        ReadFileTool::NAME,
 371                        ReadFileToolInput {
 372                            path: input_file_path.into(),
 373                            start_line: Some(1100),
 374                            end_line: Some(1150),
 375                        },
 376                    )],
 377                ),
 378                message(
 379                    User,
 380                    [tool_result(
 381                        "tool_3",
 382                        ReadFileTool::NAME,
 383                        lines(input_file_content, 1100..1150),
 384                    )],
 385                ),
 386                message(
 387                    Assistant,
 388                    [tool_use(
 389                        "tool_4",
 390                        EditFileTool::NAME,
 391                        EditFileToolInput {
 392                            display_description: edit_description.into(),
 393                            path: input_file_path.into(),
 394                            mode: EditFileMode::Edit,
 395                        },
 396                    )],
 397                ),
 398            ],
 399            Some(input_file_content.into()),
 400            EvalAssertion::judge_diff(indoc! {"
 401                    - The compile_parser_to_wasm method has been changed to use wasi-sdk
 402                    - ureq is used to download the SDK for current platform and architecture
 403                "}),
 404        ))
 405    });
 406}
 407
 408#[test]
 409#[cfg_attr(not(feature = "unit-eval"), ignore)]
 410fn eval_disable_cursor_blinking() {
 411    //  Model                          | Pass rate
 412    // ============================================
 413    //
 414    //  claude-3.7-sonnet              |  0.59 (2025-07-14)
 415    //  claude-sonnet-4                |  0.81 (2025-07-14)
 416    //  gemini-2.5-pro                 |  0.95 (2025-07-14)
 417    //  gemini-2.5-flash-preview-04-17 |  0.78 (2025-07-14)
 418
 419    let input_file_path = "root/editor.rs";
 420    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
 421    let edit_description = "Comment out the call to `BlinkManager::enable`";
 422    let possible_diffs = vec![
 423        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
 424        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
 425        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
 426        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
 427    ];
 428    eval_utils::eval(100, 0.51, mismatched_tag_threshold(0.05), move || {
 429        run_eval(EvalInput::from_conversation(
 430            vec![
 431                message(User, [text("Let's research how to cursor blinking works.")]),
 432                message(
 433                    Assistant,
 434                    [tool_use(
 435                        "tool_1",
 436                        GrepTool::NAME,
 437                        GrepToolInput {
 438                            regex: "blink".into(),
 439                            include_pattern: None,
 440                            offset: 0,
 441                            case_sensitive: false,
 442                        },
 443                    )],
 444                ),
 445                message(
 446                    User,
 447                    [tool_result(
 448                        "tool_1",
 449                        GrepTool::NAME,
 450                        [
 451                            lines(input_file_content, 100..400),
 452                            lines(input_file_content, 800..1300),
 453                            lines(input_file_content, 1600..2000),
 454                            lines(input_file_content, 5000..5500),
 455                            lines(input_file_content, 8000..9000),
 456                            lines(input_file_content, 18455..18470),
 457                            lines(input_file_content, 20000..20500),
 458                            lines(input_file_content, 21000..21300),
 459                        ]
 460                        .join("Match found:\n\n"),
 461                    )],
 462                ),
 463                message(
 464                    User,
 465                    [text(indoc! {"
 466                            Comment out the lines that interact with the BlinkManager.
 467                            Keep the outer `update` blocks, but comments everything that's inside (including if statements).
 468                            Don't add additional comments.
 469                        "})],
 470                ),
 471                message(
 472                    Assistant,
 473                    [tool_use(
 474                        "tool_4",
 475                        EditFileTool::NAME,
 476                        EditFileToolInput {
 477                            display_description: edit_description.into(),
 478                            path: input_file_path.into(),
 479                            mode: EditFileMode::Edit,
 480                        },
 481                    )],
 482                ),
 483            ],
 484            Some(input_file_content.into()),
 485            EvalAssertion::assert_diff_any(possible_diffs.clone()),
 486        ))
 487    });
 488}
 489
 490#[test]
 491#[cfg_attr(not(feature = "unit-eval"), ignore)]
 492fn eval_from_pixels_constructor() {
 493    // Results for 2025-06-13
 494    //
 495    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
 496    // value. Higher values improve the pass rate but may sometimes cause
 497    // edits to be misapplied. In the context of this eval, this means
 498    // the agent might add from_pixels tests in incorrect locations
 499    // (e.g., at the beginning of the file), yet the evaluation may still
 500    // rate it highly.
 501    //
 502    //  Model                          | Date        | Pass rate
 503    // =========================================================
 504    //  claude-4.0-sonnet              | 2025-06-14  | 0.99
 505    //  claude-3.7-sonnet              | 2025-06-14  | 0.88
 506    //  gemini-2.5-pro-preview-06-05   | 2025-06-16  | 0.98
 507
 508    let input_file_path = "root/canvas.rs";
 509    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
 510    let edit_description = "Implement from_pixels constructor and add tests.";
 511
 512    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.25), move || {
 513        run_eval(EvalInput::from_conversation(
 514            vec![
 515                message(
 516                    User,
 517                    [text(indoc! {"
 518                            Introduce a new `from_pixels` constructor in Canvas and
 519                            also add tests for it in the same file.
 520                        "})],
 521                ),
 522                message(
 523                    Assistant,
 524                    [tool_use(
 525                        "tool_1",
 526                        ReadFileTool::NAME,
 527                        ReadFileToolInput {
 528                            path: input_file_path.into(),
 529                            start_line: None,
 530                            end_line: None,
 531                        },
 532                    )],
 533                ),
 534                message(
 535                    User,
 536                    [tool_result(
 537                        "tool_1",
 538                        ReadFileTool::NAME,
 539                        input_file_content,
 540                    )],
 541                ),
 542                message(
 543                    Assistant,
 544                    [tool_use(
 545                        "tool_2",
 546                        GrepTool::NAME,
 547                        GrepToolInput {
 548                            regex: "mod\\s+tests".into(),
 549                            include_pattern: Some("font-kit/src/canvas.rs".into()),
 550                            offset: 0,
 551                            case_sensitive: false,
 552                        },
 553                    )],
 554                ),
 555                message(
 556                    User,
 557                    [tool_result("tool_2", GrepTool::NAME, "No matches found")],
 558                ),
 559                message(
 560                    Assistant,
 561                    [tool_use(
 562                        "tool_3",
 563                        GrepTool::NAME,
 564                        GrepToolInput {
 565                            regex: "mod\\s+tests".into(),
 566                            include_pattern: Some("font-kit/src/**/*.rs".into()),
 567                            offset: 0,
 568                            case_sensitive: false,
 569                        },
 570                    )],
 571                ),
 572                message(
 573                    User,
 574                    [tool_result("tool_3", GrepTool::NAME, "No matches found")],
 575                ),
 576                message(
 577                    Assistant,
 578                    [tool_use(
 579                        "tool_4",
 580                        GrepTool::NAME,
 581                        GrepToolInput {
 582                            regex: "#\\[test\\]".into(),
 583                            include_pattern: Some("font-kit/src/**/*.rs".into()),
 584                            offset: 0,
 585                            case_sensitive: false,
 586                        },
 587                    )],
 588                ),
 589                message(
 590                    User,
 591                    [tool_result(
 592                        "tool_4",
 593                        GrepTool::NAME,
 594                        indoc! {"
 595                                Found 6 matches:
 596
 597                                ## Matches in font-kit/src/loaders/core_text.rs
 598
 599                                ### mod test › L926-936
 600                                ```
 601                                mod test {
 602                                    use super::Font;
 603                                    use crate::properties::{Stretch, Weight};
 604
 605                                    #[cfg(feature = \"source\")]
 606                                    use crate::source::SystemSource;
 607
 608                                    static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";
 609
 610                                    #[cfg(feature = \"source\")]
 611                                    #[test]
 612                                ```
 613
 614                                55 lines remaining in ancestor node. Read the file to see all.
 615
 616                                ### mod test › L947-951
 617                                ```
 618                                    }
 619
 620                                    #[test]
 621                                    fn test_core_text_to_css_font_weight() {
 622                                        // Exact matches
 623                                ```
 624
 625                                ### mod test › L959-963
 626                                ```
 627                                    }
 628
 629                                    #[test]
 630                                    fn test_core_text_to_css_font_stretch() {
 631                                        // Exact matches
 632                                ```
 633
 634                                ## Matches in font-kit/src/loaders/freetype.rs
 635
 636                                ### mod test › L1238-1248
 637                                ```
 638                                mod test {
 639                                    use crate::loaders::freetype::Font;
 640
 641                                    static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
 642                                    static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";
 643
 644                                    #[test]
 645                                    fn get_pcf_postscript_name() {
 646                                        let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
 647                                        assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
 648                                    }
 649                                ```
 650
 651                                1 lines remaining in ancestor node. Read the file to see all.
 652
 653                                ## Matches in font-kit/src/sources/core_text.rs
 654
 655                                ### mod test › L265-275
 656                                ```
 657                                mod test {
 658                                    use crate::properties::{Stretch, Weight};
 659
 660                                    #[test]
 661                                    fn test_css_to_core_text_font_weight() {
 662                                        // Exact matches
 663                                        assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
 664                                        assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
 665                                        assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
 666                                        assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);
 667
 668                                ```
 669
 670                                27 lines remaining in ancestor node. Read the file to see all.
 671
 672                                ### mod test › L278-282
 673                                ```
 674                                    }
 675
 676                                    #[test]
 677                                    fn test_css_to_core_text_font_stretch() {
 678                                        // Exact matches
 679                                ```
 680                            "},
 681                    )],
 682                ),
 683                message(
 684                    Assistant,
 685                    [tool_use(
 686                        "tool_5",
 687                        EditFileTool::NAME,
 688                        EditFileToolInput {
 689                            display_description: edit_description.into(),
 690                            path: input_file_path.into(),
 691                            mode: EditFileMode::Edit,
 692                        },
 693                    )],
 694                ),
 695            ],
 696            Some(input_file_content.into()),
 697            EvalAssertion::judge_diff(indoc! {"
 698                        - The diff contains a new `from_pixels` constructor
 699                        - The diff contains new tests for the `from_pixels` constructor
 700                    "}),
 701        ))
 702    });
 703}
 704
 705#[test]
 706#[cfg_attr(not(feature = "unit-eval"), ignore)]
 707fn eval_zode() {
 708    //  Model                          | Pass rate
 709    // ============================================
 710    //
 711    //  claude-3.7-sonnet              |  1.0 (2025-06-14)
 712    //  claude-sonnet-4                |  1.0 (2025-06-14)
 713    //  gemini-2.5-pro-preview-03-25   |  1.0 (2025-05-22)
 714    //  gemini-2.5-flash-preview-04-17 |  1.0 (2025-05-22)
 715
 716    let input_file_path = "root/zode.py";
 717    let input_content = None;
 718    let edit_description = "Create the main Zode CLI script";
 719
 720    eval_utils::eval(50, 1., mismatched_tag_threshold(0.05), move || {
 721        run_eval(EvalInput::from_conversation(
 722            vec![
 723                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
 724                message(
 725                    Assistant,
 726                    [
 727                        tool_use(
 728                            "tool_1",
 729                            ReadFileTool::NAME,
 730                            ReadFileToolInput {
 731                                path: "root/eval/react.py".into(),
 732                                start_line: None,
 733                                end_line: None,
 734                            },
 735                        ),
 736                        tool_use(
 737                            "tool_2",
 738                            ReadFileTool::NAME,
 739                            ReadFileToolInput {
 740                                path: "root/eval/react_test.py".into(),
 741                                start_line: None,
 742                                end_line: None,
 743                            },
 744                        ),
 745                    ],
 746                ),
 747                message(
 748                    User,
 749                    [
 750                        tool_result(
 751                            "tool_1",
 752                            ReadFileTool::NAME,
 753                            include_str!("evals/fixtures/zode/react.py"),
 754                        ),
 755                        tool_result(
 756                            "tool_2",
 757                            ReadFileTool::NAME,
 758                            include_str!("evals/fixtures/zode/react_test.py"),
 759                        ),
 760                    ],
 761                ),
 762                message(
 763                    Assistant,
 764                    [
 765                        text(
 766                            "Now that I understand what we need to build, I'll create the main Python script:",
 767                        ),
 768                        tool_use(
 769                            "tool_3",
 770                            EditFileTool::NAME,
 771                            EditFileToolInput {
 772                                display_description: edit_description.into(),
 773                                path: input_file_path.into(),
 774                                mode: EditFileMode::Create,
 775                            },
 776                        ),
 777                    ],
 778                ),
 779            ],
 780            input_content.clone(),
 781            EvalAssertion::new(async move |sample, _, _cx| {
 782                let invalid_starts = [' ', '`', '\n'];
 783                let mut message = String::new();
 784                for start in invalid_starts {
 785                    if sample.text_after.starts_with(start) {
 786                        message.push_str(&format!("The sample starts with a {:?}\n", start));
 787                        break;
 788                    }
 789                }
 790                // Remove trailing newline.
 791                message.pop();
 792
 793                if message.is_empty() {
 794                    Ok(EvalAssertionOutcome {
 795                        score: 100,
 796                        message: None,
 797                    })
 798                } else {
 799                    Ok(EvalAssertionOutcome {
 800                        score: 0,
 801                        message: Some(message),
 802                    })
 803                }
 804            }),
 805        ))
 806    });
 807}
 808
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_add_overwrite_test() {
    // Replays a recorded conversation in which the assistant inspects
    // `action_log.rs` (a symbol outline first, then three line ranges) and
    // finally issues an `edit_file` tool use in `Edit` mode. The resulting
    // diff is scored by an LLM judge via `EvalAssertion::judge_diff`.
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  0.65 (2025-06-14)
    //  claude-sonnet-4                |  0.07 (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  0.35 (2025-05-22)
    //  gemini-2.5-flash-preview-04-17 |

    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";

    // NOTE(review): the first two arguments look like an iteration count and
    // an expected pass ratio — confirm against `eval_utils::eval`.
    eval_utils::eval(200, 0.5, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                            Introduce a new test in `action_log.rs` to test overwriting a file.
                            That is, a file already exists, but we call `buffer_created` as if the file were new.
                            Take inspiration from all the other tests in the file.
                        "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                // Canned tool result: a symbol outline of the file rather than
                // its full contents.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        indoc! {"
                                pub struct ActionLog [L13-20]
                                 tracked_buffers [L15]
                                 edited_since_project_diagnostics_check [L17]
                                 project [L19]
                                impl ActionLog [L22-498]
                                 pub fn new [L24-30]
                                 pub fn project [L32-34]
                                 pub fn checked_project_diagnostics [L37-39]
                                 pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                                 fn track_buffer_internal [L46-101]
                                 fn handle_buffer_event [L103-116]
                                 fn handle_buffer_edited [L118-123]
                                 fn handle_buffer_file_changed [L125-158]
                                 async fn maintain_diff [L160-264]
                                 pub fn buffer_read [L267-269]
                                 pub fn buffer_created [L272-276]
                                 pub fn buffer_edited [L279-287]
                                 pub fn will_delete_buffer [L289-304]
                                 pub fn keep_edits_in_range [L306-364]
                                 pub fn reject_edits_in_ranges [L366-459]
                                 pub fn keep_all_edits [L461-473]
                                 pub fn changed_buffers [L476-482]
                                 pub fn stale_buffers [L485-497]
                                fn apply_non_conflicting_edits [L500-561]
                                fn diff_snapshots [L563-585]
                                fn point_to_row_edit [L587-614]
                                enum ChangeAuthor [L617-620]
                                 User [L618]
                                 Agent [L619]
                                enum TrackedBufferStatus [L623-627]
                                 Created [L624]
                                 Modified [L625]
                                 Deleted [L626]
                                struct TrackedBuffer [L629-641]
                                 buffer [L630]
                                 base_text [L631]
                                 unreviewed_changes [L632]
                                 status [L633]
                                 version [L634]
                                 diff [L635]
                                 snapshot [L636]
                                 diff_update [L637]
                                 _open_lsp_handle [L638]
                                 _maintain_diff [L639]
                                 _subscription [L640]
                                impl TrackedBuffer [L643-657]
                                 fn has_changes [L644-650]
                                 fn schedule_diff_update [L652-656]
                                pub struct ChangedBuffer [L659-661]
                                 pub diff [L660]
                                mod tests [L664-1574]
                                 fn init_logger [L678-682]
                                 fn init_test [L684-691]
                                 async fn test_keep_edits [L694-769]
                                 async fn test_deletions [L772-854]
                                 async fn test_overlapping_user_edits [L857-951]
                                 async fn test_creating_files [L954-1010]
                                 async fn test_deleting_files [L1013-1120]
                                 async fn test_reject_edits [L1123-1255]
                                 async fn test_reject_multiple_edits [L1258-1331]
                                 async fn test_reject_deleted_file [L1334-1388]
                                 async fn test_reject_created_file [L1391-1443]
                                 async fn test_random_diffs [L1446-1535]
                                  fn quiesce [L1510-1534]
                                 struct HunkStatus [L1538-1542]
                                  range [L1539]
                                  diff_status [L1540]
                                  old_text [L1541]
                                 fn unreviewed_hunks [L1544-1573]

                                Showing symbols 1-69 (total symbols: 69)

                                Using the line numbers in this outline, you can call this tool again while specifying
                                the start_line and end_line fields to see the implementations of symbols in the outline.
                            "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                // The ranged tool results below are sliced out of the fixture
                // file with the local `lines` helper.
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        ReadFileTool::NAME,
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        ReadFileTool::NAME,
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        ReadFileTool::NAME,
                        lines(input_file_content, 271..276),
                    )],
                ),
                // Final assistant turn: `EvalInput::from_conversation` requires
                // the conversation to end with an `edit_file` tool use.
                message(
                    Assistant,
                    [
                        text(indoc! {"
                                Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                                1. Create a file
                                2. Open a buffer for it
                                3. Track it in the action log
                                4. Modify the buffer
                                5. Call `buffer_created` again as if the file were new
                                6. Check that the changes are properly tracked

                                Let's write the test:
                            "}),
                        tool_use(
                            "tool_5",
                            EditFileTool::NAME,
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ))
    });
}
1037
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  1.00 (2025-06-14)
    //  claude-sonnet-4                |  1.00 (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  1.00 (2025-05-21)
    //  gemini-2.5-flash-preview-04-17 |  1.00 (2025-05-21)

    let input_file_content = None;
    let expected_output_content = String::new();

    eval_utils::eval(100, 0.99, mismatched_tag_threshold(0.05), move || {
        // Recorded conversation: list the directory, then request creation of
        // the new file.
        let conversation = vec![
            message(User, [text("Create a second empty todo file ")]),
            message(
                Assistant,
                [
                    text(formatdoc! {"
                        I'll help you create a second empty todo file.
                        First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                    tool_use(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        ListDirectoryTool::NAME,
                        ListDirectoryToolInput {
                            path: "root".to_string(),
                        },
                    ),
                ],
            ),
            message(
                User,
                [tool_result(
                    "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                    ListDirectoryTool::NAME,
                    "root/TODO\nroot/TODO2\nroot/new.txt\n",
                )],
            ),
            message(
                Assistant,
                [
                    text(formatdoc! {"
                        I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                    "}),
                    tool_use(
                        "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: "Create empty TODO3 file".to_string(),
                            path: "root/TODO3".into(),
                            mode: EditFileMode::Create,
                        },
                    ),
                ],
            ),
        ];

        // Bad behavior is to write something like
        // "I'll create an empty TODO3 file as requested."
        let assertion = EvalAssertion::assert_eq(expected_output_content.clone());

        run_eval(EvalInput::from_conversation(
            conversation,
            input_file_content.clone(),
            assertion,
        ))
    });
}
1109
1110fn message(
1111    role: Role,
1112    contents: impl IntoIterator<Item = MessageContent>,
1113) -> LanguageModelRequestMessage {
1114    LanguageModelRequestMessage {
1115        role,
1116        content: contents.into_iter().collect(),
1117        cache: false,
1118        reasoning_details: None,
1119    }
1120}
1121
1122fn text(text: impl Into<String>) -> MessageContent {
1123    MessageContent::Text(text.into())
1124}
1125
/// Returns the lines of `input` whose zero-based index falls in `range`,
/// joined with `\n` (no trailing newline).
///
/// Indices past the end of `input` are ignored, and an empty or reversed
/// range yields an empty string.
fn lines(input: &str, range: Range<usize>) -> String {
    let mut selected = String::new();
    for (index, line) in input.lines().enumerate() {
        if index >= range.end {
            break;
        }
        if index < range.start {
            continue;
        }
        // Separate on position, not on emptiness, so that a blank first line
        // is still followed by a newline.
        if index > range.start {
            selected.push('\n');
        }
        selected.push_str(line);
    }
    selected
}
1134
1135fn tool_use(
1136    id: impl Into<Arc<str>>,
1137    name: impl Into<Arc<str>>,
1138    input: impl Serialize,
1139) -> MessageContent {
1140    MessageContent::ToolUse(LanguageModelToolUse {
1141        id: LanguageModelToolUseId::from(id.into()),
1142        name: name.into(),
1143        raw_input: serde_json::to_string_pretty(&input).unwrap(),
1144        input: serde_json::to_value(input).unwrap(),
1145        is_input_complete: true,
1146        thought_signature: None,
1147    })
1148}
1149
1150fn tool_result(
1151    id: impl Into<Arc<str>>,
1152    name: impl Into<Arc<str>>,
1153    result: impl Into<Arc<str>>,
1154) -> MessageContent {
1155    MessageContent::ToolResult(LanguageModelToolResult {
1156        tool_use_id: LanguageModelToolUseId::from(id.into()),
1157        tool_name: name.into(),
1158        is_error: false,
1159        content: LanguageModelToolResultContent::Text(result.into()),
1160        output: None,
1161    })
1162}
1163
/// Everything needed to run one edit-agent eval sample.
#[derive(Clone)]
struct EvalInput {
    /// Recorded conversation; must end with an assistant message that
    /// contains an `edit_file` tool use (enforced by `from_conversation`).
    conversation: Vec<LanguageModelRequestMessage>,
    /// Input of that final `edit_file` tool use, deserialized.
    edit_file_input: EditFileToolInput,
    /// Initial content of the edited file; callers pass `None` when the file
    /// is being created.
    input_content: Option<String>,
    /// Check used to score the agent's output.
    assertion: EvalAssertion,
}
1171
1172impl EvalInput {
1173    fn from_conversation(
1174        conversation: Vec<LanguageModelRequestMessage>,
1175        input_content: Option<String>,
1176        assertion: EvalAssertion,
1177    ) -> Self {
1178        let msg = conversation.last().expect("Conversation must not be empty");
1179        if msg.role != Role::Assistant {
1180            panic!("Conversation must end with an assistant message");
1181        }
1182        let tool_use = msg
1183            .content
1184            .iter()
1185            .flat_map(|content| match content {
1186                MessageContent::ToolUse(tool_use) if tool_use.name == EditFileTool::NAME.into() => {
1187                    Some(tool_use)
1188                }
1189                _ => None,
1190            })
1191            .next()
1192            .expect("Conversation must end with an edit_file tool use")
1193            .clone();
1194
1195        let edit_file_input: EditFileToolInput = serde_json::from_value(tool_use.input).unwrap();
1196
1197        EvalInput {
1198            conversation,
1199            edit_file_input,
1200            input_content,
1201            assertion,
1202        }
1203    }
1204}
1205
/// Result of one agent run, as seen by an `EvalAssertion`.
#[derive(Clone)]
struct EvalSample {
    /// File text before the edit; `assert_diff_any` applies expected patches
    /// to it.
    text_before: String,
    /// File text after the edit; compared against expectations.
    text_after: String,
    /// Raw agent output, including parser metrics and raw edits.
    edit_output: EditAgentOutput,
    /// Diff text handed to the judge model by `judge_diff`.
    diff: String,
}
1213
/// Object-safe form of an async assertion, so that `EvalAssertion` can store
/// any matching async closure behind `Arc<dyn AssertionFn>`.
trait AssertionFn: 'static + Send + Sync {
    /// Scores `sample`, optionally consulting `judge_model`, and returns the
    /// outcome as a boxed local future.
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1222
1223impl<F> AssertionFn for F
1224where
1225    F: 'static
1226        + Send
1227        + Sync
1228        + AsyncFn(
1229            &EvalSample,
1230            Arc<dyn LanguageModel>,
1231            &mut TestAppContext,
1232        ) -> Result<EvalAssertionOutcome>,
1233{
1234    fn assert<'a>(
1235        &'a self,
1236        sample: &'a EvalSample,
1237        judge_model: Arc<dyn LanguageModel>,
1238        cx: &'a mut TestAppContext,
1239    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
1240        (self)(sample, judge_model, cx).boxed_local()
1241    }
1242}
1243
/// Cloneable handle to an assertion that scores an `EvalSample`.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1246
1247impl EvalAssertion {
1248    fn new<F>(f: F) -> Self
1249    where
1250        F: 'static
1251            + Send
1252            + Sync
1253            + AsyncFn(
1254                &EvalSample,
1255                Arc<dyn LanguageModel>,
1256                &mut TestAppContext,
1257            ) -> Result<EvalAssertionOutcome>,
1258    {
1259        EvalAssertion(Arc::new(f))
1260    }
1261
1262    fn assert_eq(expected: impl Into<String>) -> Self {
1263        let expected = expected.into();
1264        Self::new(async move |sample, _judge, _cx| {
1265            Ok(EvalAssertionOutcome {
1266                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
1267                    100
1268                } else {
1269                    0
1270                },
1271                message: None,
1272            })
1273        })
1274    }
1275
1276    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
1277        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
1278        Self::new(async move |sample, _judge, _cx| {
1279            let matches = expected_diffs.iter().any(|possible_diff| {
1280                let expected =
1281                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
1282                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
1283            });
1284
1285            Ok(EvalAssertionOutcome {
1286                score: if matches { 100 } else { 0 },
1287                message: None,
1288            })
1289        })
1290    }
1291
1292    fn judge_diff(assertions: &'static str) -> Self {
1293        Self::new(async move |sample, judge, cx| {
1294            let prompt = DiffJudgeTemplate {
1295                diff: sample.diff.clone(),
1296                assertions,
1297            }
1298            .render(&Templates::new())
1299            .unwrap();
1300
1301            let request = LanguageModelRequest {
1302                messages: vec![LanguageModelRequestMessage {
1303                    role: Role::User,
1304                    content: vec![prompt.into()],
1305                    cache: false,
1306                    reasoning_details: None,
1307                }],
1308                thinking_allowed: true,
1309                ..Default::default()
1310            };
1311            let mut response = retry_on_rate_limit(async || {
1312                Ok(judge
1313                    .stream_completion_text(request.clone(), &cx.to_async())
1314                    .await?)
1315            })
1316            .await?;
1317            let mut output = String::new();
1318            while let Some(chunk) = response.stream.next().await {
1319                let chunk = chunk?;
1320                output.push_str(&chunk);
1321            }
1322
1323            // Parse the score from the response
1324            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
1325            if let Some(captures) = re.captures(&output)
1326                && let Some(score_match) = captures.get(1)
1327            {
1328                let score = score_match.as_str().parse().unwrap_or(0);
1329                return Ok(EvalAssertionOutcome {
1330                    score,
1331                    message: Some(output),
1332                });
1333            }
1334
1335            anyhow::bail!("No score found in response. Raw output: {output}");
1336        })
1337    }
1338
1339    async fn run(
1340        &self,
1341        input: &EvalSample,
1342        judge_model: Arc<dyn LanguageModel>,
1343        cx: &mut TestAppContext,
1344    ) -> Result<EvalAssertionOutcome> {
1345        self.0.assert(input, judge_model, cx).await
1346    }
1347}
1348
1349fn run_eval(eval: EvalInput) -> eval_utils::EvalOutput<EditEvalMetadata> {
1350    let dispatcher = gpui::TestDispatcher::new(rand::random());
1351    let mut cx = TestAppContext::build(dispatcher, None);
1352    let foreground_executor = cx.foreground_executor().clone();
1353    let result = foreground_executor.block_test(async {
1354        let test = EditAgentTest::new(&mut cx).await;
1355        test.eval(eval, &mut cx).await
1356    });
1357    cx.quit();
1358    match result {
1359        Ok(output) => eval_utils::EvalOutput {
1360            data: output.to_string(),
1361            outcome: if output.assertion.score < 80 {
1362                eval_utils::OutcomeKind::Failed
1363            } else {
1364                eval_utils::OutcomeKind::Passed
1365            },
1366            metadata: EditEvalMetadata {
1367                tags: output.sample.edit_output.parser_metrics.tags,
1368                mismatched_tags: output.sample.edit_output.parser_metrics.mismatched_tags,
1369            },
1370        },
1371        Err(e) => eval_utils::EvalOutput {
1372            data: format!("{e:?}"),
1373            outcome: eval_utils::OutcomeKind::Error,
1374            metadata: EditEvalMetadata {
1375                tags: 0,
1376                mismatched_tags: 0,
1377            },
1378        },
1379    }
1380}
1381
/// A finished eval: the sample the agent produced plus how it was scored.
#[derive(Clone)]
struct EditEvalOutput {
    /// What the agent produced.
    sample: EvalSample,
    /// Score and optional message from the assertion.
    assertion: EvalAssertionOutcome,
}
1387
1388impl Display for EditEvalOutput {
1389    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1390        writeln!(f, "Score: {:?}", self.assertion.score)?;
1391        if let Some(message) = self.assertion.message.as_ref() {
1392            writeln!(f, "Message: {}", message)?;
1393        }
1394
1395        writeln!(f, "Diff:\n{}", self.sample.diff)?;
1396
1397        writeln!(
1398            f,
1399            "Parser Metrics:\n{:#?}",
1400            self.sample.edit_output.parser_metrics
1401        )?;
1402        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1403        Ok(())
1404    }
1405}
1406
/// Test harness bundling the agent under test with its project and the model
/// used to judge results.
struct EditAgentTest {
    /// Edit agent being evaluated.
    agent: EditAgent,
    /// Test project rooted at `/root` on a fake file system (see `new`).
    project: Entity<Project>,
    /// Model passed to assertions that need an LLM judge.
    judge_model: Arc<dyn LanguageModel>,
}
1412
1413impl EditAgentTest {
1414    async fn new(cx: &mut TestAppContext) -> Self {
1415        cx.executor().allow_parking();
1416
1417        let fs = FakeFs::new(cx.executor());
1418        cx.update(|cx| {
1419            settings::init(cx);
1420            gpui_tokio::init(cx);
1421            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1422            cx.set_http_client(http_client);
1423            let client = Client::production(cx);
1424            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1425            settings::init(cx);
1426            language_model::init(user_store.clone(), client.clone(), cx);
1427            language_models::init(user_store, client.clone(), cx);
1428        });
1429
1430        fs.insert_tree("/root", json!({})).await;
1431        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1432        let agent_model = SelectedModel::from_str(
1433            &std::env::var("ZED_AGENT_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
1434        )
1435        .unwrap();
1436        let judge_model = SelectedModel::from_str(
1437            &std::env::var("ZED_JUDGE_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
1438        )
1439        .unwrap();
1440
1441        let authenticate_provider_tasks = cx.update(|cx| {
1442            LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
1443                registry
1444                    .providers()
1445                    .iter()
1446                    .map(|p| p.authenticate(cx))
1447                    .collect::<Vec<_>>()
1448            })
1449        });
1450        let (agent_model, judge_model) = cx
1451            .update(|cx| {
1452                cx.spawn(async move |cx| {
1453                    futures::future::join_all(authenticate_provider_tasks).await;
1454                    let agent_model = Self::load_model(&agent_model, cx).await;
1455                    let judge_model = Self::load_model(&judge_model, cx).await;
1456                    (agent_model.unwrap(), judge_model.unwrap())
1457                })
1458            })
1459            .await;
1460        let action_log = cx.new(|_| ActionLog::new(project.clone()));
1461
1462        let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();
1463
1464        Self {
1465            agent: EditAgent::new(
1466                agent_model,
1467                project.clone(),
1468                action_log,
1469                Templates::new(),
1470                edit_format,
1471                true,
1472                true,
1473            ),
1474            project,
1475            judge_model,
1476        }
1477    }
1478
1479    async fn load_model(
1480        selected_model: &SelectedModel,
1481        cx: &mut AsyncApp,
1482    ) -> Result<Arc<dyn LanguageModel>> {
1483        cx.update(|cx| {
1484            let registry = LanguageModelRegistry::read_global(cx);
1485            let provider = registry
1486                .provider(&selected_model.provider)
1487                .expect("Provider not found");
1488            provider.authenticate(cx)
1489        })
1490        .await?;
1491        Ok(cx.update(|cx| {
1492            let models = LanguageModelRegistry::read_global(cx);
1493            let model = models
1494                .available_models(cx)
1495                .find(|model| {
1496                    model.provider_id() == selected_model.provider
1497                        && model.id() == selected_model.model
1498                })
1499                .unwrap_or_else(|| panic!("Model {} not found", selected_model.model.0));
1500            model
1501        }))
1502    }
1503
1504    async fn eval(&self, mut eval: EvalInput, cx: &mut TestAppContext) -> Result<EditEvalOutput> {
1505        // Make sure the last message in the conversation is cached.
1506        eval.conversation.last_mut().unwrap().cache = true;
1507
1508        let path = self
1509            .project
1510            .read_with(cx, |project, cx| {
1511                project.find_project_path(eval.edit_file_input.path, cx)
1512            })
1513            .unwrap();
1514        let buffer = self
1515            .project
1516            .update(cx, |project, cx| project.open_buffer(path, cx))
1517            .await
1518            .unwrap();
1519
1520        let tools = crate::built_in_tools().collect::<Vec<_>>();
1521
1522        let system_prompt = {
1523            let worktrees = vec![WorktreeContext {
1524                root_name: "root".to_string(),
1525                abs_path: Path::new("/path/to/root").into(),
1526                rules_file: None,
1527            }];
1528            let project_context = ProjectContext::new(worktrees, Vec::default());
1529            let tool_names = tools
1530                .iter()
1531                .map(|tool| tool.name.clone().into())
1532                .collect::<Vec<_>>();
1533            let template = crate::SystemPromptTemplate {
1534                project: &project_context,
1535                available_tools: tool_names,
1536                model_name: None,
1537            };
1538            let templates = Templates::new();
1539            template.render(&templates).unwrap()
1540        };
1541
1542        let has_system_prompt = eval
1543            .conversation
1544            .first()
1545            .is_some_and(|msg| msg.role == Role::System);
1546        let messages = if has_system_prompt {
1547            eval.conversation
1548        } else {
1549            [LanguageModelRequestMessage {
1550                role: Role::System,
1551                content: vec![MessageContent::Text(system_prompt)],
1552                cache: true,
1553                reasoning_details: None,
1554            }]
1555            .into_iter()
1556            .chain(eval.conversation)
1557            .collect::<Vec<_>>()
1558        };
1559
1560        let conversation = LanguageModelRequest {
1561            messages,
1562            tools,
1563            thinking_allowed: true,
1564            ..Default::default()
1565        };
1566
1567        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1568            if let Some(input_content) = eval.input_content.as_deref() {
1569                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1570            }
1571            retry_on_rate_limit(async || {
1572                self.agent
1573                    .edit(
1574                        buffer.clone(),
1575                        eval.edit_file_input.display_description.clone(),
1576                        &conversation,
1577                        &mut cx.to_async(),
1578                    )
1579                    .0
1580                    .await
1581            })
1582            .await?
1583        } else {
1584            retry_on_rate_limit(async || {
1585                self.agent
1586                    .overwrite(
1587                        buffer.clone(),
1588                        eval.edit_file_input.display_description.clone(),
1589                        &conversation,
1590                        &mut cx.to_async(),
1591                    )
1592                    .0
1593                    .await
1594            })
1595            .await?
1596        };
1597
1598        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1599        let sample = EvalSample {
1600            edit_output,
1601            diff: language::unified_diff(
1602                eval.input_content.as_deref().unwrap_or_default(),
1603                &buffer_text,
1604            ),
1605            text_before: eval.input_content.unwrap_or_default(),
1606            text_after: buffer_text,
1607        };
1608        let assertion = eval
1609            .assertion
1610            .run(&sample, self.judge_model.clone(), cx)
1611            .await?;
1612
1613        Ok(EditEvalOutput { assertion, sample })
1614    }
1615}
1616
1617async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
1618    const MAX_RETRIES: usize = 20;
1619    let mut attempt = 0;
1620
1621    loop {
1622        attempt += 1;
1623        let response = request().await;
1624
1625        if attempt >= MAX_RETRIES {
1626            return response;
1627        }
1628
1629        let retry_delay = match &response {
1630            Ok(_) => None,
1631            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
1632                Some(err) => match &err {
1633                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
1634                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
1635                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
1636                    }
1637                    LanguageModelCompletionError::UpstreamProviderError {
1638                        status,
1639                        retry_after,
1640                        ..
1641                    } => {
1642                        // Only retry for specific status codes
1643                        let should_retry = matches!(
1644                            *status,
1645                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
1646                        ) || status.as_u16() == 529;
1647
1648                        if should_retry {
1649                            // Use server-provided retry_after if available, otherwise use default
1650                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
1651                        } else {
1652                            None
1653                        }
1654                    }
1655                    LanguageModelCompletionError::ApiReadResponseError { .. }
1656                    | LanguageModelCompletionError::ApiInternalServerError { .. }
1657                    | LanguageModelCompletionError::HttpSend { .. } => {
1658                        // Exponential backoff for transient I/O and internal server errors
1659                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
1660                    }
1661                    _ => None,
1662                },
1663                _ => None,
1664            },
1665        };
1666
1667        if let Some(retry_after) = retry_delay {
1668            let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
1669            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
1670            // This code does not use the gpui::executor
1671            #[allow(clippy::disallowed_methods)]
1672            smol::Timer::after(retry_after + jitter).await;
1673        } else {
1674            return response;
1675        }
1676    }
1677}
1678
/// A score with an optional accompanying message.
///
/// NOTE(review): presumably the value produced by running an eval assertion;
/// confirm against the assertion code that constructs it.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EvalAssertionOutcome {
    /// Numeric score. NOTE(review): the scale is not visible here — confirm.
    score: usize,
    /// Optional free-form text attached to the score.
    message: Option<String>,
}
1684
/// Serializable data for the `diff_judge.hbs` template.
///
/// NOTE(review): presumably rendered into the prompt given to the judge model
/// when grading a diff — confirm against the call site.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    /// Diff text exposed to the template.
    diff: String,
    /// Assertions text exposed to the template; always a static string.
    assertions: &'static str,
}
1690
/// Binds [`DiffJudgeTemplate`] to its template file.
impl Template for DiffJudgeTemplate {
    /// File name of the template this data is rendered with.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1694
/// Returns `text` with every blank line removed.
///
/// A line is blank when it is empty or contains only whitespace. Kept lines
/// retain their own leading and trailing whitespace and are joined with a
/// single `\n`; the result never ends with a newline.
fn strip_empty_lines(text: &str) -> String {
    let mut stripped = String::with_capacity(text.len());
    for line in text.lines() {
        if line.trim().is_empty() {
            continue;
        }
        // Kept lines are never empty, so a non-empty buffer means a previous
        // line has already been written and needs a separator.
        if !stripped.is_empty() {
            stripped.push('\n');
        }
        stripped.push_str(line);
    }
    stripped
}