evals.rs

   1use super::*;
   2use crate::{
   3    AgentTool, EditFileMode, EditFileTool, EditFileToolInput, GrepTool, GrepToolInput,
   4    ListDirectoryTool, ListDirectoryToolInput, ReadFileTool, ReadFileToolInput,
   5};
   6use Role::*;
   7use client::{Client, UserStore};
   8use eval_utils::{EvalOutput, EvalOutputProcessor, OutcomeKind};
   9use fs::FakeFs;
  10use futures::{FutureExt, future::LocalBoxFuture};
  11use gpui::{AppContext, TestAppContext};
  12use http_client::StatusCode;
  13use indoc::{formatdoc, indoc};
  14use language_model::{
  15    LanguageModelRegistry, LanguageModelToolResult, LanguageModelToolResultContent,
  16    LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
  17};
  18use project::Project;
  19use prompt_store::{ProjectContext, WorktreeContext};
  20use rand::prelude::*;
  21use reqwest_client::ReqwestClient;
  22use serde_json::json;
  23use std::{
  24    fmt::{self, Display},
  25    path::Path,
  26    str::FromStr,
  27    time::Duration,
  28};
  29use util::path;
  30
/// Aggregates edit-agent eval outputs, tracking cumulative tag statistics so
/// that `assert` can fail the run when too large a fraction of tags were
/// mismatched across all processed evals.
#[derive(Default, Clone, Debug)]
struct EditAgentOutputProcessor {
    // Maximum allowed ratio of mismatched tags to total tags; exceeding it
    // makes `assert` panic.
    mismatched_tag_threshold: f32,
    // Total tags accumulated over all processed (passed or failed) evals.
    cumulative_tags: usize,
    // Mismatched tags accumulated over the same evals.
    cumulative_mismatched_tags: usize,
    // Outputs retained so they can be printed for diagnosis on failure.
    eval_outputs: Vec<EvalOutput<EditEvalMetadata>>,
}
  38
  39fn mismatched_tag_threshold(mismatched_tag_threshold: f32) -> EditAgentOutputProcessor {
  40    EditAgentOutputProcessor {
  41        mismatched_tag_threshold,
  42        cumulative_tags: 0,
  43        cumulative_mismatched_tags: 0,
  44        eval_outputs: Vec::new(),
  45    }
  46}
  47
/// Per-eval tag statistics reported by a single edit-agent run.
#[derive(Clone, Debug)]
struct EditEvalMetadata {
    // Total number of tags produced in this run.
    tags: usize,
    // How many of those tags were mismatched.
    mismatched_tags: usize,
}
  53
  54impl EvalOutputProcessor for EditAgentOutputProcessor {
  55    type Metadata = EditEvalMetadata;
  56
  57    fn process(&mut self, output: &EvalOutput<Self::Metadata>) {
  58        if matches!(output.outcome, OutcomeKind::Passed | OutcomeKind::Failed) {
  59            self.cumulative_mismatched_tags += output.metadata.mismatched_tags;
  60            self.cumulative_tags += output.metadata.tags;
  61            self.eval_outputs.push(output.clone());
  62        }
  63    }
  64
  65    fn assert(&mut self) {
  66        let mismatched_tag_ratio =
  67            self.cumulative_mismatched_tags as f32 / self.cumulative_tags as f32;
  68        if mismatched_tag_ratio > self.mismatched_tag_threshold {
  69            for eval_output in &self.eval_outputs {
  70                println!("{}", eval_output.data);
  71            }
  72            panic!(
  73                "Too many mismatched tags: {:?}",
  74                self.cumulative_mismatched_tags
  75            );
  76        }
  77    }
  78}
  79
// Eval: replay a fixed conversation asking the agent to extract a
// `handle_command_output` method, then check the resulting diff.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           |  0.99 (2025-06-14)
    // claude-sonnet-4             |  0.97 (2025-06-14)
    // gemini-2.5-pro-06-05        |  0.98 (2025-06-16)
    // gemini-2.5-flash            |  0.11 (2025-05-22)
    // gpt-4.1                     |  1.00 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // Several refactorings are acceptable: the eval passes if the produced
    // diff matches any one of these fixtures.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    // Runs the eval 100 times; the mismatched-tag processor caps the
    // mismatched-tag ratio at 0.05.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        // Conversation: user requests the extraction, agent reads the file,
        // then invokes the edit tool on it.
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                            Read the `{input_file_path}` file and extract a method in
                            the final stanza of `run_git_blame` to deal with command failures,
                            call it `handle_command_output` and take the std::process::Output as the only parameter.
                            Do not document the method and do not add any comments.

                            Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                        "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
 157
// Eval: delete a single function; the output must match the `after.rs`
// fixture exactly (assert_eq, not a judged diff).
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 1.0  (2025-06-14)
    // claude-sonnet-4             | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05        | 1.0  (2025-06-16)
    // gemini-2.5-flash            |
    // gpt-4.1                     |
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";

    // 100 runs; mismatched-tag ratio capped at 0.05.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                            Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                            one function, not its usages.
                        "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            // Exact-match assertion against the post-deletion fixture.
            EvalAssertion::assert_eq(output_file_content),
        ))
    });
}
 222
// Eval: translate every doc comment in a file to Italian; the resulting diff
// is judged by an LLM rather than compared to a fixture.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_translate_doc_comments() {
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  1.0  (2025-06-14)
    //  claude-sonnet-4                |  1.0  (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  1.0  (2025-05-22)
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";

    // 200 runs; mismatched-tag ratio capped at 0.05.
    eval_utils::eval(200, 1., mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                            Read the {input_file_path} file and edit it (without overwriting it),
                            translating all the doc comments to italian.
                        "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ))
    });
}
 287
// Eval: a larger rewrite driven by three chunked reads of the same file; the
// resulting diff is judged by an LLM.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  0.96 (2025-06-14)
    //  claude-sonnet-4                |  0.11 (2025-06-14)
    //  gemini-2.5-pro-preview-latest  |  0.99 (2025-06-16)
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |

    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";

    // 100 runs; mismatched-tag ratio capped at 0.05.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                // NOTE(review): the asset list below repeats the two linux
                // tarballs. The prompt is eval input, so it is left untouched;
                // confirm whether the duplication is intentional.
                message(
                    User,
                    [text(formatdoc! {"
                            Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                            Use `ureq` to download the SDK for the current platform and architecture.
                            Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                            Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                            that's inside of the archive.
                            Don't re-download the SDK if that executable already exists.

                            Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                            Here are the available wasi-sdk assets:
                            - wasi-sdk-25.0-x86_64-macos.tar.gz
                            - wasi-sdk-25.0-arm64-macos.tar.gz
                            - wasi-sdk-25.0-x86_64-linux.tar.gz
                            - wasi-sdk-25.0-arm64-linux.tar.gz
                            - wasi-sdk-25.0-x86_64-linux.tar.gz
                            - wasi-sdk-25.0-arm64-linux.tar.gz
                            - wasi-sdk-25.0-x86_64-windows.tar.gz
                        "})],
                ),
                // The file is read in three consecutive line ranges
                // (971..1050, 1050..1100, 1100..1150) rather than all at once.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        ReadFileTool::NAME,
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        ReadFileTool::NAME,
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                    - The compile_parser_to_wasm method has been changed to use wasi-sdk
                    - ureq is used to download the SDK for current platform and architecture
                "}),
        ))
    });
}
 411
// Eval: comment out BlinkManager interactions after the agent has only seen
// grep excerpts (not a full read) of a large file.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_disable_cursor_blinking() {
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  0.59 (2025-07-14)
    //  claude-sonnet-4                |  0.81 (2025-07-14)
    //  gemini-2.5-pro                 |  0.95 (2025-07-14)
    //  gemini-2.5-flash-preview-04-17 |  0.78 (2025-07-14)
    //  gpt-4.1                        |  0.00 (2025-07-14) (follows edit_description too literally)

    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    let possible_diffs = vec![
        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
    ];
    // 100 runs with a lower 0.51 pass bar (see table above); mismatched-tag
    // ratio capped at 0.05.
    eval_utils::eval(100, 0.51, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                // NOTE(review): "how to cursor blinking works" is a typo in
                // the prompt, but the prompt is eval input and is left as-is.
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                // The grep result is simulated by joining disjoint line
                // ranges of the fixture with a "Match found:" separator.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        GrepTool::NAME,
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                            Comment out the lines that interact with the BlinkManager.
                            Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                            Don't add additional comments.
                        "})],
                ),
                // NOTE(review): the tool id jumps from "tool_1" to "tool_4" —
                // presumably deliberate to mimic a longer conversation; confirm.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
 494
// Eval: add a `from_pixels` constructor plus tests, after the agent explores
// the codebase with several grep calls; the diff is judged by an LLM.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    //  Model                          | Date        | Pass rate
    // =========================================================
    //  claude-4.0-sonnet              | 2025-06-14  | 0.99
    //  claude-3.7-sonnet              | 2025-06-14  | 0.88
    //  gemini-2.5-pro-preview-06-05   | 2025-06-16  | 0.98
    //  gpt-4.1                        |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";

    // 100 runs; note the looser 0.25 mismatched-tag allowance for this eval.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.25), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                            Introduce a new `from_pixels` constructor in Canvas and
                            also add tests for it in the same file.
                        "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
                // NOTE(review): the grep include_patterns below use
                // "font-kit/src/..." paths while the input path is
                // "root/canvas.rs" — presumably replaying the original repo
                // layout; confirm this mismatch is intentional.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_2", GrepTool::NAME, "No matches found")],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_3", GrepTool::NAME, "No matches found")],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                // Canned grep output showing where existing tests live, so the
                // agent has examples of the repo's test-module conventions.
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        GrepTool::NAME,
                        indoc! {"
                                Found 6 matches:

                                ## Matches in font-kit/src/loaders/core_text.rs

                                ### mod test › L926-936
                                ```
                                mod test {
                                    use super::Font;
                                    use crate::properties::{Stretch, Weight};

                                    #[cfg(feature = \"source\")]
                                    use crate::source::SystemSource;

                                    static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                    #[cfg(feature = \"source\")]
                                    #[test]
                                ```

                                55 lines remaining in ancestor node. Read the file to see all.

                                ### mod test › L947-951
                                ```
                                    }

                                    #[test]
                                    fn test_core_text_to_css_font_weight() {
                                        // Exact matches
                                ```

                                ### mod test › L959-963
                                ```
                                    }

                                    #[test]
                                    fn test_core_text_to_css_font_stretch() {
                                        // Exact matches
                                ```

                                ## Matches in font-kit/src/loaders/freetype.rs

                                ### mod test › L1238-1248
                                ```
                                mod test {
                                    use crate::loaders::freetype::Font;

                                    static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                    static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                    #[test]
                                    fn get_pcf_postscript_name() {
                                        let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                        assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                    }
                                ```

                                1 lines remaining in ancestor node. Read the file to see all.

                                ## Matches in font-kit/src/sources/core_text.rs

                                ### mod test › L265-275
                                ```
                                mod test {
                                    use crate::properties::{Stretch, Weight};

                                    #[test]
                                    fn test_css_to_core_text_font_weight() {
                                        // Exact matches
                                        assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                        assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                        assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                        assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                                ```

                                27 lines remaining in ancestor node. Read the file to see all.

                                ### mod test › L278-282
                                ```
                                    }

                                    #[test]
                                    fn test_css_to_core_text_font_stretch() {
                                        // Exact matches
                                ```
                            "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                        - The diff contains a new `from_pixels` constructor
                        - The diff contains new tests for the `from_pixels` constructor
                    "}),
        ))
    });
}
 710
/// Eval: the agent must create a brand-new Python script (`root/zode.py`)
/// from scratch after reading two fixture reference files.
///
/// The assertion only checks that the created file does not *start* with
/// whitespace, a backtick, or a newline — i.e. the agent wrote raw file
/// content instead of leaking markdown fences or chatter into the file.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_zode() {
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  1.0 (2025-06-14)
    //  claude-sonnet-4                |  1.0 (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  1.0 (2025-05-22)
    //  gemini-2.5-flash-preview-04-17 |  1.0 (2025-05-22)
    //  gpt-4.1                        |  1.0 (2025-05-22)

    let input_file_path = "root/zode.py";
    // `None` means the file does not exist yet; the agent creates it.
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";

    // 50 samples, 1.0 required pass rate, 5% mismatched-tag budget for the
    // streaming parser (presumably — confirm against `eval_utils::eval`).
    eval_utils::eval(50, 1., mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                // Turn 1: user prompt loaded from a fixture.
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                // Turn 2: the agent reads both reference implementations.
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                // Turn 3: tool results carry the fixture file contents.
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            ReadFileTool::NAME,
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            ReadFileTool::NAME,
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                // Turn 4: the agent issues the edit_file (Create) call that the
                // eval harness will actually execute.
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            EditFileTool::NAME,
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content.clone(),
            EvalAssertion::new(async move |sample, _, _cx| {
                // Leading whitespace, a backtick, or a newline indicates the
                // agent wrote markdown fencing or chatter into the file.
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                // Empty message means no invalid prefix was found: full score.
                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ))
    });
}
 815
/// Eval: the agent must add one new test (overwriting an existing file via
/// `buffer_created`) to `action_log.rs` without modifying any existing test.
///
/// The conversation replays a realistic exploration: outline read, two
/// targeted range reads of related tests, a read of `buffer_created`, then
/// the final `edit_file` call that the harness executes. Graded by an LLM
/// judge over the resulting diff.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_add_overwrite_test() {
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  0.65 (2025-06-14)
    //  claude-sonnet-4                |  0.07 (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  0.35 (2025-05-22)
    //  gemini-2.5-flash-preview-04-17 |
    //  gpt-4.1                        |

    let input_file_path = "root/action_log.rs";
    // The pre-edit file content; also used to synthesize range-read results.
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";

    // 200 samples with only a 0.5 required pass rate — this eval is flaky
    // across models (see the table above); 5% mismatched-tag budget.
    eval_utils::eval(200, 0.5, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                // Turn 1: the user's request.
                message(
                    User,
                    [text(indoc! {"
                            Introduce a new test in `action_log.rs` to test overwriting a file.
                            That is, a file already exists, but we call `buffer_created` as if the file were new.
                            Take inspiration from all the other tests in the file.
                        "})],
                ),
                // Turn 2: the agent requests an outline of the whole file.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                // Turn 3: the canned outline result (symbols + line ranges).
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        indoc! {"
                                pub struct ActionLog [L13-20]
                                 tracked_buffers [L15]
                                 edited_since_project_diagnostics_check [L17]
                                 project [L19]
                                impl ActionLog [L22-498]
                                 pub fn new [L24-30]
                                 pub fn project [L32-34]
                                 pub fn checked_project_diagnostics [L37-39]
                                 pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                                 fn track_buffer_internal [L46-101]
                                 fn handle_buffer_event [L103-116]
                                 fn handle_buffer_edited [L118-123]
                                 fn handle_buffer_file_changed [L125-158]
                                 async fn maintain_diff [L160-264]
                                 pub fn buffer_read [L267-269]
                                 pub fn buffer_created [L272-276]
                                 pub fn buffer_edited [L279-287]
                                 pub fn will_delete_buffer [L289-304]
                                 pub fn keep_edits_in_range [L306-364]
                                 pub fn reject_edits_in_ranges [L366-459]
                                 pub fn keep_all_edits [L461-473]
                                 pub fn changed_buffers [L476-482]
                                 pub fn stale_buffers [L485-497]
                                fn apply_non_conflicting_edits [L500-561]
                                fn diff_snapshots [L563-585]
                                fn point_to_row_edit [L587-614]
                                enum ChangeAuthor [L617-620]
                                 User [L618]
                                 Agent [L619]
                                enum TrackedBufferStatus [L623-627]
                                 Created [L624]
                                 Modified [L625]
                                 Deleted [L626]
                                struct TrackedBuffer [L629-641]
                                 buffer [L630]
                                 base_text [L631]
                                 unreviewed_changes [L632]
                                 status [L633]
                                 version [L634]
                                 diff [L635]
                                 snapshot [L636]
                                 diff_update [L637]
                                 _open_lsp_handle [L638]
                                 _maintain_diff [L639]
                                 _subscription [L640]
                                impl TrackedBuffer [L643-657]
                                 fn has_changes [L644-650]
                                 fn schedule_diff_update [L652-656]
                                pub struct ChangedBuffer [L659-661]
                                 pub diff [L660]
                                mod tests [L664-1574]
                                 fn init_logger [L678-682]
                                 fn init_test [L684-691]
                                 async fn test_keep_edits [L694-769]
                                 async fn test_deletions [L772-854]
                                 async fn test_overlapping_user_edits [L857-951]
                                 async fn test_creating_files [L954-1010]
                                 async fn test_deleting_files [L1013-1120]
                                 async fn test_reject_edits [L1123-1255]
                                 async fn test_reject_multiple_edits [L1258-1331]
                                 async fn test_reject_deleted_file [L1334-1388]
                                 async fn test_reject_created_file [L1391-1443]
                                 async fn test_random_diffs [L1446-1535]
                                  fn quiesce [L1510-1534]
                                 struct HunkStatus [L1538-1542]
                                  range [L1539]
                                  diff_status [L1540]
                                  old_text [L1541]
                                 fn unreviewed_hunks [L1544-1573]

                                Showing symbols 1-69 (total symbols: 69)

                                Using the line numbers in this outline, you can call this tool again while specifying
                                the start_line and end_line fields to see the implementations of symbols in the outline.
                            "},
                    )],
                ),
                // Turn 4: the agent reads test_creating_files for inspiration.
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                // Turn 5: synthesized range read from the fixture content.
                // NOTE(review): `lines` uses 0-based indices while the read
                // tool's start_line (953) looks 1-based — confirm the fixture
                // offsets line up as intended.
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        ReadFileTool::NAME,
                        lines(input_file_content, 953..1010),
                    )],
                ),
                // Turn 6: the agent reads test_deleting_files as well.
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        ReadFileTool::NAME,
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                // Turn 7: the agent inspects `buffer_created` itself.
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        ReadFileTool::NAME,
                        lines(input_file_content, 271..276),
                    )],
                ),
                // Turn 8: the final edit_file (Edit) call the harness executes.
                message(
                    Assistant,
                    [
                        text(indoc! {"
                                Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                                1. Create a file
                                2. Open a buffer for it
                                3. Track it in the action log
                                4. Modify the buffer
                                5. Call `buffer_created` again as if the file were new
                                6. Check that the changes are properly tracked

                                Let's write the test:
                            "}),
                        tool_use(
                            "tool_5",
                            EditFileTool::NAME,
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ))
    });
}
1045
/// Eval: the agent must create an *empty* file without writing its own
/// narration into it. Graded by exact equality with the empty string.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    //  Model                          | Pass rate
    // ============================================
    //
    //  claude-3.7-sonnet              |  1.00 (2025-06-14)
    //  claude-sonnet-4                |  1.00 (2025-06-14)
    //  gemini-2.5-pro-preview-03-25   |  1.00 (2025-05-21)
    //  gemini-2.5-flash-preview-04-17 |  1.00 (2025-05-21)
    //  gpt-4.1                        |  1.00 (2025-05-21)
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"

    // The file does not exist beforehand, and must be empty afterwards.
    let input_file_content = None;
    let expected_output_content = String::new();

    // 100 samples, 0.99 required pass rate, 5% mismatched-tag budget.
    eval_utils::eval(100, 0.99, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                // The agent first lists the directory to pick a file name.
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                            "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            ListDirectoryTool::NAME,
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        ListDirectoryTool::NAME,
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                // Final turn: the Create-mode edit the harness executes.
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            EditFileTool::NAME,
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content.clone(),
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content.clone()),
        ))
    });
}
1122
1123fn message(
1124    role: Role,
1125    contents: impl IntoIterator<Item = MessageContent>,
1126) -> LanguageModelRequestMessage {
1127    LanguageModelRequestMessage {
1128        role,
1129        content: contents.into_iter().collect(),
1130        cache: false,
1131        reasoning_details: None,
1132    }
1133}
1134
1135fn text(text: impl Into<String>) -> MessageContent {
1136    MessageContent::Text(text.into())
1137}
1138
/// Returns the lines of `input` whose 0-based indices fall inside `range`,
/// joined with `\n` (no trailing newline). Out-of-bounds or empty ranges
/// yield the empty string.
fn lines(input: &str, range: Range<usize>) -> String {
    let mut selected = Vec::with_capacity(range.len());
    for (index, line) in input.lines().enumerate() {
        if range.contains(&index) {
            selected.push(line);
        }
    }
    selected.join("\n")
}
1147
1148fn tool_use(
1149    id: impl Into<Arc<str>>,
1150    name: impl Into<Arc<str>>,
1151    input: impl Serialize,
1152) -> MessageContent {
1153    MessageContent::ToolUse(LanguageModelToolUse {
1154        id: LanguageModelToolUseId::from(id.into()),
1155        name: name.into(),
1156        raw_input: serde_json::to_string_pretty(&input).unwrap(),
1157        input: serde_json::to_value(input).unwrap(),
1158        is_input_complete: true,
1159        thought_signature: None,
1160    })
1161}
1162
1163fn tool_result(
1164    id: impl Into<Arc<str>>,
1165    name: impl Into<Arc<str>>,
1166    result: impl Into<Arc<str>>,
1167) -> MessageContent {
1168    MessageContent::ToolResult(LanguageModelToolResult {
1169        tool_use_id: LanguageModelToolUseId::from(id.into()),
1170        tool_name: name.into(),
1171        is_error: false,
1172        content: LanguageModelToolResultContent::Text(result.into()),
1173        output: None,
1174    })
1175}
1176
/// A fully-specified edit eval: the conversation replayed to the agent
/// model, the edit request extracted from it, the initial file content,
/// and the assertion used to grade the result.
#[derive(Clone)]
struct EvalInput {
    /// Conversation replayed verbatim to the agent model.
    conversation: Vec<LanguageModelRequestMessage>,
    /// Input of the final `edit_file` tool use in the conversation.
    edit_file_input: EditFileToolInput,
    /// Initial content of the edited file; `None` means the file is new.
    input_content: Option<String>,
    /// Grading assertion run on the edited result.
    assertion: EvalAssertion,
}
1184
1185impl EvalInput {
1186    fn from_conversation(
1187        conversation: Vec<LanguageModelRequestMessage>,
1188        input_content: Option<String>,
1189        assertion: EvalAssertion,
1190    ) -> Self {
1191        let msg = conversation.last().expect("Conversation must not be empty");
1192        if msg.role != Role::Assistant {
1193            panic!("Conversation must end with an assistant message");
1194        }
1195        let tool_use = msg
1196            .content
1197            .iter()
1198            .flat_map(|content| match content {
1199                MessageContent::ToolUse(tool_use) if tool_use.name == EditFileTool::NAME.into() => {
1200                    Some(tool_use)
1201                }
1202                _ => None,
1203            })
1204            .next()
1205            .expect("Conversation must end with an edit_file tool use")
1206            .clone();
1207
1208        let edit_file_input: EditFileToolInput = serde_json::from_value(tool_use.input).unwrap();
1209
1210        EvalInput {
1211            conversation,
1212            edit_file_input,
1213            input_content,
1214            assertion,
1215        }
1216    }
1217}
1218
/// One sampled run of the edit agent against an [`EvalInput`].
#[derive(Clone)]
struct EvalSample {
    /// File content before the agent's edits.
    text_before: String,
    /// File content after the agent's edits.
    text_after: String,
    /// The agent's output, including parser metrics and raw edit stream.
    edit_output: EditAgentOutput,
    /// Rendered diff of the change, as shown to the LLM judge.
    diff: String,
}
1226
/// Object-safe async assertion over an [`EvalSample`].
///
/// Exists so `EvalAssertion` can store an arbitrary async closure behind
/// `Arc<dyn AssertionFn>`; the returned future is boxed and tied to the
/// borrows of `self`, the sample, and the test context.
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1235
// Blanket impl: any async closure with the matching signature is usable as
// an assertion — `EvalAssertion::new` relies on this.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        // Call the closure and box its (non-Send) future locally.
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1256
/// Cheaply-clonable handle to a grading assertion (see [`AssertionFn`]).
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1259
impl EvalAssertion {
    /// Wraps an async assertion closure into an [`EvalAssertion`].
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Scores 100 iff the produced text equals `expected` after both sides
    /// are passed through `strip_empty_lines`; 0 otherwise.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Scores 100 if applying *any* of `expected_diffs` to the pre-edit
    /// text reproduces the post-edit text (modulo empty lines).
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// Asks the judge model to grade the sample's diff against the given
    /// natural-language assertions. The judge's response must contain a
    /// `<score>NN</score>` tag; otherwise the assertion errors.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                    reasoning_details: None,
                }],
                thinking_allowed: true,
                ..Default::default()
            };
            // Judge calls hit a real provider; retry on rate limiting.
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            // Accumulate the streamed completion into one string.
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output)
                && let Some(score_match) = captures.get(1)
            {
                // A score that fails to parse (e.g. overflow) counts as 0.
                let score = score_match.as_str().parse().unwrap_or(0);
                return Ok(EvalAssertionOutcome {
                    score,
                    message: Some(output),
                });
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Runs this assertion against a sample using the given judge model.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1361
1362fn run_eval(eval: EvalInput) -> eval_utils::EvalOutput<EditEvalMetadata> {
1363    let dispatcher = gpui::TestDispatcher::new(rand::random());
1364    let mut cx = TestAppContext::build(dispatcher, None);
1365    let foreground_executor = cx.foreground_executor().clone();
1366    let result = foreground_executor.block_test(async {
1367        let test = EditAgentTest::new(&mut cx).await;
1368        test.eval(eval, &mut cx).await
1369    });
1370    cx.quit();
1371    match result {
1372        Ok(output) => eval_utils::EvalOutput {
1373            data: output.to_string(),
1374            outcome: if output.assertion.score < 80 {
1375                eval_utils::OutcomeKind::Failed
1376            } else {
1377                eval_utils::OutcomeKind::Passed
1378            },
1379            metadata: EditEvalMetadata {
1380                tags: output.sample.edit_output.parser_metrics.tags,
1381                mismatched_tags: output.sample.edit_output.parser_metrics.mismatched_tags,
1382            },
1383        },
1384        Err(e) => eval_utils::EvalOutput {
1385            data: format!("{e:?}"),
1386            outcome: eval_utils::OutcomeKind::Error,
1387            metadata: EditEvalMetadata {
1388                tags: 0,
1389                mismatched_tags: 0,
1390            },
1391        },
1392    }
1393}
1394
/// Result of a single eval run: the sample plus its assertion outcome.
#[derive(Clone)]
struct EditEvalOutput {
    /// The edited-file sample produced by the agent.
    sample: EvalSample,
    /// Score and optional judge message for this sample.
    assertion: EvalAssertionOutcome,
}
1400
1401impl Display for EditEvalOutput {
1402    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1403        writeln!(f, "Score: {:?}", self.assertion.score)?;
1404        if let Some(message) = self.assertion.message.as_ref() {
1405            writeln!(f, "Message: {}", message)?;
1406        }
1407
1408        writeln!(f, "Diff:\n{}", self.sample.diff)?;
1409
1410        writeln!(
1411            f,
1412            "Parser Metrics:\n{:#?}",
1413            self.sample.edit_output.parser_metrics
1414        )?;
1415        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1416        Ok(())
1417    }
1418}
1419
/// Test harness bundling the edit agent under test, its project, and the
/// judge model used to grade results.
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}
1425
1426impl EditAgentTest {
    /// Builds the harness: initializes settings/clients, creates a fake
    /// project at `/root`, resolves the agent and judge models from the
    /// `ZED_AGENT_MODEL` / `ZED_JUDGE_MODEL` env vars (defaulting to
    /// `anthropic/claude-sonnet-4-latest`), and authenticates all providers.
    async fn new(cx: &mut TestAppContext) -> Self {
        // Real network calls happen below; the executor must allow blocking.
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
            // NOTE(review): settings::init was already called above — this
            // second call looks redundant; confirm it is idempotent (or
            // intentional) before removing.
            settings::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store, client.clone(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
        )
        .unwrap();

        // Kick off authentication for every registered provider so model
        // resolution below can succeed.
        let authenticate_provider_tasks = cx.update(|cx| {
            LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
                registry
                    .providers()
                    .iter()
                    .map(|p| p.authenticate(cx))
                    .collect::<Vec<_>>()
            })
        });
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    futures::future::join_all(authenticate_provider_tasks).await;
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();

        Self {
            agent: EditAgent::new(
                agent_model,
                project.clone(),
                action_log,
                Templates::new(),
                edit_format,
            ),
            project,
            judge_model,
        }
    }
1489
    /// Resolves `selected_model` to a concrete model instance, authenticating
    /// its provider first.
    ///
    /// Panics (via `expect`/`panic!`) if the provider or model is not present
    /// in the global registry.
    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        // Authenticate the selected provider before querying its models.
        cx.update(|cx| {
            let registry = LanguageModelRegistry::read_global(cx);
            let provider = registry
                .provider(&selected_model.provider)
                .expect("Provider not found");
            provider.authenticate(cx)
        })
        .await?;
        Ok(cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            // Match on both provider id and model id before accepting a model.
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .unwrap_or_else(|| panic!("Model {} not found", selected_model.model.0));
            model
        }))
    }
1514
    /// Runs a single edit eval: performs the requested edit (or overwrite) on
    /// the target buffer via the edit agent, diffs the buffer before/after,
    /// and asks the judge model to score the result.
    ///
    /// Returns the judge's assertion outcome together with the sampled
    /// before/after text and unified diff.
    async fn eval(&self, mut eval: EvalInput, cx: &mut TestAppContext) -> Result<EditEvalOutput> {
        // Make sure the last message in the conversation is cached.
        eval.conversation.last_mut().unwrap().cache = true;

        // Resolve the eval's file path inside the test project and open its buffer.
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();

        let tools = crate::built_in_tools().collect::<Vec<_>>();

        // Render a system prompt over a synthetic single-worktree project
        // context, advertising all built-in tools.
        let system_prompt = {
            let worktrees = vec![WorktreeContext {
                root_name: "root".to_string(),
                abs_path: Path::new("/path/to/root").into(),
                rules_file: None,
            }];
            let project_context = ProjectContext::new(worktrees, Vec::default());
            let tool_names = tools
                .iter()
                .map(|tool| tool.name.clone().into())
                .collect::<Vec<_>>();
            let template = crate::SystemPromptTemplate {
                project: &project_context,
                available_tools: tool_names,
                model_name: None,
            };
            let templates = Templates::new();
            template.render(&templates).unwrap()
        };

        // Prepend the rendered system prompt only when the eval's conversation
        // does not already begin with one.
        let has_system_prompt = eval
            .conversation
            .first()
            .is_some_and(|msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
                reasoning_details: None,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let conversation = LanguageModelRequest {
            messages,
            tools,
            thinking_allowed: true,
            ..Default::default()
        };

        // Dispatch on edit mode: `Edit` seeds the buffer with the eval's input
        // content (when provided) and edits in place; any other mode overwrites
        // the buffer wholesale. Both paths retry on transient provider errors.
        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            retry_on_rate_limit(async || {
                self.agent
                    .edit(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        } else {
            retry_on_rate_limit(async || {
                self.agent
                    .overwrite(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        };

        // Capture the post-edit buffer text and build the sample the judge sees:
        // original text, final text, and a unified diff between them.
        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: buffer_text,
        };
        // Score the sample with the judge model.
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EditEvalOutput { assertion, sample })
    }
1626}
1627
1628async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
1629    const MAX_RETRIES: usize = 20;
1630    let mut attempt = 0;
1631
1632    loop {
1633        attempt += 1;
1634        let response = request().await;
1635
1636        if attempt >= MAX_RETRIES {
1637            return response;
1638        }
1639
1640        let retry_delay = match &response {
1641            Ok(_) => None,
1642            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
1643                Some(err) => match &err {
1644                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
1645                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
1646                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
1647                    }
1648                    LanguageModelCompletionError::UpstreamProviderError {
1649                        status,
1650                        retry_after,
1651                        ..
1652                    } => {
1653                        // Only retry for specific status codes
1654                        let should_retry = matches!(
1655                            *status,
1656                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
1657                        ) || status.as_u16() == 529;
1658
1659                        if should_retry {
1660                            // Use server-provided retry_after if available, otherwise use default
1661                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
1662                        } else {
1663                            None
1664                        }
1665                    }
1666                    LanguageModelCompletionError::ApiReadResponseError { .. }
1667                    | LanguageModelCompletionError::ApiInternalServerError { .. }
1668                    | LanguageModelCompletionError::HttpSend { .. } => {
1669                        // Exponential backoff for transient I/O and internal server errors
1670                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
1671                    }
1672                    _ => None,
1673                },
1674                _ => None,
1675            },
1676        };
1677
1678        if let Some(retry_after) = retry_delay {
1679            let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
1680            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
1681            // This code does not use the gpui::executor
1682            #[allow(clippy::disallowed_methods)]
1683            smol::Timer::after(retry_after + jitter).await;
1684        } else {
1685            return response;
1686        }
1687    }
1688}
1689
/// Outcome of running a judge assertion against an eval sample.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    // Score awarded by the judge; the scale is not visible in this file —
    // presumably 0-100, TODO confirm against the assertion runner.
    score: usize,
    // Optional free-form explanation accompanying the score.
    message: Option<String>,
}
1695
/// Data rendered into the `diff_judge.hbs` template, which prompts a judge
/// model to evaluate a diff against a set of assertions.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    // Unified diff the judge is asked to evaluate.
    diff: String,
    // Static assertion text the diff is judged against.
    assertions: &'static str,
}
1701
impl Template for DiffJudgeTemplate {
    // Handlebars template file used to render the diff-judge prompt.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1705
/// Returns `text` with every blank line (empty or whitespace-only) removed,
/// joining the surviving lines with `\n` and dropping any trailing newline.
fn strip_empty_lines(text: &str) -> String {
    let mut kept = Vec::new();
    for line in text.lines() {
        // A line survives if it contains at least one non-whitespace character.
        if line.chars().any(|ch| !ch.is_whitespace()) {
            kept.push(line);
        }
    }
    kept.join("\n")
}