1use super::*;
2use crate::{
3 EditFileMode, EditFileToolInput, GrepToolInput, ListDirectoryToolInput, ReadFileToolInput,
4};
5use Role::*;
6use client::{Client, UserStore};
7use eval_utils::{EvalOutput, EvalOutputProcessor, OutcomeKind};
8use fs::FakeFs;
9use futures::{FutureExt, future::LocalBoxFuture};
10use gpui::{AppContext, TestAppContext, Timer};
11use http_client::StatusCode;
12use indoc::{formatdoc, indoc};
13use language_model::{
14 LanguageModelRegistry, LanguageModelToolResult, LanguageModelToolResultContent,
15 LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
16};
17use project::Project;
18use prompt_store::{ProjectContext, WorktreeContext};
19use rand::prelude::*;
20use reqwest_client::ReqwestClient;
21use serde_json::json;
22use std::{
23 fmt::{self, Display},
24 path::Path,
25 str::FromStr,
26 time::Duration,
27};
28use util::path;
29
/// Aggregates per-run tag statistics across all samples of an eval and fails
/// the whole eval if too large a fraction of the tags reported by the runs
/// were mismatched.
#[derive(Default, Clone, Debug)]
struct EditAgentOutputProcessor {
    // Maximum tolerated ratio (0.0–1.0) of mismatched tags to total tags.
    mismatched_tag_threshold: f32,
    // Total number of tags observed across all completed runs.
    cumulative_tags: usize,
    // How many of those tags were mismatched.
    cumulative_mismatched_tags: usize,
    // Outputs of completed runs, retained so they can be dumped on failure.
    eval_outputs: Vec<EvalOutput<EditEvalMetadata>>,
}
37
38fn mismatched_tag_threshold(mismatched_tag_threshold: f32) -> EditAgentOutputProcessor {
39 EditAgentOutputProcessor {
40 mismatched_tag_threshold,
41 cumulative_tags: 0,
42 cumulative_mismatched_tags: 0,
43 eval_outputs: Vec::new(),
44 }
45}
46
/// Per-run tag statistics attached to each eval output.
#[derive(Clone, Debug)]
struct EditEvalMetadata {
    // Total tags observed in this run.
    tags: usize,
    // Tags that did not match in this run.
    mismatched_tags: usize,
}
52
53impl EvalOutputProcessor for EditAgentOutputProcessor {
54 type Metadata = EditEvalMetadata;
55
56 fn process(&mut self, output: &EvalOutput<Self::Metadata>) {
57 if matches!(output.outcome, OutcomeKind::Passed | OutcomeKind::Failed) {
58 self.cumulative_mismatched_tags += output.metadata.mismatched_tags;
59 self.cumulative_tags += output.metadata.tags;
60 self.eval_outputs.push(output.clone());
61 }
62 }
63
64 fn assert(&mut self) {
65 let mismatched_tag_ratio =
66 self.cumulative_mismatched_tags as f32 / self.cumulative_tags as f32;
67 if mismatched_tag_ratio > self.mismatched_tag_threshold {
68 for eval_output in &self.eval_outputs {
69 println!("{}", eval_output.data);
70 }
71 panic!(
72 "Too many mismatched tags: {:?}",
73 self.cumulative_mismatched_tags
74 );
75 }
76 }
77}
78
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // Model | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet | 0.99 (2025-06-14)
    // claude-sonnet-4 | 0.97 (2025-06-14)
    // gemini-2.5-pro-06-05 | 0.98 (2025-06-16)
    // gemini-2.5-flash | 0.11 (2025-05-22)
    // gpt-4.1 | 1.00 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // Several placements of the extracted method are acceptable, so the
    // assertion passes if the produced diff matches any of these fixtures.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    // 100 samples; require a 0.95 pass rate and at most 5% mismatched tags.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
152
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Tests that the agent can delete exactly one function without touching
    // its call sites; the result is compared against a single "after" fixture.
    //
    // Model | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05 | 1.0 (2025-06-16)
    // gemini-2.5-flash |
    // gpt-4.1 |

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";

    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            // Exact-match assertion: the edited file must equal the fixture.
            EvalAssertion::assert_eq(output_file_content),
        ))
    });
}
213
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_translate_doc_comments() {
    // Tests a file-wide comment rewrite; since no exact output can be pinned
    // down, an LLM judge grades the diff instead of a fixture comparison.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";

    // Note the required pass rate of 1.0 over 200 samples.
    eval_utils::eval(200, 1., mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ))
    });
}
274
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Tests editing a region of a large file that the agent has only read in
    // three windowed chunks (lines ~971-1150), never in full.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.96 (2025-06-14)
    // claude-sonnet-4 | 0.11 (2025-06-14)
    // gemini-2.5-pro-preview-latest | 0.99 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |

    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";

    // NOTE(review): the wasi-sdk asset list in the prompt below repeats the
    // x86_64/arm64 Linux entries — presumably unintentional; confirm whether
    // the duplication is part of the scenario before relying on it.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ))
    });
}
398
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Tests editing scattered regions of a very large file: the agent only
    // ever sees disjoint grep-result windows, never the whole file.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.59 (2025-07-14)
    // claude-sonnet-4 | 0.81 (2025-07-14)
    // gemini-2.5-pro | 0.95 (2025-07-14)
    // gemini-2.5-flash-preview-04-17 | 0.78 (2025-07-14)
    // gpt-4.1 | 0.00 (2025-07-14) (follows edit_description too literally)

    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    let possible_diffs = vec![
        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
    ];
    // Note the comparatively low required pass rate (0.51) for this eval.
    eval_utils::eval(100, 0.51, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        // Simulated grep output: several disjoint windows of
                        // the fixture, stitched together with a separator.
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    // NOTE(review): the id jumps from "tool_1" to "tool_4";
                    // ids only need to be unique, so this is harmless.
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
481
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    // Model | Date | Pass rate
    // =========================================================
    // claude-4.0-sonnet | 2025-06-14 | 0.99
    // claude-3.7-sonnet | 2025-06-14 | 0.88
    // gemini-2.5-pro-preview-06-05 | 2025-06-16 | 0.98
    // gpt-4.1 |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";

    // This eval tolerates a higher mismatched-tag ratio (0.25) than the rest.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.25), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // Two grep probes that deliberately find nothing, followed by
                // one that returns test-module matches from sibling files.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_weight() {
                                // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_stretch() {
                                // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                            }

                            #[test]
                            fn test_css_to_core_text_font_stretch() {
                                // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ))
    });
}
687
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_zode() {
    // Tests creating a brand-new file (`EditFileMode::Create`, no prior
    // content) and checks only that the created file doesn't start with
    // stray whitespace, backticks, or a blank line.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 | 1.0 (2025-05-22)
    // gpt-4.1 | 1.0 (2025-05-22)

    let input_file_path = "root/zode.py";
    // No prior file content: this eval creates the file from scratch.
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";

    eval_utils::eval(50, 1., mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content.clone(),
            // Custom assertion: fail if the created file begins with any of
            // the characters models tend to prepend by mistake.
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ))
    });
}
792
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_add_overwrite_test() {
    // Tests adding a new test to a large file where the agent initially sees
    // only a symbol outline and then reads a few targeted line ranges.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.65 (2025-06-14)
    // claude-sonnet-4 | 0.07 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 0.35 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |

    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";

    // 200 samples with a 0.5 required pass rate — this is a hard eval.
    eval_utils::eval(200, 0.5, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    // Simulated outline-mode read_file result: symbols with
                    // their line ranges rather than file contents.
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                            struct HunkStatus [L1538-1542]
                             range [L1539]
                             diff_status [L1540]
                             old_text [L1541]
                            fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ))
    });
}
1022
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.00 (2025-06-14)
    // claude-sonnet-4 | 1.00 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.00 (2025-05-21)
    // gemini-2.5-flash-preview-04-17 | 1.00 (2025-05-21)
    // gpt-4.1 | 1.00 (2025-05-21)
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"

    // New-file creation: no prior content, and the expected result is an
    // empty file.
    let input_file_content = None;
    let expected_output_content = String::new();

    eval_utils::eval(100, 0.99, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content.clone(),
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content.clone()),
        ))
    });
}
1099
/// Builds a [`LanguageModelRequestMessage`] with the given role and contents.
///
/// Caching and reasoning details are intentionally left unset: these
/// synthetic conversations are replayed from scratch for every eval sample.
fn message(
    role: Role,
    contents: impl IntoIterator<Item = MessageContent>,
) -> LanguageModelRequestMessage {
    LanguageModelRequestMessage {
        role,
        content: contents.into_iter().collect(),
        cache: false,
        reasoning_details: None,
    }
}
1111
1112fn text(text: impl Into<String>) -> MessageContent {
1113 MessageContent::Text(text.into())
1114}
1115
/// Returns the zero-indexed half-open `range` of lines from `input`,
/// joined with `\n` (no trailing newline). An empty or out-of-bounds
/// range yields an empty string.
fn lines(input: &str, range: Range<usize>) -> String {
    let mut selection = String::new();
    for (index, line) in input.lines().enumerate() {
        if index < range.start {
            continue;
        }
        if index >= range.end {
            break;
        }
        if !selection.is_empty() {
            selection.push('\n');
        }
        selection.push_str(line);
    }
    selection
}
1124
1125fn tool_use(
1126 id: impl Into<Arc<str>>,
1127 name: impl Into<Arc<str>>,
1128 input: impl Serialize,
1129) -> MessageContent {
1130 MessageContent::ToolUse(LanguageModelToolUse {
1131 id: LanguageModelToolUseId::from(id.into()),
1132 name: name.into(),
1133 raw_input: serde_json::to_string_pretty(&input).unwrap(),
1134 input: serde_json::to_value(input).unwrap(),
1135 is_input_complete: true,
1136 thought_signature: None,
1137 })
1138}
1139
/// Builds a successful (`is_error: false`) tool-result message content entry
/// that echoes the given text back to the model for the tool call `id`.
fn tool_result(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    result: impl Into<Arc<str>>,
) -> MessageContent {
    MessageContent::ToolResult(LanguageModelToolResult {
        tool_use_id: LanguageModelToolUseId::from(id.into()),
        tool_name: name.into(),
        is_error: false,
        content: LanguageModelToolResultContent::Text(result.into()),
        output: None,
    })
}
1153
/// A single eval case: a recorded conversation that ends with an `edit_file`
/// tool use, the file content the edit applies to, and the assertion that
/// grades the resulting edit.
#[derive(Clone)]
struct EvalInput {
    // Messages replayed to the model; the last one is the assistant's
    // edit_file tool call.
    conversation: Vec<LanguageModelRequestMessage>,
    // Parsed input of that trailing edit_file tool use.
    edit_file_input: EditFileToolInput,
    // File content before the edit; `None` when a new file is being created.
    input_content: Option<String>,
    // Grader applied to the edited result.
    assertion: EvalAssertion,
}
1161
1162impl EvalInput {
1163 fn from_conversation(
1164 conversation: Vec<LanguageModelRequestMessage>,
1165 input_content: Option<String>,
1166 assertion: EvalAssertion,
1167 ) -> Self {
1168 let msg = conversation.last().expect("Conversation must not be empty");
1169 if msg.role != Role::Assistant {
1170 panic!("Conversation must end with an assistant message");
1171 }
1172 let tool_use = msg
1173 .content
1174 .iter()
1175 .flat_map(|content| match content {
1176 MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1177 Some(tool_use)
1178 }
1179 _ => None,
1180 })
1181 .next()
1182 .expect("Conversation must end with an edit_file tool use")
1183 .clone();
1184
1185 let edit_file_input: EditFileToolInput = serde_json::from_value(tool_use.input).unwrap();
1186
1187 EvalInput {
1188 conversation,
1189 edit_file_input,
1190 input_content,
1191 assertion,
1192 }
1193 }
1194}
1195
/// The concrete result of running one agent edit.
#[derive(Clone)]
struct EvalSample {
    // Buffer text before the edit (empty when no input content was given).
    text_before: String,
    // Buffer text after the agent finished editing.
    text_after: String,
    // The agent's output, including parser metrics and the raw edit stream.
    edit_output: EditAgentOutput,
    // Unified diff from `text_before` to `text_after`.
    diff: String,
}
1203
/// Object-safe assertion interface used by `EvalAssertion`.
///
/// The `'a` lifetime lets implementations borrow both `sample` and the test
/// context for the duration of the returned (boxed, non-`Send`) future.
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1212
/// Blanket implementation so any matching `AsyncFn` closure can be used as an
/// assertion without a hand-written wrapper type.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        // Box the closure's future so it can be returned through the
        // object-safe trait method.
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1233
/// A cloneable, type-erased assertion that scores an `EvalSample` (see the
/// constructors in the `impl` below for the available assertion styles).
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1236
impl EvalAssertion {
    /// Wraps an async closure as an assertion.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Scores 100 when the edited text equals `expected`, 0 otherwise.
    /// Blank/whitespace-only lines are stripped from both sides before
    /// comparison.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Scores 100 when applying ANY of `expected_diffs` to the original text
    /// reproduces the edited text (ignoring blank lines), 0 otherwise.
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// Asks the judge model to grade the sample's diff against free-form
    /// `assertions`, expecting a `<score>N</score>` tag in the reply.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                    reasoning_details: None,
                }],
                thinking_allowed: true,
                ..Default::default()
            };
            // The judge call itself may be rate-limited, so retry it like the
            // agent requests.
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output)
                && let Some(score_match) = captures.get(1)
            {
                // A score that fails to parse (e.g. overflows) counts as 0.
                let score = score_match.as_str().parse().unwrap_or(0);
                return Ok(EvalAssertionOutcome {
                    score,
                    message: Some(output),
                });
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Runs this assertion against `input`, using `judge_model` for any
    /// LLM-backed grading.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1338
1339fn run_eval(eval: EvalInput) -> eval_utils::EvalOutput<EditEvalMetadata> {
1340 let dispatcher = gpui::TestDispatcher::new(StdRng::from_os_rng());
1341 let mut cx = TestAppContext::build(dispatcher, None);
1342 let result = cx.executor().block_test(async {
1343 let test = EditAgentTest::new(&mut cx).await;
1344 test.eval(eval, &mut cx).await
1345 });
1346 match result {
1347 Ok(output) => eval_utils::EvalOutput {
1348 data: output.to_string(),
1349 outcome: if output.assertion.score < 80 {
1350 eval_utils::OutcomeKind::Failed
1351 } else {
1352 eval_utils::OutcomeKind::Passed
1353 },
1354 metadata: EditEvalMetadata {
1355 tags: output.sample.edit_output.parser_metrics.tags,
1356 mismatched_tags: output.sample.edit_output.parser_metrics.mismatched_tags,
1357 },
1358 },
1359 Err(e) => eval_utils::EvalOutput {
1360 data: format!("{e:?}"),
1361 outcome: eval_utils::OutcomeKind::Error,
1362 metadata: EditEvalMetadata {
1363 tags: 0,
1364 mismatched_tags: 0,
1365 },
1366 },
1367 }
1368}
1369
/// The outcome of one eval run: the sample that was produced and the graded
/// assertion result.
#[derive(Clone)]
struct EditEvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}
1375
/// Human-readable report: score, optional judge message, the diff, the
/// parser metrics, and the raw edits emitted by the model.
impl Display for EditEvalOutput {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Score: {:?}", self.assertion.score)?;
        // Only judge-based assertions carry a message.
        if let Some(message) = self.assertion.message.as_ref() {
            writeln!(f, "Message: {}", message)?;
        }

        writeln!(f, "Diff:\n{}", self.sample.diff)?;

        writeln!(
            f,
            "Parser Metrics:\n{:#?}",
            self.sample.edit_output.parser_metrics
        )?;
        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
        Ok(())
    }
}
1394
/// Shared fixture for edit evals: the agent under test, the project it edits,
/// and the model used to judge results.
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}
1400
1401impl EditAgentTest {
1402 async fn new(cx: &mut TestAppContext) -> Self {
1403 cx.executor().allow_parking();
1404
1405 let fs = FakeFs::new(cx.executor());
1406 cx.update(|cx| {
1407 settings::init(cx);
1408 gpui_tokio::init(cx);
1409 let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1410 cx.set_http_client(http_client);
1411 let client = Client::production(cx);
1412 let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1413 settings::init(cx);
1414 language_model::init(client.clone(), cx);
1415 language_models::init(user_store, client.clone(), cx);
1416 });
1417
1418 fs.insert_tree("/root", json!({})).await;
1419 let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1420 let agent_model = SelectedModel::from_str(
1421 &std::env::var("ZED_AGENT_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
1422 )
1423 .unwrap();
1424 let judge_model = SelectedModel::from_str(
1425 &std::env::var("ZED_JUDGE_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
1426 )
1427 .unwrap();
1428
1429 let authenticate_provider_tasks = cx.update(|cx| {
1430 LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
1431 registry
1432 .providers()
1433 .iter()
1434 .map(|p| p.authenticate(cx))
1435 .collect::<Vec<_>>()
1436 })
1437 });
1438 let (agent_model, judge_model) = cx
1439 .update(|cx| {
1440 cx.spawn(async move |cx| {
1441 futures::future::join_all(authenticate_provider_tasks).await;
1442 let agent_model = Self::load_model(&agent_model, cx).await;
1443 let judge_model = Self::load_model(&judge_model, cx).await;
1444 (agent_model.unwrap(), judge_model.unwrap())
1445 })
1446 })
1447 .await;
1448 let action_log = cx.new(|_| ActionLog::new(project.clone()));
1449
1450 let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();
1451
1452 Self {
1453 agent: EditAgent::new(
1454 agent_model,
1455 project.clone(),
1456 action_log,
1457 Templates::new(),
1458 edit_format,
1459 ),
1460 project,
1461 judge_model,
1462 }
1463 }
1464
1465 async fn load_model(
1466 selected_model: &SelectedModel,
1467 cx: &mut AsyncApp,
1468 ) -> Result<Arc<dyn LanguageModel>> {
1469 cx.update(|cx| {
1470 let registry = LanguageModelRegistry::read_global(cx);
1471 let provider = registry
1472 .provider(&selected_model.provider)
1473 .expect("Provider not found");
1474 provider.authenticate(cx)
1475 })?
1476 .await?;
1477 cx.update(|cx| {
1478 let models = LanguageModelRegistry::read_global(cx);
1479 let model = models
1480 .available_models(cx)
1481 .find(|model| {
1482 model.provider_id() == selected_model.provider
1483 && model.id() == selected_model.model
1484 })
1485 .unwrap_or_else(|| panic!("Model {} not found", selected_model.model.0));
1486 model
1487 })
1488 }
1489
1490 async fn eval(&self, mut eval: EvalInput, cx: &mut TestAppContext) -> Result<EditEvalOutput> {
1491 // Make sure the last message in the conversation is cached.
1492 eval.conversation.last_mut().unwrap().cache = true;
1493
1494 let path = self
1495 .project
1496 .read_with(cx, |project, cx| {
1497 project.find_project_path(eval.edit_file_input.path, cx)
1498 })
1499 .unwrap();
1500 let buffer = self
1501 .project
1502 .update(cx, |project, cx| project.open_buffer(path, cx))
1503 .await
1504 .unwrap();
1505
1506 let tools = crate::built_in_tools().collect::<Vec<_>>();
1507
1508 let system_prompt = {
1509 let worktrees = vec![WorktreeContext {
1510 root_name: "root".to_string(),
1511 abs_path: Path::new("/path/to/root").into(),
1512 rules_file: None,
1513 }];
1514 let project_context = ProjectContext::new(worktrees, Vec::default());
1515 let tool_names = tools
1516 .iter()
1517 .map(|tool| tool.name.clone().into())
1518 .collect::<Vec<_>>();
1519 let template = crate::SystemPromptTemplate {
1520 project: &project_context,
1521 available_tools: tool_names,
1522 model_name: None,
1523 };
1524 let templates = Templates::new();
1525 template.render(&templates).unwrap()
1526 };
1527
1528 let has_system_prompt = eval
1529 .conversation
1530 .first()
1531 .is_some_and(|msg| msg.role == Role::System);
1532 let messages = if has_system_prompt {
1533 eval.conversation
1534 } else {
1535 [LanguageModelRequestMessage {
1536 role: Role::System,
1537 content: vec![MessageContent::Text(system_prompt)],
1538 cache: true,
1539 reasoning_details: None,
1540 }]
1541 .into_iter()
1542 .chain(eval.conversation)
1543 .collect::<Vec<_>>()
1544 };
1545
1546 let conversation = LanguageModelRequest {
1547 messages,
1548 tools,
1549 thinking_allowed: true,
1550 ..Default::default()
1551 };
1552
1553 let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1554 if let Some(input_content) = eval.input_content.as_deref() {
1555 buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1556 }
1557 retry_on_rate_limit(async || {
1558 self.agent
1559 .edit(
1560 buffer.clone(),
1561 eval.edit_file_input.display_description.clone(),
1562 &conversation,
1563 &mut cx.to_async(),
1564 )
1565 .0
1566 .await
1567 })
1568 .await?
1569 } else {
1570 retry_on_rate_limit(async || {
1571 self.agent
1572 .overwrite(
1573 buffer.clone(),
1574 eval.edit_file_input.display_description.clone(),
1575 &conversation,
1576 &mut cx.to_async(),
1577 )
1578 .0
1579 .await
1580 })
1581 .await?
1582 };
1583
1584 let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1585 let sample = EvalSample {
1586 edit_output,
1587 diff: language::unified_diff(
1588 eval.input_content.as_deref().unwrap_or_default(),
1589 &buffer_text,
1590 ),
1591 text_before: eval.input_content.unwrap_or_default(),
1592 text_after: buffer_text,
1593 };
1594 let assertion = eval
1595 .assertion
1596 .run(&sample, self.judge_model.clone(), cx)
1597 .await?;
1598
1599 Ok(EditEvalOutput { assertion, sample })
1600 }
1601}
1602
/// Invokes `request` repeatedly (up to 20 attempts), retrying on rate limits,
/// server overload, selected upstream HTTP statuses, and transient I/O or
/// internal-server errors. Any other outcome — success or a non-retryable
/// error — is returned immediately. Each retry delay gets random jitter of up
/// to 100% added on top.
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
    const MAX_RETRIES: usize = 20;
    let mut attempt = 0;

    loop {
        attempt += 1;
        let response = request().await;

        // On the final attempt, return whatever we got without inspecting it.
        if attempt >= MAX_RETRIES {
            return response;
        }

        let retry_delay = match &response {
            Ok(_) => None,
            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
                Some(err) => match &err {
                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
                        // Honor the server-provided delay; fall back to 5s.
                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
                    }
                    LanguageModelCompletionError::UpstreamProviderError {
                        status,
                        retry_after,
                        ..
                    } => {
                        // Only retry for specific status codes
                        // (429, 503, and Anthropic's non-standard 529).
                        let should_retry = matches!(
                            *status,
                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
                        ) || status.as_u16() == 529;

                        if should_retry {
                            // Use server-provided retry_after if available, otherwise use default
                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
                        } else {
                            None
                        }
                    }
                    LanguageModelCompletionError::ApiReadResponseError { .. }
                    | LanguageModelCompletionError::ApiInternalServerError { .. }
                    | LanguageModelCompletionError::HttpSend { .. } => {
                        // Exponential backoff for transient I/O and internal server errors
                        // (1s, 2s, 4s, ... capped at 30s before jitter).
                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
                    }
                    _ => None,
                },
                _ => None,
            },
        };

        if let Some(retry_after) = retry_delay {
            let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
            Timer::after(retry_after + jitter).await;
        } else {
            return response;
        }
    }
}
1662
/// The graded result of an assertion.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    // Programmatic assertions emit 0 or 100; judge-based assertions emit
    // whatever number the model reports in its `<score>` tag. `run_eval`
    // treats scores below 80 as failures.
    score: usize,
    // Raw judge output when an LLM produced the score; `None` for
    // programmatic assertions.
    message: Option<String>,
}
1668
/// Inputs for the diff-judging prompt rendered from `diff_judge.hbs`.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    // Unified diff of the agent's edit, shown to the judge model.
    diff: String,
    // Free-form assertions the judge checks the diff against.
    assertions: &'static str,
}
1674
// Binds the struct to its handlebars template file for rendering.
impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1678
/// Drops lines that are empty or whitespace-only and rejoins the remainder
/// with `\n`, so text comparisons ignore incidental blank-line differences.
fn strip_empty_lines(text: &str) -> String {
    let mut kept: Vec<&str> = Vec::new();
    for line in text.lines() {
        if !line.trim().is_empty() {
            kept.push(line);
        }
    }
    kept.join("\n")
}