1use super::*;
2use crate::{
3 EditFileMode, EditFileToolInput, GrepToolInput, ListDirectoryToolInput, ReadFileToolInput,
4};
5use Role::*;
6use client::{Client, UserStore};
7use collections::HashMap;
8use fs::FakeFs;
9use futures::{FutureExt, future::LocalBoxFuture};
10use gpui::{AppContext, TestAppContext, Timer};
11use http_client::StatusCode;
12use indoc::{formatdoc, indoc};
13use language_model::{
14 LanguageModelRegistry, LanguageModelToolResult, LanguageModelToolResultContent,
15 LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
16};
17use project::Project;
18use prompt_store::{ProjectContext, WorktreeContext};
19use rand::prelude::*;
20use reqwest_client::ReqwestClient;
21use serde_json::json;
22use std::{
23 cmp::Reverse,
24 fmt::{self, Display},
25 io::Write as _,
26 path::Path,
27 str::FromStr,
28 sync::mpsc,
29 time::Duration,
30};
31use util::path;
32
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // Model | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet | 0.99 (2025-06-14)
    // claude-sonnet-4 | 0.97 (2025-06-14)
    // gemini-2.5-pro-06-05 | 0.98 (2025-06-16)
    // gemini-2.5-flash | 0.11 (2025-05-22)
    // gpt-4.1 | 1.00 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // The edit passes if it reproduces *any one* of these hand-vetted diffs.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    // eval(samples, min pass rate, third arg presumably a mismatched-tag
    // tolerance — TODO confirm against `eval`'s definition).
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            // Canned transcript: user request -> assistant reads file ->
            // tool result with file body -> assistant issues `edit_file`.
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
109
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Tests a pure deletion edit (remove one function, leave callers alone).
    //
    // Model | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05 | 1.0 (2025-06-16)
    // gemini-2.5-flash |
    // gpt-4.1 |

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    // Exact expected result: compared with `assert_eq` (empty lines ignored).
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}
172
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    // Tests a whole-file comment rewrite; graded by an LLM judge rather than
    // an exact diff, since many translations are acceptable.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    // Note: 200 samples with a 1.0 required pass rate.
    eval(
        200,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}
235
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Tests a large multi-step rewrite where the assistant only ever saw the
    // file through three windowed `read_file` calls (lines 971-1150).
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.96 (2025-06-14)
    // claude-sonnet-4 | 0.11 (2025-06-14)
    // gemini-2.5-pro-preview-latest | 0.99 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |

    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                // NOTE(review): the asset list below repeats the
                // x86_64-linux and arm64-linux entries; looks like a fixture
                // typo, but the string is part of the prompt sent to the
                // model, so it is deliberately left untouched here.
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                // Three consecutive windowed reads; each tool result replays
                // the matching slice of the fixture via `lines`.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}
361
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Tests editing a very large file where the model only saw scattered
    // grep excerpts rather than the whole file.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.59 (2025-07-14)
    // claude-sonnet-4 | 0.81 (2025-07-14)
    // gemini-2.5-pro | 0.95 (2025-07-14)
    // gemini-2.5-flash-preview-04-17 | 0.78 (2025-07-14)
    // gpt-4.1 | 0.00 (2025-07-14) (follows edit_description too literally)

    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    let possible_diffs = vec![
        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
    ];
    eval(
        100,
        0.51,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        // Simulated grep output: several disjoint slices of
                        // the fixture. `join` inserts the separator *between*
                        // chunks, so the first slice has no "Match found:"
                        // header — NOTE(review): possibly intentional, verify
                        // against the real grep tool's output format.
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                // The tool id jumps from tool_1 to tool_4 — kept as-is; ids
                // only need to pair tool_use with its tool_result.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
447
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    // Model | Date | Pass rate
    // =========================================================
    // claude-4.0-sonnet | 2025-06-14 | 0.99
    // claude-3.7-sonnet | 2025-06-14 | 0.88
    // gemini-2.5-pro-preview-06-05 | 2025-06-16 | 0.98
    // gpt-4.1 |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing for now, let's see if we can bring this down.
        0.25,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // Two fruitless greps for an existing test module, then a
                // broader search for #[test] across the crate.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                // Simulated grep tool output showing test modules elsewhere
                // in the crate, to give the model a pattern to imitate.
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}
657
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    // Tests creating a brand-new file; the custom assertion below only checks
    // the created file doesn't start with junk (whitespace/fence/newline).
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 | 1.0 (2025-05-22)
    // gpt-4.1 | 1.0 (2025-05-22)

    let input_file_path = "root/zode.py";
    // No pre-existing file: this eval creates `zode.py` from scratch.
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        50,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                // A single assistant message may carry multiple tool uses;
                // the matching results arrive together in the next message.
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            // Custom assertion: fail (score 0) if the created file begins
            // with a space, backtick, or newline — symptoms of the model
            // leaking prose/markdown fences into the file.
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}
764
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    // Tests adding a new test fn to a large file that the model first
    // explored through an outline and a few windowed reads.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.65 (2025-06-14)
    // claude-sonnet-4 | 0.07 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 0.35 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |

    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                // Simulated outline-mode read_file result (symbols + line
                // ranges instead of file contents).
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                                tracked_buffers [L15]
                                edited_since_project_diagnostics_check [L17]
                                project [L19]
                            impl ActionLog [L22-498]
                                pub fn new [L24-30]
                                pub fn project [L32-34]
                                pub fn checked_project_diagnostics [L37-39]
                                pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                                fn track_buffer_internal [L46-101]
                                fn handle_buffer_event [L103-116]
                                fn handle_buffer_edited [L118-123]
                                fn handle_buffer_file_changed [L125-158]
                                async fn maintain_diff [L160-264]
                                pub fn buffer_read [L267-269]
                                pub fn buffer_created [L272-276]
                                pub fn buffer_edited [L279-287]
                                pub fn will_delete_buffer [L289-304]
                                pub fn keep_edits_in_range [L306-364]
                                pub fn reject_edits_in_ranges [L366-459]
                                pub fn keep_all_edits [L461-473]
                                pub fn changed_buffers [L476-482]
                                pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                                User [L618]
                                Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                                Created [L624]
                                Modified [L625]
                                Deleted [L626]
                            struct TrackedBuffer [L629-641]
                                buffer [L630]
                                base_text [L631]
                                unreviewed_changes [L632]
                                status [L633]
                                version [L634]
                                diff [L635]
                                snapshot [L636]
                                diff_update [L637]
                                _open_lsp_handle [L638]
                                _maintain_diff [L639]
                                _subscription [L640]
                            impl TrackedBuffer [L643-657]
                                fn has_changes [L644-650]
                                fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                                pub diff [L660]
                            mod tests [L664-1574]
                                fn init_logger [L678-682]
                                fn init_test [L684-691]
                                async fn test_keep_edits [L694-769]
                                async fn test_deletions [L772-854]
                                async fn test_overlapping_user_edits [L857-951]
                                async fn test_creating_files [L954-1010]
                                async fn test_deleting_files [L1013-1120]
                                async fn test_reject_edits [L1123-1255]
                                async fn test_reject_multiple_edits [L1258-1331]
                                async fn test_reject_deleted_file [L1334-1388]
                                async fn test_reject_created_file [L1391-1443]
                                async fn test_random_diffs [L1446-1535]
                                    fn quiesce [L1510-1534]
                                struct HunkStatus [L1538-1542]
                                    range [L1539]
                                    diff_status [L1540]
                                    old_text [L1541]
                                fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                // Windowed reads of the tests the model should imitate, plus
                // the `buffer_created` implementation itself.
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
996
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.00 (2025-06-14)
    // claude-sonnet-4 | 1.00 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.00 (2025-05-21)
    // gemini-2.5-flash-preview-04-17 | 1.00 (2025-05-21)
    // gpt-4.1 | 1.00 (2025-05-21)
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"

    // The file does not exist beforehand, and the created file must be empty.
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        100,
        0.99,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
1075
1076fn message(
1077 role: Role,
1078 contents: impl IntoIterator<Item = MessageContent>,
1079) -> LanguageModelRequestMessage {
1080 LanguageModelRequestMessage {
1081 role,
1082 content: contents.into_iter().collect(),
1083 cache: false,
1084 }
1085}
1086
1087fn text(text: impl Into<String>) -> MessageContent {
1088 MessageContent::Text(text.into())
1089}
1090
/// Returns the 0-indexed lines `range.start..range.end` of `input`,
/// re-joined with `\n` (no trailing newline). Out-of-bounds indices are
/// silently skipped.
fn lines(input: &str, range: Range<usize>) -> String {
    let selected: Vec<&str> = input
        .lines()
        .enumerate()
        .filter(|(ix, _)| range.contains(ix))
        .map(|(_, line)| line)
        .collect();
    selected.join("\n")
}
1099
1100fn tool_use(
1101 id: impl Into<Arc<str>>,
1102 name: impl Into<Arc<str>>,
1103 input: impl Serialize,
1104) -> MessageContent {
1105 MessageContent::ToolUse(LanguageModelToolUse {
1106 id: LanguageModelToolUseId::from(id.into()),
1107 name: name.into(),
1108 raw_input: serde_json::to_string_pretty(&input).unwrap(),
1109 input: serde_json::to_value(input).unwrap(),
1110 is_input_complete: true,
1111 })
1112}
1113
1114fn tool_result(
1115 id: impl Into<Arc<str>>,
1116 name: impl Into<Arc<str>>,
1117 result: impl Into<Arc<str>>,
1118) -> MessageContent {
1119 MessageContent::ToolResult(LanguageModelToolResult {
1120 tool_use_id: LanguageModelToolUseId::from(id.into()),
1121 tool_name: name.into(),
1122 is_error: false,
1123 content: LanguageModelToolResultContent::Text(result.into()),
1124 output: None,
1125 })
1126}
1127
/// One evaluation case: a canned conversation plus the assertion used to
/// grade the agent's resulting edit.
#[derive(Clone)]
struct EvalInput {
    /// Full request transcript, ending with an assistant `edit_file` tool use.
    conversation: Vec<LanguageModelRequestMessage>,
    /// Input of the final `edit_file` tool use, extracted from `conversation`.
    edit_file_input: EditFileToolInput,
    /// Contents of the target file before the edit; `None` when the eval
    /// creates a new file.
    input_content: Option<String>,
    /// How the resulting edit is scored.
    assertion: EvalAssertion,
}
1135
1136impl EvalInput {
1137 fn from_conversation(
1138 conversation: Vec<LanguageModelRequestMessage>,
1139 input_content: Option<String>,
1140 assertion: EvalAssertion,
1141 ) -> Self {
1142 let msg = conversation.last().expect("Conversation must not be empty");
1143 if msg.role != Role::Assistant {
1144 panic!("Conversation must end with an assistant message");
1145 }
1146 let tool_use = msg
1147 .content
1148 .iter()
1149 .flat_map(|content| match content {
1150 MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1151 Some(tool_use)
1152 }
1153 _ => None,
1154 })
1155 .next()
1156 .expect("Conversation must end with an edit_file tool use")
1157 .clone();
1158
1159 let edit_file_input: EditFileToolInput = serde_json::from_value(tool_use.input).unwrap();
1160
1161 EvalInput {
1162 conversation,
1163 edit_file_input,
1164 input_content,
1165 assertion,
1166 }
1167 }
1168}
1169
/// The outcome of a single eval iteration.
#[derive(Clone)]
struct EvalSample {
    /// Target file contents before the agent's edit.
    text_before: String,
    /// Target file contents after the agent's edit.
    text_after: String,
    /// Output produced by the edit agent for this iteration.
    edit_output: EditAgentOutput,
    /// Diff of the edit — presumably text_before -> text_after; produced by
    /// the eval driver (not visible here), confirm there.
    diff: String,
}
1177
/// Object-safe adapter for async assertion functions, letting
/// [`EvalAssertion`] store any matching closure behind `Arc<dyn AssertionFn>`.
trait AssertionFn: 'static + Send + Sync {
    /// Grades `sample`, optionally consulting `judge_model`, returning a
    /// boxed non-`Send` future (the eval runs on the test executor).
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1186
/// Blanket impl: any `AsyncFn` closure with the matching signature is an
/// [`AssertionFn`]; its future is boxed so it can be returned through the
/// object-safe trait method.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1207
/// Cloneable handle to an async grading function applied to each eval sample.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1210
impl EvalAssertion {
    /// Wraps an async closure as an assertion (via the `AssertionFn` blanket impl).
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Scores 100 if the edited text equals `expected` after blank lines are
    /// stripped from both sides, otherwise 0. Never consults the judge model.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Scores 100 if applying any one of `expected_diffs` to the pre-edit text
    /// reproduces the post-edit text (blank lines ignored), otherwise 0.
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                // Apply each candidate diff to the original text and compare
                // against what the agent actually produced.
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// Asks the judge model to grade the sample's diff against the given
    /// natural-language `assertions`, expecting a `<score>N</score>` tag
    /// somewhere in the streamed reply.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                thinking_allowed: true,
                ..Default::default()
            };
            // The judge is subject to the same provider rate limits as the
            // agent itself, so wrap the call in the shared retry helper.
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            // Accumulate the full streamed response before parsing.
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output)
                && let Some(score_match) = captures.get(1)
            {
                // An unparseable number inside the tag degrades to a score of
                // 0 rather than erroring out.
                let score = score_match.as_str().parse().unwrap_or(0);
                return Ok(EvalAssertionOutcome {
                    score,
                    message: Some(output),
                });
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Executes the wrapped assertion against `input`.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1311
/// Runs `eval` for `iterations` total attempts and panics if the observed
/// pass ratio falls below `expected_pass_ratio`, or if the ratio of
/// mismatched edit-parser tags exceeds `mismatched_tag_threshold`.
///
/// An iteration counts as failed when it errors or its assertion scores
/// below 80.
fn eval(
    iterations: usize,
    expected_pass_ratio: f32,
    mismatched_tag_threshold: f32,
    mut eval: EvalInput,
) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval so that
    // all the next ones are cached.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    // The remaining iterations run concurrently, capped at 32 in flight.
    let executor = gpui::background_executor();
    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        let semaphore = semaphore.clone();
        executor
            .spawn(async move {
                let _guard = semaphore.acquire().await;
                run_eval(eval, tx)
            })
            .detach();
    }
    // Drop our sender so `rx.recv()` disconnects once all spawned evals finish.
    drop(tx);

    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                // Scores below 80 count as failures; group them by resulting
                // buffer text so repeated failure modes collapse together.
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text_after.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        // Report the most frequent errors first.
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        // Report the most frequent failure modes first, one exemplar each.
        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    // Even when assertions pass, too many mismatched streaming tags indicate
    // the parser was fed malformed edits; fail loudly in that case too.
    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > mismatched_tag_threshold {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}
1403
1404fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
1405 let dispatcher = gpui::TestDispatcher::new(StdRng::from_os_rng());
1406 let mut cx = TestAppContext::build(dispatcher, None);
1407 let output = cx.executor().block_test(async {
1408 let test = EditAgentTest::new(&mut cx).await;
1409 test.eval(eval, &mut cx).await
1410 });
1411 tx.send(output).unwrap();
1412}
1413
/// A completed eval iteration: the sample the agent produced and how the
/// assertion scored it.
#[derive(Clone)]
struct EvalOutput {
    /// Before/after texts, diff, and raw agent output for this run.
    sample: EvalSample,
    /// The assertion's score (and optional judge transcript) for `sample`.
    assertion: EvalAssertionOutcome,
}
1419
1420impl Display for EvalOutput {
1421 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1422 writeln!(f, "Score: {:?}", self.assertion.score)?;
1423 if let Some(message) = self.assertion.message.as_ref() {
1424 writeln!(f, "Message: {}", message)?;
1425 }
1426
1427 writeln!(f, "Diff:\n{}", self.sample.diff)?;
1428
1429 writeln!(
1430 f,
1431 "Parser Metrics:\n{:#?}",
1432 self.sample.edit_output.parser_metrics
1433 )?;
1434 writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1435 Ok(())
1436 }
1437}
1438
/// Rewrites the current terminal line with a running pass-rate summary.
///
/// Uses `\r` plus the `\x1b[K` erase-line escape so successive calls overwrite
/// each other, and flushes stdout explicitly since no newline is printed.
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed = evaluated_count - failed_count;
    // Guard against 0/0 on the very first call, before any eval has finished.
    let percent_passed = if evaluated_count == 0 {
        0.0
    } else {
        passed as f64 / evaluated_count as f64
    } * 100.0;
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count, iterations, percent_passed
    );
    std::io::stdout().flush().unwrap();
}
1454
/// Harness pairing an `EditAgent` (backed by a real language model) with a
/// judge model used to score the resulting edits.
struct EditAgentTest {
    /// The edit agent under evaluation.
    agent: EditAgent,
    /// Project rooted at `/root` on a fake filesystem.
    project: Entity<Project>,
    /// Model handed to `EvalAssertion::run` to grade results.
    judge_model: Arc<dyn LanguageModel>,
}
1460
impl EditAgentTest {
    /// Builds a harness backed by a fake filesystem but a real HTTP client.
    /// Agent and judge models are chosen via the `ZED_AGENT_MODEL` /
    /// `ZED_JUDGE_MODEL` env vars ("provider/model" strings), both defaulting
    /// to `anthropic/claude-4-sonnet-latest`.
    async fn new(cx: &mut TestAppContext) -> Self {
        // Evals block on real network calls, so allow the executor to park.
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);

            client::init_settings(cx);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

            // NOTE(review): `settings::init` was already called above —
            // presumably redundant; confirm whether the double init matters.
            settings::init(cx);
            Project::init_settings(cx);
            language::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store, client.clone(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL").unwrap_or("anthropic/claude-4-sonnet-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL").unwrap_or("anthropic/claude-4-sonnet-latest".into()),
        )
        .unwrap();
        // Authenticate and resolve both models before constructing the agent;
        // either failing is fatal to the whole eval.
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();

        Self {
            agent: EditAgent::new(
                agent_model,
                project.clone(),
                action_log,
                Templates::new(),
                edit_format,
            ),
            project,
            judge_model,
        }
    }

    /// Authenticates `selected_model`'s provider, then looks the model up in
    /// the global registry. Panics if either the provider or the model is not
    /// available.
    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        cx.update(|cx| {
            let registry = LanguageModelRegistry::read_global(cx);
            let provider = registry
                .provider(&selected_model.provider)
                .expect("Provider not found");
            provider.authenticate(cx)
        })?
        .await?;
        cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .expect("Model not found");
            model
        })
    }

    /// Executes a single eval: opens the target buffer, replays the recorded
    /// conversation through the edit agent, then scores the result with the
    /// eval's assertion.
    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();

        let tools = crate::built_in_tools().collect::<Vec<_>>();

        // Render the same system prompt the real agent would use, built from
        // a minimal single-worktree project context and the tool names.
        let system_prompt = {
            let worktrees = vec![WorktreeContext {
                root_name: "root".to_string(),
                abs_path: Path::new("/path/to/root").into(),
                rules_file: None,
            }];
            let project_context = ProjectContext::new(worktrees, Vec::default());
            let tool_names = tools
                .iter()
                .map(|tool| tool.name.clone().into())
                .collect::<Vec<_>>();
            let template = crate::SystemPromptTemplate {
                project: &project_context,
                available_tools: tool_names,
            };
            let templates = Templates::new();
            template.render(&templates).unwrap()
        };

        // Only prepend the rendered system prompt when the recorded
        // conversation doesn't already start with a system message.
        let has_system_prompt = eval
            .conversation
            .first()
            .is_some_and(|msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let conversation = LanguageModelRequest {
            messages,
            tools,
            thinking_allowed: true,
            ..Default::default()
        };

        // `Edit` mode streams edits into the existing buffer (seeded with
        // `input_content` when provided); any other mode rewrites the buffer
        // wholesale via `overwrite`. Both paths retry on rate limits.
        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            retry_on_rate_limit(async || {
                self.agent
                    .edit(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        } else {
            retry_on_rate_limit(async || {
                self.agent
                    .overwrite(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        };

        // Snapshot the buffer and package everything for the assertion.
        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
}
1651
/// Repeatedly invokes `request`, retrying after a delay on rate limits,
/// server overload, selected upstream HTTP statuses, and transient I/O or
/// internal server errors. Gives up after 20 attempts and returns the last
/// response, whatever it was.
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
    const MAX_RETRIES: usize = 20;
    let mut attempt = 0;

    loop {
        attempt += 1;
        let response = request().await;

        // Out of attempts: surface the final result, success or failure.
        if attempt >= MAX_RETRIES {
            return response;
        }

        // Decide whether (and how long) to wait; `None` means don't retry.
        let retry_delay = match &response {
            Ok(_) => None,
            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
                Some(err) => match &err {
                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
                        // Honor the server-suggested delay, defaulting to 5s.
                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
                    }
                    LanguageModelCompletionError::UpstreamProviderError {
                        status,
                        retry_after,
                        ..
                    } => {
                        // Only retry for specific status codes
                        // (429, 503, and the non-standard 529).
                        let should_retry = matches!(
                            *status,
                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
                        ) || status.as_u16() == 529;

                        if should_retry {
                            // Use server-provided retry_after if available, otherwise use default
                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
                        } else {
                            None
                        }
                    }
                    LanguageModelCompletionError::ApiReadResponseError { .. }
                    | LanguageModelCompletionError::ApiInternalServerError { .. }
                    | LanguageModelCompletionError::HttpSend { .. } => {
                        // Exponential backoff for transient I/O and internal server errors
                        // (1s, 2s, 4s, ... capped at 30s).
                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
                    }
                    _ => None,
                },
                _ => None,
            },
        };

        if let Some(retry_after) = retry_delay {
            // Add up to 100% random jitter so concurrent evals don't retry in
            // lockstep.
            let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
            Timer::after(retry_after + jitter).await;
        } else {
            return response;
        }
    }
}
1711
/// Result of running an `EvalAssertion`: a numeric score plus an optional
/// transcript of the judge model's reasoning.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    /// 0–100; exact-match assertions produce only 0 or 100, and the eval
    /// harness treats anything below 80 as a failure.
    score: usize,
    /// Raw judge output when a judge model produced the score.
    message: Option<String>,
}
1717
/// Template data rendered into the prompt that asks a judge model to grade a
/// diff against a set of natural-language assertions.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    /// Unified diff of the agent's edit, shown to the judge.
    diff: String,
    /// Natural-language criteria the judge should check the diff against.
    assertions: &'static str,
}
1723
impl Template for DiffJudgeTemplate {
    // Rendered via the Handlebars template of the same name.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1727
/// Collapses `text` to only its non-blank lines, joined by `\n`.
///
/// Lines consisting solely of whitespace are dropped entirely, so the result
/// never contains consecutive newlines and has no trailing newline.
fn strip_empty_lines(text: &str) -> String {
    let mut stripped = String::with_capacity(text.len());
    for line in text.lines() {
        if line.trim().is_empty() {
            continue;
        }
        if !stripped.is_empty() {
            stripped.push('\n');
        }
        stripped.push_str(line);
    }
    stripped
}