1use super::*;
2use crate::{
3 ReadFileToolInput,
4 edit_file_tool::{EditFileMode, EditFileToolInput},
5 grep_tool::GrepToolInput,
6 list_directory_tool::ListDirectoryToolInput,
7};
8use Role::*;
9use assistant_tool::ToolRegistry;
10use client::{Client, UserStore};
11use collections::HashMap;
12use fs::FakeFs;
13use futures::{FutureExt, future::LocalBoxFuture};
14use gpui::{AppContext, TestAppContext};
15use indoc::{formatdoc, indoc};
16use language_model::{
17 LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
18 LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
19};
20use project::Project;
21use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
22use rand::prelude::*;
23use reqwest_client::ReqwestClient;
24use serde_json::json;
25use std::{
26 cmp::Reverse,
27 fmt::{self, Display},
28 io::Write as _,
29 str::FromStr,
30 sync::mpsc,
31};
32use util::path;
33
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // Model             | Pass rate
    // ------------------|----------
    // claude-3.7-sonnet | 0.98
    // gemini-2.5-pro    | 0.86
    // gemini-2.5-flash  | 0.11
    // gpt-4.1           | 1.00

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // Several placements/formattings of the extracted method are acceptable;
    // a sample passes if it matches any one of these diffs.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100, // iterations
        0.7, // Taking the lower bar for Gemini
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                // Pre-recorded tool round: the agent reads the whole file first.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // The final edit_file tool use is what the eval harness replays.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
108
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Checks the agent can delete a single function without touching its
    // call sites; graded by exact equality with the `after.rs` fixture.
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}
162
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    // Broad rewrite across the whole file; no single "after" fixture exists,
    // so the result is graded by an LLM judge on the produced diff.
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200, // iterations
        1.,  // expected pass ratio
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}
215
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // The agent only ever sees three windowed chunks of the file (lines
    // 971-1150), mimicking how a model explores a large file, before it
    // issues the edit. Graded by an LLM judge on the produced diff.
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    // Fix: the asset list previously repeated the
                    // x86_64-linux and arm64-linux entries; each asset is
                    // now listed exactly once.
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}
331
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // The agent first "researches" via a grep tool round whose result is a
    // synthetic concatenation of several regions of the fixture, then is asked
    // to comment out BlinkManager interactions. Graded by an LLM judge.
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        // Simulated grep output: disjoint slices of the fixture
                        // joined with the grep tool's match separator.
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
        ),
    );
}
405
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // The conversation includes grep rounds that fail to find an existing
    // `mod tests` in the target file, steering the model to add both the
    // constructor and a brand-new test module. Graded by an LLM judge.
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    // Canned grep output showing how sibling files structure
                    // their test modules, which the model should imitate.
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_weight() {
                                // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_stretch() {
                                // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                            }

                            #[test]
                            fn test_css_to_core_text_font_stretch() {
                                // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}
596
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    // File-creation eval: the agent must write a brand-new script. The custom
    // assertion only checks the created file doesn't start with stray
    // whitespace, a code fence, or a blank line.
    let input_file_path = "root/zode.py";
    let input_content = None; // the file does not exist yet
    let edit_description = "Create the main Zode CLI script";
    eval(
        200, // iterations
        1.,  // expected pass ratio
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                // The agent reads two reference files in a single turn.
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                // Characters that indicate the model leaked formatting or
                // prose into the created file.
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}
693
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    // The first read_file round returns a symbol outline instead of file
    // contents, so the agent must issue targeted line-range reads before
    // editing. Graded by an LLM judge on the produced diff.
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200, // iterations
        0.5, // TODO: make this eval better
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    // Canned outline of the fixture, as produced by the
                    // read_file tool for large files.
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                             struct HunkStatus [L1538-1542]
                              range [L1539]
                              diff_status [L1540]
                              old_text [L1541]
                             fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
915
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    //
    // Model                          | Pass rate
    // ============================================
    //
    // --------------------------------------------
    // Prompt version: 2025-05-21
    // --------------------------------------------
    //
    // claude-3.7-sonnet              | 1.00
    // gemini-2.5-pro-preview-03-25   | 1.00
    // gemini-2.5-flash-preview-04-17 | 1.00
    // gpt-4.1                        | 1.00
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    let input_file_content = None; // the file does not exist yet
    let expected_output_content = String::new(); // must stay completely empty
    eval(
        100,  // iterations
        0.99, // expected pass ratio
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
997
998fn message(
999 role: Role,
1000 contents: impl IntoIterator<Item = MessageContent>,
1001) -> LanguageModelRequestMessage {
1002 LanguageModelRequestMessage {
1003 role,
1004 content: contents.into_iter().collect(),
1005 cache: false,
1006 }
1007}
1008
1009fn text(text: impl Into<String>) -> MessageContent {
1010 MessageContent::Text(text.into())
1011}
1012
/// Returns the lines of `input` whose zero-based indices fall in `range`,
/// joined with `\n` (no trailing newline). An empty or inverted range
/// yields an empty string.
fn lines(input: &str, range: Range<usize>) -> String {
    let mut selected = Vec::new();
    for (index, line) in input.lines().enumerate() {
        if index >= range.end {
            break;
        }
        if index >= range.start {
            selected.push(line);
        }
    }
    selected.join("\n")
}
1021
1022fn tool_use(
1023 id: impl Into<Arc<str>>,
1024 name: impl Into<Arc<str>>,
1025 input: impl Serialize,
1026) -> MessageContent {
1027 MessageContent::ToolUse(LanguageModelToolUse {
1028 id: LanguageModelToolUseId::from(id.into()),
1029 name: name.into(),
1030 raw_input: serde_json::to_string_pretty(&input).unwrap(),
1031 input: serde_json::to_value(input).unwrap(),
1032 is_input_complete: true,
1033 })
1034}
1035
1036fn tool_result(
1037 id: impl Into<Arc<str>>,
1038 name: impl Into<Arc<str>>,
1039 result: impl Into<Arc<str>>,
1040) -> MessageContent {
1041 MessageContent::ToolResult(LanguageModelToolResult {
1042 tool_use_id: LanguageModelToolUseId::from(id.into()),
1043 tool_name: name.into(),
1044 is_error: false,
1045 content: LanguageModelToolResultContent::Text(result.into()),
1046 output: None,
1047 })
1048}
1049
/// A single eval scenario: a recorded conversation that ends in an
/// `edit_file` tool use, plus the assertion used to grade the result.
#[derive(Clone)]
struct EvalInput {
    // Full request history replayed to the model under test.
    conversation: Vec<LanguageModelRequestMessage>,
    // Parsed input of the final `edit_file` tool use in `conversation`.
    edit_file_input: EditFileToolInput,
    // File contents before the edit; `None` when the file is being created.
    input_content: Option<String>,
    // How to score the sample produced by the edit agent.
    assertion: EvalAssertion,
}
1057
1058impl EvalInput {
1059 fn from_conversation(
1060 conversation: Vec<LanguageModelRequestMessage>,
1061 input_content: Option<String>,
1062 assertion: EvalAssertion,
1063 ) -> Self {
1064 let msg = conversation.last().expect("Conversation must not be empty");
1065 if msg.role != Role::Assistant {
1066 panic!("Conversation must end with an assistant message");
1067 }
1068 let tool_use = msg
1069 .content
1070 .iter()
1071 .flat_map(|content| match content {
1072 MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1073 Some(tool_use)
1074 }
1075 _ => None,
1076 })
1077 .next()
1078 .expect("Conversation must end with an edit_file tool use")
1079 .clone();
1080
1081 let edit_file_input: EditFileToolInput =
1082 serde_json::from_value(tool_use.input.clone()).unwrap();
1083
1084 EvalInput {
1085 conversation,
1086 edit_file_input,
1087 input_content,
1088 assertion,
1089 }
1090 }
1091}
1092
/// The outcome of running the edit agent once on an `EvalInput`.
#[derive(Clone)]
struct EvalSample {
    // Buffer text before the agent's edits were applied.
    text_before: String,
    // Buffer text after the agent's edits were applied.
    text_after: String,
    // Raw edit-agent output, including parser metrics aggregated by `eval`.
    edit_output: EditAgentOutput,
    // Rendered diff of the change; fed to the LLM judge in `judge_diff`.
    diff: String,
}
1100
/// Object-safe wrapper around an async grading function, so `EvalAssertion`
/// can hold any assertion closure behind an `Arc<dyn AssertionFn>`.
trait AssertionFn: 'static + Send + Sync {
    // Grades `sample`, optionally consulting `judge_model`; returns a boxed
    // non-Send future because it runs on the test's local executor.
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1109
// Blanket implementation: any async closure with the matching signature can
// serve as an assertion without a hand-written adapter type.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        // Box the future locally so the trait can remain object-safe.
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1130
/// A cloneable, type-erased grading function applied to each eval sample.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1133
impl EvalAssertion {
    /// Wraps an arbitrary async grading closure into an `EvalAssertion`.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Scores 100 iff the edited text equals `expected` after both sides are
    /// run through `strip_empty_lines`; otherwise scores 0.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Scores 100 iff applying any one of `expected_diffs` to the original
    /// text reproduces the edited text (compared via `strip_empty_lines`).
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// Asks the judge model to grade the sample's diff against the given
    /// natural-language assertions; the judge's reply must contain a
    /// `<score>N</score>` tag, which becomes the outcome's score.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                ..Default::default()
            };
            // Stream and accumulate the judge's full text response.
            let mut response = judge
                .stream_completion_text(request, &cx.to_async())
                .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output) {
                if let Some(score_match) = captures.get(1) {
                    // A malformed number falls back to a failing score of 0.
                    let score = score_match.as_str().parse().unwrap_or(0);
                    return Ok(EvalAssertionOutcome {
                        score,
                        message: Some(output),
                    });
                }
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Runs this assertion against a single sample.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1230
/// Runs `eval` `iterations` times (one warm-up run synchronously, the rest in
/// parallel on the background executor) and panics if the fraction of passing
/// runs falls below `expected_pass_ratio`, or if more than 5% of edit tags
/// were mismatched during parsing.
fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval so that
    // all the next ones are cached.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    // Run the remaining iterations concurrently on the background executor.
    let executor = gpui::background_executor();
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        executor.spawn(async move { run_eval(eval, tx) }).detach();
    }
    // Drop the original sender so `rx.recv()` disconnects once every spawned
    // eval has finished and dropped its clone.
    drop(tx);

    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                // A judge score below 80 counts as a failure; group failures by
                // the resulting buffer text so identical outcomes report once.
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text_after.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                // Hard errors (as opposed to low scores) are tallied by their
                // Debug representation.
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        // Report the most frequent errors first.
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        // Print one representative output per distinct failing buffer state,
        // most frequent first.
        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    // Even when the pass ratio is acceptable, fail if more than 5% of the
    // parsed edit tags were mismatched.
    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > 0.05 {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}
1310
1311fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
1312 let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
1313 let mut cx = TestAppContext::build(dispatcher, None);
1314 let output = cx.executor().block_test(async {
1315 let test = EditAgentTest::new(&mut cx).await;
1316 test.eval(eval, &mut cx).await
1317 });
1318 tx.send(output).unwrap();
1319}
1320
/// The result of a single eval run: the produced sample plus the judge's
/// verdict on it.
#[derive(Clone)]
struct EvalOutput {
    /// The evaluated sample (edit output, diff, and before/after buffer text).
    sample: EvalSample,
    /// The assertion outcome (score and optional judge message).
    assertion: EvalAssertionOutcome,
}
1326
1327impl Display for EvalOutput {
1328 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1329 writeln!(f, "Score: {:?}", self.assertion.score)?;
1330 if let Some(message) = self.assertion.message.as_ref() {
1331 writeln!(f, "Message: {}", message)?;
1332 }
1333
1334 writeln!(f, "Diff:\n{}", self.sample.diff)?;
1335
1336 writeln!(
1337 f,
1338 "Parser Metrics:\n{:#?}",
1339 self.sample.edit_output.parser_metrics
1340 )?;
1341 writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1342 Ok(())
1343 }
1344}
1345
/// Prints a single-line progress report (`\r` + clear-to-end-of-line escape
/// overwrites the previous report in place) showing how many evals have run
/// out of `iterations` and the percentage of them that passed so far.
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    // `saturating_sub` guards against underflow (a debug-build panic or a
    // release-build wraparound) if a caller ever reports more failures than
    // evaluations.
    let passed_count = evaluated_count.saturating_sub(failed_count);
    let passed_ratio = if evaluated_count == 0 {
        // Avoid 0/0 before the first eval completes.
        0.0
    } else {
        passed_count as f64 / evaluated_count as f64
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    // `print!` doesn't flush, so force the partial line to appear immediately.
    std::io::stdout().flush().unwrap();
}
1361
/// Test harness bundling the edit agent under evaluation, the project it
/// operates on, and the model used to judge its output.
struct EditAgentTest {
    /// The edit agent whose output is being evaluated.
    agent: EditAgent,
    /// The (fake-filesystem-backed) project the agent edits.
    project: Entity<Project>,
    /// The language model used to score eval samples.
    judge_model: Arc<dyn LanguageModel>,
}
1367
1368impl EditAgentTest {
1369 async fn new(cx: &mut TestAppContext) -> Self {
1370 cx.executor().allow_parking();
1371
1372 let fs = FakeFs::new(cx.executor().clone());
1373 cx.update(|cx| {
1374 settings::init(cx);
1375 gpui_tokio::init(cx);
1376 let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1377 cx.set_http_client(http_client);
1378
1379 client::init_settings(cx);
1380 let client = Client::production(cx);
1381 let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1382
1383 settings::init(cx);
1384 Project::init_settings(cx);
1385 language::init(cx);
1386 language_model::init(client.clone(), cx);
1387 language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
1388 crate::init(client.http_client(), cx);
1389 });
1390
1391 fs.insert_tree("/root", json!({})).await;
1392 let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1393 let agent_model = SelectedModel::from_str(
1394 &std::env::var("ZED_AGENT_MODEL")
1395 .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1396 )
1397 .unwrap();
1398 let judge_model = SelectedModel::from_str(
1399 &std::env::var("ZED_JUDGE_MODEL")
1400 .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1401 )
1402 .unwrap();
1403 let (agent_model, judge_model) = cx
1404 .update(|cx| {
1405 cx.spawn(async move |cx| {
1406 let agent_model = Self::load_model(&agent_model, cx).await;
1407 let judge_model = Self::load_model(&judge_model, cx).await;
1408 (agent_model.unwrap(), judge_model.unwrap())
1409 })
1410 })
1411 .await;
1412 let action_log = cx.new(|_| ActionLog::new(project.clone()));
1413
1414 Self {
1415 agent: EditAgent::new(agent_model, project.clone(), action_log, Templates::new()),
1416 project,
1417 judge_model,
1418 }
1419 }
1420
1421 async fn load_model(
1422 selected_model: &SelectedModel,
1423 cx: &mut AsyncApp,
1424 ) -> Result<Arc<dyn LanguageModel>> {
1425 let (provider, model) = cx.update(|cx| {
1426 let models = LanguageModelRegistry::read_global(cx);
1427 let model = models
1428 .available_models(cx)
1429 .find(|model| {
1430 model.provider_id() == selected_model.provider
1431 && model.id() == selected_model.model
1432 })
1433 .unwrap();
1434 let provider = models.provider(&model.provider_id()).unwrap();
1435 (provider, model)
1436 })?;
1437 cx.update(|cx| provider.authenticate(cx))?.await?;
1438 Ok(model)
1439 }
1440
1441 async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
1442 let path = self
1443 .project
1444 .read_with(cx, |project, cx| {
1445 project.find_project_path(eval.edit_file_input.path, cx)
1446 })
1447 .unwrap();
1448 let buffer = self
1449 .project
1450 .update(cx, |project, cx| project.open_buffer(path, cx))
1451 .await
1452 .unwrap();
1453 let tools = cx.update(|cx| {
1454 ToolRegistry::default_global(cx)
1455 .tools()
1456 .into_iter()
1457 .filter_map(|tool| {
1458 let input_schema = tool
1459 .input_schema(self.agent.model.tool_input_format())
1460 .ok()?;
1461 Some(LanguageModelRequestTool {
1462 name: tool.name(),
1463 description: tool.description(),
1464 input_schema,
1465 })
1466 })
1467 .collect::<Vec<_>>()
1468 });
1469 let tool_names = tools
1470 .iter()
1471 .map(|tool| tool.name.clone())
1472 .collect::<Vec<_>>();
1473 let worktrees = vec![WorktreeContext {
1474 root_name: "root".to_string(),
1475 rules_file: None,
1476 }];
1477 let prompt_builder = PromptBuilder::new(None)?;
1478 let project_context = ProjectContext::new(worktrees, Vec::default());
1479 let system_prompt = prompt_builder.generate_assistant_system_prompt(
1480 &project_context,
1481 &ModelContext {
1482 available_tools: tool_names,
1483 },
1484 )?;
1485
1486 let has_system_prompt = eval
1487 .conversation
1488 .first()
1489 .map_or(false, |msg| msg.role == Role::System);
1490 let messages = if has_system_prompt {
1491 eval.conversation
1492 } else {
1493 [LanguageModelRequestMessage {
1494 role: Role::System,
1495 content: vec![MessageContent::Text(system_prompt)],
1496 cache: true,
1497 }]
1498 .into_iter()
1499 .chain(eval.conversation)
1500 .collect::<Vec<_>>()
1501 };
1502
1503 let conversation = LanguageModelRequest {
1504 messages,
1505 tools,
1506 ..Default::default()
1507 };
1508
1509 let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1510 if let Some(input_content) = eval.input_content.as_deref() {
1511 buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1512 }
1513 let (edit_output, _) = self.agent.edit(
1514 buffer.clone(),
1515 eval.edit_file_input.display_description,
1516 &conversation,
1517 &mut cx.to_async(),
1518 );
1519 edit_output.await?
1520 } else {
1521 let (edit_output, _) = self.agent.overwrite(
1522 buffer.clone(),
1523 eval.edit_file_input.display_description,
1524 &conversation,
1525 &mut cx.to_async(),
1526 );
1527 edit_output.await?
1528 };
1529
1530 let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1531 let sample = EvalSample {
1532 edit_output,
1533 diff: language::unified_diff(
1534 eval.input_content.as_deref().unwrap_or_default(),
1535 &buffer_text,
1536 ),
1537 text_before: eval.input_content.unwrap_or_default(),
1538 text_after: buffer_text,
1539 };
1540 let assertion = eval
1541 .assertion
1542 .run(&sample, self.judge_model.clone(), cx)
1543 .await?;
1544
1545 Ok(EvalOutput { assertion, sample })
1546 }
1547}
1548
/// The judge's verdict for one eval sample.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    /// Score assigned to the sample; scores below 80 are treated as failures
    /// by the eval harness.
    score: usize,
    /// Optional explanatory text accompanying the score (e.g. the judge
    /// model's raw response).
    message: Option<String>,
}
1554
/// Template data for rendering the judge prompt that asks a model to score a
/// diff against a set of assertions.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    /// The unified diff produced by the agent's edits.
    diff: String,
    /// The assertions the judge should evaluate the diff against.
    assertions: &'static str,
}
1560
impl Template for DiffJudgeTemplate {
    // Handlebars template file used to render the judge prompt.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1564
/// Returns `text` with every blank (empty or whitespace-only) line removed,
/// joining the remaining lines with `\n`. Any trailing newline is dropped.
fn strip_empty_lines(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    for line in text.lines() {
        if line.trim().is_empty() {
            continue;
        }
        if !result.is_empty() {
            result.push('\n');
        }
        result.push_str(line);
    }
    result
}