1use super::*;
2use crate::{
3 ReadFileToolInput,
4 edit_file_tool::{EditFileMode, EditFileToolInput},
5 grep_tool::GrepToolInput,
6 list_directory_tool::ListDirectoryToolInput,
7};
8use Role::*;
9use assistant_tool::ToolRegistry;
10use client::{Client, UserStore};
11use collections::HashMap;
12use fs::FakeFs;
13use futures::{FutureExt, future::LocalBoxFuture};
14use gpui::{AppContext, TestAppContext};
15use indoc::{formatdoc, indoc};
16use language_model::{
17 LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
18 LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
19};
20use project::Project;
21use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
22use rand::prelude::*;
23use reqwest_client::ReqwestClient;
24use serde_json::json;
25use std::{
26 cmp::Reverse,
27 fmt::{self, Display},
28 io::Write as _,
29 str::FromStr,
30 sync::mpsc,
31};
32use util::path;
33
/// Eval: extract a `handle_command_output` method out of `run_git_blame`.
///
/// Replays a recorded read-then-edit conversation and requires 95% of 100
/// iterations to produce output exactly equal to the `after.rs` fixture
/// (modulo empty lines — see `EvalAssertion::assert_eq`).
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    let output_file_content = include_str!("evals/fixtures/extract_handle_command_output/after.rs");
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                // The assistant reads the whole file...
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // ...then issues the edit that the edit agent will replay.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}
90
/// Eval: delete the `run_git_blame` function (and nothing else).
///
/// Requires 95% of 100 iterations to exactly match the `after.rs` fixture.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}
144
/// Eval: translate all doc comments in a file to Italian.
///
/// Graded by an LLM judge on the resulting diff rather than exact output;
/// requires a 100% pass rate over 200 iterations.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200,
        1.,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}
197
/// Eval: rewrite `compile_parser_to_wasm` to use wasi-sdk instead of
/// emscripten, replaying three consecutive partial reads of the file before
/// the edit. Graded by an LLM judge on the diff.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                // NOTE(review): the x86_64-linux and arm64-linux assets appear
                // twice in this prompt — presumably unintentional; confirm
                // whether the duplication is part of the recorded conversation.
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                // NOTE(review): `lines()` is 0-indexed and half-open while
                // `ReadFileToolInput` line numbers look 1-based — the recorded
                // tool results may be off by one relative to the tool inputs;
                // confirm this matches the original recording.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}
313
/// Eval: comment out all interactions with `BlinkManager` in a large
/// editor file, given only scattered grep context instead of a full read.
/// Graded by an LLM judge on the diff.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                // Simulated grep output: disjoint slices of the fixture file,
                // separated by "Match found:" markers.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                // A second user turn with the actual edit instructions.
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                // NOTE(review): the tool id jumps from "tool_1" to "tool_4" —
                // presumably an artifact of the recorded conversation.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
        ),
    );
}
387
/// Eval: add a `from_pixels` constructor to `Canvas` plus tests, after the
/// assistant probed for an existing test module via several grep calls.
/// Graded by an LLM judge on the diff.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // Two grep probes for an existing `mod tests` come up empty...
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                // ...so the assistant greps for `#[test]` to learn the
                // project's test conventions from sibling files.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}
578
/// Eval: create a brand-new file (`EditFileMode::Create`) from a prompt.
///
/// The custom assertion only checks that the created file does not begin
/// with stray whitespace, a backtick, or a newline — i.e. the agent didn't
/// leak chatter or markdown fences into the file. Requires 100% over 200
/// iterations.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    let input_file_path = "root/zode.py";
    // The file doesn't exist yet — this eval exercises file creation.
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        200,
        1.,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                // The assistant reads two reference files in a single turn.
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                // Any of these first characters means the model leaked
                // formatting (indentation, a markdown fence, or a blank line).
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}
675
/// Eval: add a new test to `action_log.rs` that exercises overwriting an
/// existing file via `buffer_created`, given an outline plus three targeted
/// reads. Graded by an LLM judge; only requires a 50% pass rate for now.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                // The whole-file read returns an outline (the file is too big
                // to return verbatim), prompting follow-up ranged reads.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                            struct HunkStatus [L1538-1542]
                             range [L1539]
                             diff_status [L1540]
                             old_text [L1541]
                            fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
897
/// Eval: create an empty file without leaking the model's "thoughts" into
/// it — the expected output is the empty string. Requires 99% over 100
/// iterations.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    //
    // Model                            | Pass rate
    // ============================================
    //
    // --------------------------------------------
    // Prompt version: 2025-05-21
    // --------------------------------------------
    //
    // claude-3.7-sonnet                | 1.00
    // gemini-2.5-pro-preview-03-25     | 1.00
    // gemini-2.5-flash-preview-04-17   | 1.00
    // gpt-4.1                          | 1.00
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        100,
        0.99,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
979
980fn message(
981 role: Role,
982 contents: impl IntoIterator<Item = MessageContent>,
983) -> LanguageModelRequestMessage {
984 LanguageModelRequestMessage {
985 role,
986 content: contents.into_iter().collect(),
987 cache: false,
988 }
989}
990
991fn text(text: impl Into<String>) -> MessageContent {
992 MessageContent::Text(text.into())
993}
994
/// Returns the 0-indexed, half-open line `range` of `input`, joined with
/// newlines (no trailing newline). Out-of-bounds ranges yield an empty
/// string rather than panicking.
fn lines(input: &str, range: Range<usize>) -> String {
    let selected: Vec<&str> = input.lines().take(range.end).skip(range.start).collect();
    selected.join("\n")
}
1003
1004fn tool_use(
1005 id: impl Into<Arc<str>>,
1006 name: impl Into<Arc<str>>,
1007 input: impl Serialize,
1008) -> MessageContent {
1009 MessageContent::ToolUse(LanguageModelToolUse {
1010 id: LanguageModelToolUseId::from(id.into()),
1011 name: name.into(),
1012 raw_input: serde_json::to_string_pretty(&input).unwrap(),
1013 input: serde_json::to_value(input).unwrap(),
1014 is_input_complete: true,
1015 })
1016}
1017
1018fn tool_result(
1019 id: impl Into<Arc<str>>,
1020 name: impl Into<Arc<str>>,
1021 result: impl Into<Arc<str>>,
1022) -> MessageContent {
1023 MessageContent::ToolResult(LanguageModelToolResult {
1024 tool_use_id: LanguageModelToolUseId::from(id.into()),
1025 tool_name: name.into(),
1026 is_error: false,
1027 content: LanguageModelToolResultContent::Text(result.into()),
1028 output: None,
1029 })
1030}
1031
/// A fully-specified eval case: a recorded conversation ending in an
/// `edit_file` tool call, plus the assertion used to grade the result.
#[derive(Clone)]
struct EvalInput {
    // The conversation to replay; must end with an assistant `edit_file` tool use.
    conversation: Vec<LanguageModelRequestMessage>,
    // The parsed input of that final `edit_file` tool call.
    edit_file_input: EditFileToolInput,
    // Initial content of the file being edited; `None` when creating a new file.
    input_content: Option<String>,
    // How to score the resulting buffer.
    assertion: EvalAssertion,
}
1039
1040impl EvalInput {
1041 fn from_conversation(
1042 conversation: Vec<LanguageModelRequestMessage>,
1043 input_content: Option<String>,
1044 assertion: EvalAssertion,
1045 ) -> Self {
1046 let msg = conversation.last().expect("Conversation must not be empty");
1047 if msg.role != Role::Assistant {
1048 panic!("Conversation must end with an assistant message");
1049 }
1050 let tool_use = msg
1051 .content
1052 .iter()
1053 .flat_map(|content| match content {
1054 MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1055 Some(tool_use)
1056 }
1057 _ => None,
1058 })
1059 .next()
1060 .expect("Conversation must end with an edit_file tool use")
1061 .clone();
1062
1063 let edit_file_input: EditFileToolInput =
1064 serde_json::from_value(tool_use.input.clone()).unwrap();
1065
1066 EvalInput {
1067 conversation,
1068 edit_file_input,
1069 input_content,
1070 assertion,
1071 }
1072 }
1073}
1074
/// The outcome of replaying a single eval iteration.
#[derive(Clone)]
struct EvalSample {
    // The final text of the edited buffer.
    text: String,
    // The edit agent's output, including parser metrics and the raw edits.
    edit_output: EditAgentOutput,
    // A textual diff of the edit — presumably input content vs. `text`;
    // produced outside this chunk (TODO confirm in EditAgentTest::eval).
    diff: String,
}
1081
/// Object-safe adapter for async assertion functions, letting
/// `EvalAssertion` store them behind an `Arc<dyn AssertionFn>`.
trait AssertionFn: 'static + Send + Sync {
    // Grades `sample` (optionally consulting `judge_model`), returning a
    // boxed local future since assertions run on the test executor.
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1090
/// Blanket implementation so any matching async closure can be used as an
/// assertion without writing a wrapper type by hand.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        // Box locally (not `Send`): assertions run on the single-threaded
        // test executor.
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1111
/// A cloneable, shareable assertion used to grade eval samples.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1114
1115impl EvalAssertion {
1116 fn new<F>(f: F) -> Self
1117 where
1118 F: 'static
1119 + Send
1120 + Sync
1121 + AsyncFn(
1122 &EvalSample,
1123 Arc<dyn LanguageModel>,
1124 &mut TestAppContext,
1125 ) -> Result<EvalAssertionOutcome>,
1126 {
1127 EvalAssertion(Arc::new(f))
1128 }
1129
1130 fn assert_eq(expected: impl Into<String>) -> Self {
1131 let expected = expected.into();
1132 Self::new(async move |sample, _judge, _cx| {
1133 Ok(EvalAssertionOutcome {
1134 score: if strip_empty_lines(&sample.text) == strip_empty_lines(&expected) {
1135 100
1136 } else {
1137 0
1138 },
1139 message: None,
1140 })
1141 })
1142 }
1143
1144 fn judge_diff(assertions: &'static str) -> Self {
1145 Self::new(async move |sample, judge, cx| {
1146 let prompt = DiffJudgeTemplate {
1147 diff: sample.diff.clone(),
1148 assertions,
1149 }
1150 .render(&Templates::new())
1151 .unwrap();
1152
1153 let request = LanguageModelRequest {
1154 messages: vec![LanguageModelRequestMessage {
1155 role: Role::User,
1156 content: vec![prompt.into()],
1157 cache: false,
1158 }],
1159 ..Default::default()
1160 };
1161 let mut response = judge
1162 .stream_completion_text(request, &cx.to_async())
1163 .await?;
1164 let mut output = String::new();
1165 while let Some(chunk) = response.stream.next().await {
1166 let chunk = chunk?;
1167 output.push_str(&chunk);
1168 }
1169
1170 // Parse the score from the response
1171 let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
1172 if let Some(captures) = re.captures(&output) {
1173 if let Some(score_match) = captures.get(1) {
1174 let score = score_match.as_str().parse().unwrap_or(0);
1175 return Ok(EvalAssertionOutcome {
1176 score,
1177 message: Some(output),
1178 });
1179 }
1180 }
1181
1182 anyhow::bail!("No score found in response. Raw output: {output}");
1183 })
1184 }
1185
1186 async fn run(
1187 &self,
1188 input: &EvalSample,
1189 judge_model: Arc<dyn LanguageModel>,
1190 cx: &mut TestAppContext,
1191 ) -> Result<EvalAssertionOutcome> {
1192 self.0.assert(input, judge_model, cx).await
1193 }
1194}
1195
1196fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
1197 let mut evaluated_count = 0;
1198 let mut failed_count = 0;
1199 report_progress(evaluated_count, failed_count, iterations);
1200
1201 let (tx, rx) = mpsc::channel();
1202
1203 // Cache the last message in the conversation, and run one instance of the eval so that
1204 // all the next ones are cached.
1205 eval.conversation.last_mut().unwrap().cache = true;
1206 run_eval(eval.clone(), tx.clone());
1207
1208 let executor = gpui::background_executor();
1209 for _ in 1..iterations {
1210 let eval = eval.clone();
1211 let tx = tx.clone();
1212 executor.spawn(async move { run_eval(eval, tx) }).detach();
1213 }
1214 drop(tx);
1215
1216 let mut failed_evals = HashMap::default();
1217 let mut errored_evals = HashMap::default();
1218 let mut eval_outputs = Vec::new();
1219 let mut cumulative_parser_metrics = EditParserMetrics::default();
1220 while let Ok(output) = rx.recv() {
1221 match output {
1222 Ok(output) => {
1223 cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
1224 eval_outputs.push(output.clone());
1225 if output.assertion.score < 80 {
1226 failed_count += 1;
1227 failed_evals
1228 .entry(output.sample.text.clone())
1229 .or_insert(Vec::new())
1230 .push(output);
1231 }
1232 }
1233 Err(error) => {
1234 failed_count += 1;
1235 *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
1236 }
1237 }
1238
1239 evaluated_count += 1;
1240 report_progress(evaluated_count, failed_count, iterations);
1241 }
1242
1243 let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
1244 println!("Actual pass ratio: {}\n", actual_pass_ratio);
1245 if actual_pass_ratio < expected_pass_ratio {
1246 let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
1247 errored_evals.sort_by_key(|(_, count)| Reverse(*count));
1248 for (error, count) in errored_evals {
1249 println!("Eval errored {} times. Error: {}", count, error);
1250 }
1251
1252 let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
1253 failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
1254 for (_buffer_output, failed_evals) in failed_evals {
1255 let eval_output = failed_evals.first().unwrap();
1256 println!("Eval failed {} times", failed_evals.len());
1257 println!("{}", eval_output);
1258 }
1259
1260 panic!(
1261 "Actual pass ratio: {}\nExpected pass ratio: {}",
1262 actual_pass_ratio, expected_pass_ratio
1263 );
1264 }
1265
1266 let mismatched_tag_ratio =
1267 cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
1268 if mismatched_tag_ratio > 0.05 {
1269 for eval_output in eval_outputs {
1270 println!("{}", eval_output);
1271 }
1272 panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
1273 }
1274}
1275
1276fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
1277 let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
1278 let mut cx = TestAppContext::build(dispatcher, None);
1279 let output = cx.executor().block_test(async {
1280 let test = EditAgentTest::new(&mut cx).await;
1281 test.eval(eval, &mut cx).await
1282 });
1283 tx.send(output).unwrap();
1284}
1285
/// The result of one successful eval run: the produced sample plus the
/// judge's verdict on it.
#[derive(Clone)]
struct EvalOutput {
    // Edited buffer text, unified diff, and edit/parser output for this run.
    sample: EvalSample,
    // Judge score and optional explanation message.
    assertion: EvalAssertionOutcome,
}
1291
1292impl Display for EvalOutput {
1293 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1294 writeln!(f, "Score: {:?}", self.assertion.score)?;
1295 if let Some(message) = self.assertion.message.as_ref() {
1296 writeln!(f, "Message: {}", message)?;
1297 }
1298
1299 writeln!(f, "Diff:\n{}", self.sample.diff)?;
1300
1301 writeln!(
1302 f,
1303 "Parser Metrics:\n{:#?}",
1304 self.sample.edit_output.parser_metrics
1305 )?;
1306 writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1307 Ok(())
1308 }
1309}
1310
/// Overwrites the current terminal line (`\r` plus the clear-to-end-of-line
/// escape) with a progress summary and flushes stdout so it shows immediately.
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    // Guard against division by zero before the first result arrives.
    let passed_ratio = match evaluated_count {
        0 => 0.0,
        evaluated => (evaluated - failed_count) as f64 / evaluated as f64,
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}
1326
/// Harness holding everything one eval needs: the agent under test, the
/// project whose buffers it edits, and the model used to judge the result.
struct EditAgentTest {
    // Agent whose edit/overwrite behavior is being evaluated.
    agent: EditAgent,
    // In-memory test project containing the buffers to edit.
    project: Entity<Project>,
    // Model that scores the produced sample against the eval's assertion.
    judge_model: Arc<dyn LanguageModel>,
}
1332
1333impl EditAgentTest {
1334 async fn new(cx: &mut TestAppContext) -> Self {
1335 cx.executor().allow_parking();
1336
1337 let fs = FakeFs::new(cx.executor().clone());
1338 cx.update(|cx| {
1339 settings::init(cx);
1340 gpui_tokio::init(cx);
1341 let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1342 cx.set_http_client(http_client);
1343
1344 client::init_settings(cx);
1345 let client = Client::production(cx);
1346 let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1347
1348 settings::init(cx);
1349 Project::init_settings(cx);
1350 language::init(cx);
1351 language_model::init(client.clone(), cx);
1352 language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
1353 crate::init(client.http_client(), cx);
1354 });
1355
1356 fs.insert_tree("/root", json!({})).await;
1357 let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1358 let agent_model = SelectedModel::from_str(
1359 &std::env::var("ZED_AGENT_MODEL")
1360 .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1361 )
1362 .unwrap();
1363 let judge_model = SelectedModel::from_str(
1364 &std::env::var("ZED_JUDGE_MODEL")
1365 .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1366 )
1367 .unwrap();
1368 let (agent_model, judge_model) = cx
1369 .update(|cx| {
1370 cx.spawn(async move |cx| {
1371 let agent_model = Self::load_model(&agent_model, cx).await;
1372 let judge_model = Self::load_model(&judge_model, cx).await;
1373 (agent_model.unwrap(), judge_model.unwrap())
1374 })
1375 })
1376 .await;
1377 let action_log = cx.new(|_| ActionLog::new(project.clone()));
1378
1379 Self {
1380 agent: EditAgent::new(agent_model, project.clone(), action_log, Templates::new()),
1381 project,
1382 judge_model,
1383 }
1384 }
1385
1386 async fn load_model(
1387 selected_model: &SelectedModel,
1388 cx: &mut AsyncApp,
1389 ) -> Result<Arc<dyn LanguageModel>> {
1390 let (provider, model) = cx.update(|cx| {
1391 let models = LanguageModelRegistry::read_global(cx);
1392 let model = models
1393 .available_models(cx)
1394 .find(|model| {
1395 model.provider_id() == selected_model.provider
1396 && model.id() == selected_model.model
1397 })
1398 .unwrap();
1399 let provider = models.provider(&model.provider_id()).unwrap();
1400 (provider, model)
1401 })?;
1402 cx.update(|cx| provider.authenticate(cx))?.await?;
1403 Ok(model)
1404 }
1405
1406 async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
1407 let path = self
1408 .project
1409 .read_with(cx, |project, cx| {
1410 project.find_project_path(eval.edit_file_input.path, cx)
1411 })
1412 .unwrap();
1413 let buffer = self
1414 .project
1415 .update(cx, |project, cx| project.open_buffer(path, cx))
1416 .await
1417 .unwrap();
1418 let tools = cx.update(|cx| {
1419 ToolRegistry::default_global(cx)
1420 .tools()
1421 .into_iter()
1422 .filter_map(|tool| {
1423 let input_schema = tool
1424 .input_schema(self.agent.model.tool_input_format())
1425 .ok()?;
1426 Some(LanguageModelRequestTool {
1427 name: tool.name(),
1428 description: tool.description(),
1429 input_schema,
1430 })
1431 })
1432 .collect::<Vec<_>>()
1433 });
1434 let tool_names = tools
1435 .iter()
1436 .map(|tool| tool.name.clone())
1437 .collect::<Vec<_>>();
1438 let worktrees = vec![WorktreeContext {
1439 root_name: "root".to_string(),
1440 rules_file: None,
1441 }];
1442 let prompt_builder = PromptBuilder::new(None)?;
1443 let project_context = ProjectContext::new(worktrees, Vec::default());
1444 let system_prompt = prompt_builder.generate_assistant_system_prompt(
1445 &project_context,
1446 &ModelContext {
1447 available_tools: tool_names,
1448 },
1449 )?;
1450
1451 let has_system_prompt = eval
1452 .conversation
1453 .first()
1454 .map_or(false, |msg| msg.role == Role::System);
1455 let messages = if has_system_prompt {
1456 eval.conversation
1457 } else {
1458 [LanguageModelRequestMessage {
1459 role: Role::System,
1460 content: vec![MessageContent::Text(system_prompt)],
1461 cache: true,
1462 }]
1463 .into_iter()
1464 .chain(eval.conversation)
1465 .collect::<Vec<_>>()
1466 };
1467
1468 let conversation = LanguageModelRequest {
1469 messages,
1470 tools,
1471 ..Default::default()
1472 };
1473 let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1474 if let Some(input_content) = eval.input_content.as_deref() {
1475 buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1476 }
1477 let (edit_output, _) = self.agent.edit(
1478 buffer.clone(),
1479 eval.edit_file_input.display_description,
1480 &conversation,
1481 &mut cx.to_async(),
1482 );
1483 edit_output.await?
1484 } else {
1485 let (edit_output, _) = self.agent.overwrite(
1486 buffer.clone(),
1487 eval.edit_file_input.display_description,
1488 &conversation,
1489 &mut cx.to_async(),
1490 );
1491 edit_output.await?
1492 };
1493
1494 let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1495 let sample = EvalSample {
1496 edit_output,
1497 diff: language::unified_diff(
1498 eval.input_content.as_deref().unwrap_or_default(),
1499 &buffer_text,
1500 ),
1501 text: buffer_text,
1502 };
1503 let assertion = eval
1504 .assertion
1505 .run(&sample, self.judge_model.clone(), cx)
1506 .await?;
1507
1508 Ok(EvalOutput { assertion, sample })
1509 }
1510}
1511
/// The judge's verdict for a single eval sample.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    // Score out of 100; runs scoring below 80 are counted as failures.
    score: usize,
    // Optional explanation from the judge model.
    message: Option<String>,
}
1517
/// Data rendered into the judge prompt template: the diff the agent produced
/// and the assertions it should be scored against.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}
1523
impl Template for DiffJudgeTemplate {
    // Handlebars template file used to render the judge prompt.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1527
/// Returns `text` with every blank or whitespace-only line removed, joining
/// the remaining lines with `\n` (no trailing newline).
fn strip_empty_lines(text: &str) -> String {
    let mut kept_lines = Vec::new();
    for line in text.lines() {
        if !line.trim().is_empty() {
            kept_lines.push(line);
        }
    }
    kept_lines.join("\n")
}