use super::*;
use crate::{
    ReadFileToolInput,
    edit_file_tool::{EditFileMode, EditFileToolInput},
    grep_tool::GrepToolInput,
    list_directory_tool::ListDirectoryToolInput,
};
use Role::*;
use assistant_tool::ToolRegistry;
use client::{Client, UserStore};
use collections::HashMap;
use fs::FakeFs;
use futures::{FutureExt, future::LocalBoxFuture};
use gpui::{AppContext, TestAppContext};
use indoc::{formatdoc, indoc};
use language_model::{
    LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
    LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
};
use project::Project;
use rand::prelude::*;
use reqwest_client::ReqwestClient;
use serde_json::json;
use std::{
    cmp::Reverse,
    fmt::{self, Display},
    io::Write as _,
    str::FromStr,
    sync::mpsc,
};
use util::path;

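// The evals below drive the edit agent against a live language model, so they
// only run when the crate is built with the `eval` feature. The agent and
// judge models default to Claude Sonnet and can be overridden through the
// `ZED_AGENT_MODEL` and `ZED_JUDGE_MODEL` environment variables (see
// `EditAgentTest::new`).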
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    let output_file_content = include_str!("evals/fixtures/extract_handle_command_output/after.rs");
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200,
        1.,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to Italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for the current platform and architecture
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comment out everything inside them (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_weight() {
                                // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_stretch() {
                                // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                            }

                            #[test]
                            fn test_css_to_core_text_font_stretch() {
                                // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    let input_file_path = "root/zode.py";
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        200,
        1.,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                             struct HunkStatus [L1538-1542]
                              range [L1539]
                              diff_status [L1540]
                              old_text [L1541]
                             fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}

#[test]
#[ignore] // until we figure out the mystery described in the comments
// #[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // NOTE: For some mysterious reason, I could easily reproduce this
    // issue roughly 90% of the time in actual Zed. However, once I
    // extract the exact LLM request before the failure point and
    // generate from that, the reproduction rate drops to 2%!
    //
    // Things I've tried to make sure it's not a fluke: disabling prompt
    // caching, capturing the LLM request via a proxy server, and running
    // the prompt on Claude separately from the evals. Every time the
    // outcomes were mostly good, which doesn't match my actual
    // experience in Zed.
    //
    // At some point I discovered that simply adding one insignificant
    // space or newline to the prompt suddenly reproduces the failure
    // almost perfectly.
    //
    // This weirdness happens even outside of the Zed code base and even
    // when using a different subscription. The result is the same: an
    // extra newline or space changes the model behavior enough that the
    // pass rate drops from 99% to 0-3%.
    //
    // I have no explanation for this.
    //
    // Model                          | Pass rate
    // ============================================
    //
    // --------------------------------------------
    // Prompt version: 2025-05-19
    // --------------------------------------------
    //
    // claude-3.7-sonnet              | 0.98
    //   + one extra space in prompt  | 0.00
    //   + original prompt again      | 0.99
    //   + extra newline              | 0.03
    // gemini-2.5-pro-preview-03-25   | 1.00
    // gemini-2.5-flash-preview-04-17 | 1.00
    //   + one extra space            | 1.00
    // gpt-4.1                        | 1.00
    //   + one extra space            | 1.00
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        1,
        1.0,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}

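/// Builds a request message with the given role and content parts. Caching is
/// disabled here; `eval` enables it on the conversation's final message before
/// fanning out iterations.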
fn message(
    role: Role,
    contents: impl IntoIterator<Item = MessageContent>,
) -> LanguageModelRequestMessage {
    LanguageModelRequestMessage {
        role,
        content: contents.into_iter().collect(),
        cache: false,
    }
}

fn text(text: impl Into<String>) -> MessageContent {
    MessageContent::Text(text.into())
}

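/// Returns the given half-open, zero-based range of lines from `input`, joined
/// with newlines. For example, `lines("a\nb\nc", 1..3)` returns `"b\nc"`.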
fn lines(input: &str, range: Range<usize>) -> String {
    input
        .lines()
        .skip(range.start)
        .take(range.len())
        .collect::<Vec<_>>()
        .join("\n")
}

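/// Builds a `ToolUse` content part, serializing `input` both as the pretty
/// JSON string a model would have streamed and as a structured value.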
fn tool_use(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    input: impl Serialize,
) -> MessageContent {
    MessageContent::ToolUse(LanguageModelToolUse {
        id: LanguageModelToolUseId::from(id.into()),
        name: name.into(),
        raw_input: serde_json::to_string_pretty(&input).unwrap(),
        input: serde_json::to_value(input).unwrap(),
        is_input_complete: true,
    })
}

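/// Builds a successful (non-error) text `ToolResult` content part for the tool
/// use with the given id.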
fn tool_result(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    result: impl Into<Arc<str>>,
) -> MessageContent {
    MessageContent::ToolResult(LanguageModelToolResult {
        tool_use_id: LanguageModelToolUseId::from(id.into()),
        tool_name: name.into(),
        is_error: false,
        content: LanguageModelToolResultContent::Text(result.into()),
        output: None,
    })
}

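/// A scripted conversation that ends in an `edit_file` tool use, together with
/// the initial file content and the assertion used to grade the agent's edit.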
#[derive(Clone)]
struct EvalInput {
    conversation: Vec<LanguageModelRequestMessage>,
    edit_file_input: EditFileToolInput,
    input_content: Option<String>,
    assertion: EvalAssertion,
}

impl EvalInput {
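    /// Extracts the trailing `edit_file` tool use from the conversation,
    /// panicking if the conversation doesn't end with an assistant message
    /// that contains one.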
    fn from_conversation(
        conversation: Vec<LanguageModelRequestMessage>,
        input_content: Option<String>,
        assertion: EvalAssertion,
    ) -> Self {
        let msg = conversation.last().expect("Conversation must not be empty");
        if msg.role != Role::Assistant {
            panic!("Conversation must end with an assistant message");
        }
        let tool_use = msg
            .content
            .iter()
            .find_map(|content| match content {
                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
                    Some(tool_use)
                }
                _ => None,
            })
            .expect("Conversation must end with an edit_file tool use")
            .clone();

        let edit_file_input: EditFileToolInput =
            serde_json::from_value(tool_use.input.clone()).unwrap();

        EvalInput {
            conversation,
            edit_file_input,
            input_content,
            assertion,
        }
    }
}

#[derive(Clone)]
struct EvalSample {
    text: String,
    edit_output: EditAgentOutput,
    diff: String,
}

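/// Object-safe assertion interface. The blanket impl below adapts any suitable
/// `AsyncFn` closure, which is what `EvalAssertion::new` stores.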
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}

impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}

#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);

impl EvalAssertion {
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

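    /// Scores 100 if the edited buffer matches `expected` exactly (ignoring
    /// blank lines), and 0 otherwise.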
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

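    /// Renders the sample's diff and the given assertions into the diff-judge
    /// prompt, streams the judge model's response, and parses the score out of
    /// a `<score>N</score>` tag.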
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                ..Default::default()
            };
            let mut response = judge
                .stream_completion_text(request, &cx.to_async())
                .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output) {
                if let Some(score_match) = captures.get(1) {
                    let score = score_match.as_str().parse().unwrap_or(0);
                    return Ok(EvalAssertionOutcome {
                        score,
                        message: Some(output),
                    });
                }
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}

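/// Runs the eval for `iterations` samples and panics if the pass ratio (a
/// score of at least 80 counts as a pass) falls below `expected_pass_ratio`,
/// or if more than 5% of the parsed edit tags were mismatched. For example,
/// `eval(100, 0.95, ...)` requires at least 95 of 100 samples to pass. The
/// first iteration runs alone so the remaining, concurrent iterations can
/// reuse its cached prompt prefix.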
fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval so that
    // all the next ones are cached.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    let executor = gpui::background_executor();
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        executor.spawn(async move { run_eval(eval, tx) }).detach();
    }
    drop(tx);

    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > 0.05 {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}

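/// Runs a single eval sample on a fresh `TestAppContext` and reports the
/// outcome (or error) over the channel.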
fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
    let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
    let mut cx = TestAppContext::build(dispatcher, None);
    let output = cx.executor().block_test(async {
        let test = EditAgentTest::new(&mut cx).await;
        test.eval(eval, &mut cx).await
    });
    tx.send(output).unwrap();
}

#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}

impl Display for EvalOutput {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Score: {:?}", self.assertion.score)?;
        if let Some(message) = self.assertion.message.as_ref() {
            writeln!(f, "Message: {}", message)?;
        }

        writeln!(f, "Diff:\n{}", self.sample.diff)?;

        writeln!(
            f,
            "Parser Metrics:\n{:#?}",
            self.sample.edit_output.parser_metrics
        )?;
        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
        Ok(())
    }
}

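/// Redraws a single-line progress indicator showing the running pass rate.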
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_count = evaluated_count - failed_count;
    let passed_ratio = if evaluated_count == 0 {
        0.0
    } else {
        passed_count as f64 / evaluated_count as f64
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}

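/// Harness for a single eval run: a real `EditAgent` wired up to a project on
/// a fake filesystem, plus the model used to judge the results.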
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}

impl EditAgentTest {
    async fn new(cx: &mut TestAppContext) -> Self {
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor().clone());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);

            client::init_settings(cx);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

            settings::init(cx);
            Project::init_settings(cx);
            language::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
            crate::init(client.http_client(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        Self {
            agent: EditAgent::new(agent_model, project.clone(), action_log, Templates::new()),
            project,
            judge_model,
        }
    }

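    /// Resolves a `SelectedModel` against the global model registry and
    /// authenticates its provider before returning it.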
    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        let (provider, model) = cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .unwrap();
            let provider = models.provider(&model.provider_id()).unwrap();
            (provider, model)
        })?;
        cx.update(|cx| provider.authenticate(cx))?.await?;
        Ok(model)
    }

    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();
        let conversation = LanguageModelRequest {
            messages: eval.conversation,
            tools: cx.update(|cx| {
                ToolRegistry::default_global(cx)
                    .tools()
                    .into_iter()
                    .filter_map(|tool| {
                        let input_schema = tool
                            .input_schema(self.agent.model.tool_input_format())
                            .ok()?;
                        Some(LanguageModelRequestTool {
                            name: tool.name(),
                            description: tool.description(),
                            input_schema,
                        })
                    })
                    .collect()
            }),
            ..Default::default()
        };
        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            let (edit_output, _) = self.agent.edit(
                buffer.clone(),
                eval.edit_file_input.display_description,
                &conversation,
                &mut cx.to_async(),
            );
            edit_output.await?
        } else {
            let (edit_output, _) = self.agent.overwrite(
                buffer.clone(),
                eval.edit_file_input.display_description,
                &conversation,
                &mut cx.to_async(),
            );
            edit_output.await?
        };

        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
}

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}

#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}

impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}

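/// Drops empty and whitespace-only lines so `assert_eq` comparisons aren't
/// sensitive to blank-line differences.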
fn strip_empty_lines(text: &str) -> String {
    text.lines()
        .filter(|line| !line.trim().is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}