use super::*;
use crate::{
    ReadFileToolInput,
    edit_file_tool::{EditFileMode, EditFileToolInput},
    grep_tool::GrepToolInput,
    list_directory_tool::ListDirectoryToolInput,
};
use Role::*;
use anyhow::anyhow;
use assistant_tool::ToolRegistry;
use client::{Client, UserStore};
use collections::HashMap;
use fs::FakeFs;
use futures::{FutureExt, future::LocalBoxFuture};
use gpui::{AppContext, TestAppContext};
use indoc::{formatdoc, indoc};
use language_model::{
    LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
    LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
};
use project::Project;
use rand::prelude::*;
use reqwest_client::ReqwestClient;
use serde_json::json;
use std::{
    cmp::Reverse,
    fmt::{self, Display},
    io::Write as _,
    str::FromStr,
    sync::mpsc,
};
use util::path;

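// These evals are ignored unless the `eval` feature is enabled, so they are
// typically run with something like `cargo test --features eval`. The agent
// and judge models default to Claude 3.7 Sonnet and can be overridden via the
// ZED_AGENT_MODEL and ZED_JUDGE_MODEL environment variables (see
// `EditAgentTest::new` below).
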
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    let output_file_content = include_str!("evals/fixtures/extract_handle_command_output/after.rs");
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200,
        1.,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comment everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_weight() {
                                // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_stretch() {
                                // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                            }

                            #[test]
                            fn test_css_to_core_text_font_stretch() {
                                // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    let input_file_path = "root/zode.py";
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        200,
        1.,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                                tracked_buffers [L15]
                                edited_since_project_diagnostics_check [L17]
                                project [L19]
                            impl ActionLog [L22-498]
                                pub fn new [L24-30]
                                pub fn project [L32-34]
                                pub fn checked_project_diagnostics [L37-39]
                                pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                                fn track_buffer_internal [L46-101]
                                fn handle_buffer_event [L103-116]
                                fn handle_buffer_edited [L118-123]
                                fn handle_buffer_file_changed [L125-158]
                                async fn maintain_diff [L160-264]
                                pub fn buffer_read [L267-269]
                                pub fn buffer_created [L272-276]
                                pub fn buffer_edited [L279-287]
                                pub fn will_delete_buffer [L289-304]
                                pub fn keep_edits_in_range [L306-364]
                                pub fn reject_edits_in_ranges [L366-459]
                                pub fn keep_all_edits [L461-473]
                                pub fn changed_buffers [L476-482]
                                pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                                User [L618]
                                Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                                Created [L624]
                                Modified [L625]
                                Deleted [L626]
                            struct TrackedBuffer [L629-641]
                                buffer [L630]
                                base_text [L631]
                                unreviewed_changes [L632]
                                status [L633]
                                version [L634]
                                diff [L635]
                                snapshot [L636]
                                diff_update [L637]
                                _open_lsp_handle [L638]
                                _maintain_diff [L639]
                                _subscription [L640]
                            impl TrackedBuffer [L643-657]
                                fn has_changes [L644-650]
                                fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                                pub diff [L660]
                            mod tests [L664-1574]
                                fn init_logger [L678-682]
                                fn init_test [L684-691]
                                async fn test_keep_edits [L694-769]
                                async fn test_deletions [L772-854]
                                async fn test_overlapping_user_edits [L857-951]
                                async fn test_creating_files [L954-1010]
                                async fn test_deleting_files [L1013-1120]
                                async fn test_reject_edits [L1123-1255]
                                async fn test_reject_multiple_edits [L1258-1331]
                                async fn test_reject_deleted_file [L1334-1388]
                                async fn test_reject_created_file [L1391-1443]
                                async fn test_random_diffs [L1446-1535]
                                    fn quiesce [L1510-1534]
                                struct HunkStatus [L1538-1542]
                                    range [L1539]
                                    diff_status [L1540]
                                    old_text [L1541]
                                fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}

#[test]
#[ignore] // until we figure out the mystery described in the comments
// #[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that the edit agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // NOTE: For some mysterious reason, I could easily reproduce this
    // issue roughly 90% of the time in actual Zed. However, once I
    // extract the exact LLM request before the failure point and
    // generate from that, the reproduction rate drops to 2%!
    //
    // Things I've tried to make sure it's not a fluke: disabling prompt
    // caching, capturing the LLM request via a proxy server, running the
    // prompt on Claude separately from evals. Every time it was mostly
    // giving good outcomes, which doesn't match my actual experience in
    // Zed.
    //
    // At some point I discovered that simply adding one insignificant
    // space or newline to the prompt suddenly reproduces the outcome I
    // was chasing almost perfectly.
    //
    // This weirdness happens even outside of the Zed code base and even
    // when using a different subscription. The result is the same: an
    // extra newline or space changes the model's behavior significantly
    // enough that the pass rate drops from 99% to 0-3%.
    //
    // I have no explanation for this.
    //
    // Model                            | Pass rate
    // ============================================
    //
    // --------------------------------------------
    // Prompt version: 2025-05-19
    // --------------------------------------------
    //
    // claude-3.7-sonnet                | 0.98
    //   + one extra space in prompt    | 0.00
    //   + original prompt again        | 0.99
    //   + extra newline                | 0.03
    // gemini-2.5-pro-preview-03-25     | 1.00
    // gemini-2.5-flash-preview-04-17   | 1.00
    //   + one extra space              | 1.00
    // gpt-4.1                          | 1.00
    //   + one extra space              | 1.00
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        1,
        1.0,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}

fn message(
    role: Role,
    contents: impl IntoIterator<Item = MessageContent>,
) -> LanguageModelRequestMessage {
    LanguageModelRequestMessage {
        role,
        content: contents.into_iter().collect(),
        cache: false,
    }
}

fn text(text: impl Into<String>) -> MessageContent {
    MessageContent::Text(text.into())
}

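/// Returns the zero-based, half-open `range` of lines from `input`, joined
/// with newlines. Note that the conversations above pass the same numbers to
/// `read_file` (which treats them as 1-based) and to this helper, so the
/// slices are off by one line; presumably that is close enough for these
/// evals.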
fn lines(input: &str, range: Range<usize>) -> String {
    input
        .lines()
        .skip(range.start)
        .take(range.len())
        .collect::<Vec<_>>()
        .join("\n")
}

fn tool_use(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    input: impl Serialize,
) -> MessageContent {
    MessageContent::ToolUse(LanguageModelToolUse {
        id: LanguageModelToolUseId::from(id.into()),
        name: name.into(),
        raw_input: serde_json::to_string_pretty(&input).unwrap(),
        input: serde_json::to_value(input).unwrap(),
        is_input_complete: true,
    })
}

fn tool_result(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    result: impl Into<Arc<str>>,
) -> MessageContent {
    MessageContent::ToolResult(LanguageModelToolResult {
        tool_use_id: LanguageModelToolUseId::from(id.into()),
        tool_name: name.into(),
        is_error: false,
        content: LanguageModelToolResultContent::Text(result.into()),
        output: None,
    })
}

#[derive(Clone)]
struct EvalInput {
    conversation: Vec<LanguageModelRequestMessage>,
    edit_file_input: EditFileToolInput,
    input_content: Option<String>,
    assertion: EvalAssertion,
}

impl EvalInput {
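    /// Builds an [`EvalInput`] from a recorded conversation. The conversation
    /// must end with an assistant message whose content includes an
    /// `edit_file` tool use; that tool call's input is what
    /// `EditAgentTest::eval` replays against the model.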
    fn from_conversation(
        conversation: Vec<LanguageModelRequestMessage>,
        input_content: Option<String>,
        assertion: EvalAssertion,
    ) -> Self {
        let msg = conversation.last().expect("Conversation must not be empty");
        if msg.role != Role::Assistant {
            panic!("Conversation must end with an assistant message");
        }
        let tool_use = msg
            .content
            .iter()
            .flat_map(|content| match content {
                MessageContent::ToolUse(tool_use) if tool_use.name.as_ref() == "edit_file" => {
                    Some(tool_use)
                }
                _ => None,
            })
            .next()
            .expect("Conversation must end with an edit_file tool use")
            .clone();

        let edit_file_input: EditFileToolInput =
            serde_json::from_value(tool_use.input.clone()).unwrap();

        EvalInput {
            conversation,
            edit_file_input,
            input_content,
            assertion,
        }
    }
}

#[derive(Clone)]
struct EvalSample {
    text: String,
    edit_output: EditAgentOutput,
    diff: String,
}

trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}

impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}
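
// Thanks to the blanket impl above, any async closure of the right shape can
// serve as an assertion; see `EvalAssertion::new` and the ad-hoc assertion in
// `eval_zode` for examples.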

#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);

impl EvalAssertion {
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                ..Default::default()
            };
            let mut response = judge
                .stream_completion_text(request, &cx.to_async())
                .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output) {
                if let Some(score_match) = captures.get(1) {
                    let score = score_match.as_str().parse().unwrap_or(0);
                    return Ok(EvalAssertionOutcome {
                        score,
                        message: Some(output),
                    });
                }
            }

            Err(anyhow!(
                "No score found in response. Raw output: {}",
                output
            ))
        })
    }

    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}

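// Runs `iterations` instances of the given eval scenario. The first run
// executes inline so the conversation prefix lands in the prompt cache; the
// remaining runs are spawned on the background executor and their results are
// collected over an mpsc channel. The test fails if the pass ratio falls
// below `expected_pass_ratio`, or if more than 5% of edit parser tags were
// mismatched across all samples.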
fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval inline
    // so that all subsequent runs hit the cache.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    let executor = gpui::background_executor();
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        executor.spawn(async move { run_eval(eval, tx) }).detach();
    }
    drop(tx);

    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > 0.05 {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}

fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
    let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
    let mut cx = TestAppContext::build(dispatcher, None);
    let output = cx.executor().block_test(async {
        let test = EditAgentTest::new(&mut cx).await;
        test.eval(eval, &mut cx).await
    });
    tx.send(output).unwrap();
}

#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}

impl Display for EvalOutput {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Score: {:?}", self.assertion.score)?;
        if let Some(message) = self.assertion.message.as_ref() {
            writeln!(f, "Message: {}", message)?;
        }

        writeln!(f, "Diff:\n{}", self.sample.diff)?;

        writeln!(
            f,
            "Parser Metrics:\n{:#?}",
            self.sample.edit_output.parser_metrics
        )?;
        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
        Ok(())
    }
}

fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_count = evaluated_count - failed_count;
    let passed_ratio = if evaluated_count == 0 {
        0.0
    } else {
        passed_count as f64 / evaluated_count as f64
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}

struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}

impl EditAgentTest {
    async fn new(cx: &mut TestAppContext) -> Self {
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor().clone());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);

            client::init_settings(cx);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

            settings::init(cx);
            Project::init_settings(cx);
            language::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
            crate::init(client.http_client(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        Self {
            agent: EditAgent::new(agent_model, project.clone(), action_log, Templates::new()),
            project,
            judge_model,
        }
    }

    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        let (provider, model) = cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .unwrap();
            let provider = models.provider(&model.provider_id()).unwrap();
            (provider, model)
        })?;
        cx.update(|cx| provider.authenticate(cx))?.await?;
        Ok(model)
    }

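    // Replays a recorded conversation against the live agent model: the
    // target buffer is opened (and seeded with `input_content` when editing
    // in place), `EditAgent::edit` or `EditAgent::overwrite` is invoked
    // according to the recorded tool call's mode, and the resulting text and
    // diff are handed to the eval's assertion.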
    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();
        let conversation = LanguageModelRequest {
            messages: eval.conversation,
            tools: cx.update(|cx| {
                ToolRegistry::default_global(cx)
                    .tools()
                    .into_iter()
                    .filter_map(|tool| {
                        let input_schema = tool
                            .input_schema(self.agent.model.tool_input_format())
                            .ok()?;
                        Some(LanguageModelRequestTool {
                            name: tool.name(),
                            description: tool.description(),
                            input_schema,
                        })
                    })
                    .collect()
            }),
            ..Default::default()
        };
        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            let (edit_output, _) = self.agent.edit(
                buffer.clone(),
                eval.edit_file_input.display_description,
                &conversation,
                &mut cx.to_async(),
            );
            edit_output.await?
        } else {
            let (edit_output, _) = self.agent.overwrite(
                buffer.clone(),
                eval.edit_file_input.display_description,
                &conversation,
                &mut cx.to_async(),
            );
            edit_output.await?
        };

        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
}

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}

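// Rendered through the `diff_judge.hbs` template. The resulting prompt is
// expected to make the judge reply with a `<score>N</score>` tag (presumably
// 0-100), which `EvalAssertion::judge_diff` parses back out above.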
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}

impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}

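/// Normalizes text for `EvalAssertion::assert_eq` by dropping blank lines,
/// so e.g. "a\n\nb" and "a\nb" compare equal.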
fn strip_empty_lines(text: &str) -> String {
    text.lines()
        .filter(|line| !line.trim().is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}