use super::*;
use crate::{
    ReadFileToolInput,
    edit_file_tool::{EditFileMode, EditFileToolInput},
    grep_tool::GrepToolInput,
    list_directory_tool::ListDirectoryToolInput,
};
use Role::*;
use assistant_tool::ToolRegistry;
use client::{Client, UserStore};
use collections::HashMap;
use fs::FakeFs;
use futures::{FutureExt, future::LocalBoxFuture};
use gpui::{AppContext, TestAppContext, Timer};
use indoc::{formatdoc, indoc};
use language_model::{
    LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
    LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
};
use project::Project;
use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
use rand::prelude::*;
use reqwest_client::ReqwestClient;
use serde_json::json;
use std::{
    cmp::Reverse,
    fmt::{self, Display},
    io::Write as _,
    str::FromStr,
    sync::mpsc,
};
use util::path;

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well the agent generates multiple edit hunks.
    //
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 0.98
    // gemini-2.5-pro-06-05        | 0.77
    // gemini-2.5-flash            | 0.11
    // gpt-4.1                     | 1.00

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100,
        0.7, // Taking the lower bar for Gemini
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              |
    // gemini-2.5-pro-preview-03-25   | 1.0
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to Italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.98
    // gemini-2.5-pro-preview-03-25   | 0.99
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on Windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              |
    // gemini-2.5-pro-preview-03-25   | 1.0
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comment everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              |
    // gemini-2.5-pro-preview-03-25   | 0.94
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing for now, let's see if we can bring this down.
        0.2,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0
    // gemini-2.5-pro-preview-03-25   | 1.0
    // gemini-2.5-flash-preview-04-17 | 1.0
    // gpt-4.1                        | 1.0
    let input_file_path = "root/zode.py";
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        50,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.16
    // gemini-2.5-pro-preview-03-25   | 0.35
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                              tracked_buffers [L15]
                              edited_since_project_diagnostics_check [L17]
                              project [L19]
                            impl ActionLog [L22-498]
                              pub fn new [L24-30]
                              pub fn project [L32-34]
                              pub fn checked_project_diagnostics [L37-39]
                              pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                              fn track_buffer_internal [L46-101]
                              fn handle_buffer_event [L103-116]
                              fn handle_buffer_edited [L118-123]
                              fn handle_buffer_file_changed [L125-158]
                              async fn maintain_diff [L160-264]
                              pub fn buffer_read [L267-269]
                              pub fn buffer_created [L272-276]
                              pub fn buffer_edited [L279-287]
                              pub fn will_delete_buffer [L289-304]
                              pub fn keep_edits_in_range [L306-364]
                              pub fn reject_edits_in_ranges [L366-459]
                              pub fn keep_all_edits [L461-473]
                              pub fn changed_buffers [L476-482]
                              pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                              User [L618]
                              Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                              Created [L624]
                              Modified [L625]
                              Deleted [L626]
                            struct TrackedBuffer [L629-641]
                              buffer [L630]
                              base_text [L631]
                              unreviewed_changes [L632]
                              status [L633]
                              version [L634]
                              diff [L635]
                              snapshot [L636]
                              diff_update [L637]
                              _open_lsp_handle [L638]
                              _maintain_diff [L639]
                              _subscription [L640]
                            impl TrackedBuffer [L643-657]
                              fn has_changes [L644-650]
                              fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                              pub diff [L660]
                            mod tests [L664-1574]
                              fn init_logger [L678-682]
                              fn init_test [L684-691]
                              async fn test_keep_edits [L694-769]
                              async fn test_deletions [L772-854]
                              async fn test_overlapping_user_edits [L857-951]
                              async fn test_creating_files [L954-1010]
                              async fn test_deleting_files [L1013-1120]
                              async fn test_reject_edits [L1123-1255]
                              async fn test_reject_multiple_edits [L1258-1331]
                              async fn test_reject_deleted_file [L1334-1388]
                              async fn test_reject_created_file [L1391-1443]
                              async fn test_random_diffs [L1446-1535]
                                fn quiesce [L1510-1534]
                              struct HunkStatus [L1538-1542]
                                range [L1539]
                                diff_status [L1540]
                                old_text [L1541]
                              fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that the Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Results for 2025-05-21:
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.00
    // gemini-2.5-pro-preview-03-25   | 1.00
    // gemini-2.5-flash-preview-04-17 | 1.00
    // gpt-4.1                        | 1.00
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        100,
        0.99,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior would be to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}

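/// Builds a `LanguageModelRequestMessage` with the given role and content
/// items. Messages are uncached by default; `eval` turns caching on for the
/// final message of each conversation.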
fn message(
    role: Role,
    contents: impl IntoIterator<Item = MessageContent>,
) -> LanguageModelRequestMessage {
    LanguageModelRequestMessage {
        role,
        content: contents.into_iter().collect(),
        cache: false,
    }
}

fn text(text: impl Into<String>) -> MessageContent {
    MessageContent::Text(text.into())
}

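/// Returns the zero-based, end-exclusive `range` of lines from `input`,
/// joined with newlines. For example, `lines("a\nb\nc", 0..2)` yields
/// `"a\nb"`.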
fn lines(input: &str, range: Range<usize>) -> String {
    input
        .lines()
        .skip(range.start)
        .take(range.len())
        .collect::<Vec<_>>()
        .join("\n")
}

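/// Builds a `MessageContent::ToolUse`, serializing `input` both as pretty
/// JSON (standing in for the raw text a model would produce) and as a
/// structured `serde_json::Value`.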
fn tool_use(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    input: impl Serialize,
) -> MessageContent {
    MessageContent::ToolUse(LanguageModelToolUse {
        id: LanguageModelToolUseId::from(id.into()),
        name: name.into(),
        raw_input: serde_json::to_string_pretty(&input).unwrap(),
        input: serde_json::to_value(input).unwrap(),
        is_input_complete: true,
    })
}

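/// Builds a successful (non-error) `MessageContent::ToolResult` carrying
/// plain-text content.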
fn tool_result(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    result: impl Into<Arc<str>>,
) -> MessageContent {
    MessageContent::ToolResult(LanguageModelToolResult {
        tool_use_id: LanguageModelToolUseId::from(id.into()),
        tool_name: name.into(),
        is_error: false,
        content: LanguageModelToolResultContent::Text(result.into()),
        output: None,
    })
}

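/// A scripted conversation that must end with an `edit_file` tool use, plus
/// the file's initial contents and the assertion used to score the agent's
/// edit.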
#[derive(Clone)]
struct EvalInput {
    conversation: Vec<LanguageModelRequestMessage>,
    edit_file_input: EditFileToolInput,
    input_content: Option<String>,
    assertion: EvalAssertion,
}

impl EvalInput {
    fn from_conversation(
        conversation: Vec<LanguageModelRequestMessage>,
        input_content: Option<String>,
        assertion: EvalAssertion,
    ) -> Self {
        let msg = conversation.last().expect("Conversation must not be empty");
        if msg.role != Role::Assistant {
            panic!("Conversation must end with an assistant message");
        }
        let tool_use = msg
            .content
            .iter()
            .flat_map(|content| match content {
                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
                    Some(tool_use)
                }
                _ => None,
            })
            .next()
            .expect("Conversation must end with an edit_file tool use")
            .clone();

        let edit_file_input: EditFileToolInput =
            serde_json::from_value(tool_use.input.clone()).unwrap();

        EvalInput {
            conversation,
            edit_file_input,
            input_content,
            assertion,
        }
    }
}

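/// One sampled run of an eval: the buffer text before and after the edit, the
/// agent's raw output, and a unified diff of the change.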
#[derive(Clone)]
struct EvalSample {
    text_before: String,
    text_after: String,
    edit_output: EditAgentOutput,
    diff: String,
}

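/// Object-safe wrapper around an async assertion function, letting closures
/// and the built-in assertions below share the same `EvalAssertion` type.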
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}

impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}

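/// An assertion that scores an `EvalSample` from 0 to 100, optionally
/// consulting a judge model.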
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);

impl EvalAssertion {
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                ..Default::default()
            };
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output) {
                if let Some(score_match) = captures.get(1) {
                    let score = score_match.as_str().parse().unwrap_or(0);
                    return Ok(EvalAssertionOutcome {
                        score,
                        message: Some(output),
                    });
                }
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}

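/// Runs `iterations` samples of the given eval (up to 32 concurrently),
/// panicking if the pass ratio falls below `expected_pass_ratio` or if the
/// ratio of mismatched edit tags exceeds `mismatched_tag_threshold`. A sample
/// passes when its assertion scores at least 80.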
fn eval(
    iterations: usize,
    expected_pass_ratio: f32,
    mismatched_tag_threshold: f32,
    mut eval: EvalInput,
) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Mark the last message in the conversation as cached, then run the eval
    // once so that subsequent runs hit the cache.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    let executor = gpui::background_executor();
    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        let semaphore = semaphore.clone();
        executor
            .spawn(async move {
                let _guard = semaphore.acquire().await;
                run_eval(eval, tx)
            })
            .detach();
    }
    drop(tx);

    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text_after.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > mismatched_tag_threshold {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}

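/// Runs a single eval sample on a fresh `TestAppContext`, sending the result
/// (or error) back over `tx`.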
fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
    let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
    let mut cx = TestAppContext::build(dispatcher, None);
    let output = cx.executor().block_test(async {
        let test = EditAgentTest::new(&mut cx).await;
        test.eval(eval, &mut cx).await
    });
    tx.send(output).unwrap();
}

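/// The result of one eval run: the sample that was produced and the outcome
/// of asserting on it.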
#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}

impl Display for EvalOutput {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Score: {:?}", self.assertion.score)?;
        if let Some(message) = self.assertion.message.as_ref() {
            writeln!(f, "Message: {}", message)?;
        }

        writeln!(f, "Diff:\n{}", self.sample.diff)?;

        writeln!(
            f,
            "Parser Metrics:\n{:#?}",
            self.sample.edit_output.parser_metrics
        )?;
        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
        Ok(())
    }
}

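/// Rewrites the current terminal line with the number of evals completed so
/// far and the running pass percentage.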
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_count = evaluated_count - failed_count;
    let passed_ratio = if evaluated_count == 0 {
        0.0
    } else {
        passed_count as f64 / evaluated_count as f64
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}

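/// Harness that connects to real language model providers (selected via the
/// `ZED_AGENT_MODEL` and `ZED_JUDGE_MODEL` environment variables) so the edit
/// agent can be exercised end to end.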
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}

impl EditAgentTest {
    async fn new(cx: &mut TestAppContext) -> Self {
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor().clone());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);

            client::init_settings(cx);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

            settings::init(cx);
            Project::init_settings(cx);
            language::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
            crate::init(client.http_client(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        Self {
            agent: EditAgent::new(agent_model, project.clone(), action_log, Templates::new()),
            project,
            judge_model,
        }
    }

    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        let (provider, model) = cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .expect("Model not found");
            let provider = models.provider(&model.provider_id()).unwrap();
            (provider, model)
        })?;
        cx.update(|cx| provider.authenticate(cx))?.await?;
        Ok(model)
    }

    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();
        let tools = cx.update(|cx| {
            ToolRegistry::default_global(cx)
                .tools()
                .into_iter()
                .filter_map(|tool| {
                    let input_schema = tool
                        .input_schema(self.agent.model.tool_input_format())
                        .ok()?;
                    Some(LanguageModelRequestTool {
                        name: tool.name(),
                        description: tool.description(),
                        input_schema,
                    })
                })
                .collect::<Vec<_>>()
        });
        let tool_names = tools
            .iter()
            .map(|tool| tool.name.clone())
            .collect::<Vec<_>>();
        let worktrees = vec![WorktreeContext {
            root_name: "root".to_string(),
            rules_file: None,
        }];
        let prompt_builder = PromptBuilder::new(None)?;
        let project_context = ProjectContext::new(worktrees, Vec::default());
        let system_prompt = prompt_builder.generate_assistant_system_prompt(
            &project_context,
            &ModelContext {
                available_tools: tool_names,
            },
        )?;

        let has_system_prompt = eval
            .conversation
            .first()
            .map_or(false, |msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let conversation = LanguageModelRequest {
            messages,
            tools,
            ..Default::default()
        };

        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            retry_on_rate_limit(async || {
                self.agent
                    .edit(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        } else {
            retry_on_rate_limit(async || {
                self.agent
                    .overwrite(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        };

        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
}

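/// Retries `request` indefinitely whenever the model reports a rate limit,
/// sleeping for the duration the provider requests; any other error is
/// returned immediately.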
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
    loop {
        match request().await {
            Ok(result) => return Ok(result),
            Err(err) => match err.downcast::<LanguageModelCompletionError>() {
                Ok(err) => match err {
                    LanguageModelCompletionError::RateLimit(duration) => {
                        // Wait until after we are allowed to try again
                        eprintln!("Rate limit exceeded. Waiting for {duration:?}...");
                        Timer::after(duration).await;
                        continue;
                    }
                    _ => return Err(err.into()),
                },
                Err(err) => return Err(err),
            },
        }
    }
}

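/// The score (0-100) an assertion produced for a sample, along with an
/// optional explanation (e.g. the judge model's raw response).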
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}

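/// Input for the `diff_judge.hbs` prompt template. `judge_diff` expects the
/// rendered prompt to elicit a reply containing a `<score>...</score>` tag.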
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}

impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}

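/// Removes empty and whitespace-only lines so comparisons are insensitive to
/// blank-line differences; e.g. `strip_empty_lines("a\n\nb")` yields
/// `"a\nb"`.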
fn strip_empty_lines(text: &str) -> String {
    text.lines()
        .filter(|line| !line.trim().is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}