use super::*;
use crate::{
    ReadFileToolInput,
    edit_file_tool::{EditFileMode, EditFileToolInput},
    grep_tool::GrepToolInput,
    list_directory_tool::ListDirectoryToolInput,
};
use Role::*;
use assistant_tool::ToolRegistry;
use client::{Client, UserStore};
use collections::HashMap;
use fs::FakeFs;
use futures::{FutureExt, future::LocalBoxFuture};
use gpui::{AppContext, TestAppContext, Timer};
use indoc::{formatdoc, indoc};
use language_model::{
    LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
    LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
};
use project::Project;
use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
use rand::prelude::*;
use reqwest_client::ReqwestClient;
use serde_json::json;
use std::{
    cmp::Reverse,
    fmt::{self, Display},
    io::Write as _,
    str::FromStr,
    sync::mpsc,
};
use util::path;

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Tests how well the agent generates multiple edit hunks.
    //
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 0.98
    // gemini-2.5-pro-06-05        | 0.77
    // gemini-2.5-flash            | 0.11
    // gpt-4.1                     | 1.00

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100,
        0.7, // Taking the lower bar for Gemini
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              |
    // gemini-2.5-pro-preview-03-25   | 1.0
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to Italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.98
    // gemini-2.5-pro-preview-03-25   | 0.99
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on Windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for the current platform and architecture
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              |
    // gemini-2.5-pro-preview-03-25   | 1.0
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comment out everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-4.0-sonnet              | 0.99
    // claude-3.7-sonnet              | 0.88
    // gemini-2.5-pro-preview-03-25   | 0.96
    // gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing for now; let's see if we can bring this down.
        0.2,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0
    // gemini-2.5-pro-preview-03-25   | 1.0
    // gemini-2.5-flash-preview-04-17 | 1.0
    // gpt-4.1                        | 1.0
    let input_file_path = "root/zode.py";
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        50,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove the trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.16
    // gemini-2.5-pro-preview-03-25   | 0.35
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                            struct HunkStatus [L1538-1542]
                             range [L1539]
                             diff_status [L1540]
                             old_text [L1541]
                            fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Checks that the EditAgent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Results for 2025-05-21:
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.00
    // gemini-2.5-pro-preview-03-25   | 1.00
    // gemini-2.5-flash-preview-04-17 | 1.00
    // gpt-4.1                        | 1.00
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        100,
        0.99,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}

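/// Builds a [`LanguageModelRequestMessage`] with the given role and contents.
/// Caching is left off; the eval harness enables it on the final message.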
fn message(
    role: Role,
    contents: impl IntoIterator<Item = MessageContent>,
) -> LanguageModelRequestMessage {
    LanguageModelRequestMessage {
        role,
        content: contents.into_iter().collect(),
        cache: false,
    }
}

fn text(text: impl Into<String>) -> MessageContent {
    MessageContent::Text(text.into())
}

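/// Extracts the half-open `range` of zero-based lines from `input`, joined with newlines.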
fn lines(input: &str, range: Range<usize>) -> String {
    input
        .lines()
        .skip(range.start)
        .take(range.len())
        .collect::<Vec<_>>()
        .join("\n")
}

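/// Builds a completed tool-use message part, serializing `input` both as
/// pretty-printed raw JSON and as a structured JSON value.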
fn tool_use(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    input: impl Serialize,
) -> MessageContent {
    MessageContent::ToolUse(LanguageModelToolUse {
        id: LanguageModelToolUseId::from(id.into()),
        name: name.into(),
        raw_input: serde_json::to_string_pretty(&input).unwrap(),
        input: serde_json::to_value(input).unwrap(),
        is_input_complete: true,
    })
}

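/// Builds a successful (non-error) text tool result for the given tool-use id.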
fn tool_result(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    result: impl Into<Arc<str>>,
) -> MessageContent {
    MessageContent::ToolResult(LanguageModelToolResult {
        tool_use_id: LanguageModelToolUseId::from(id.into()),
        tool_name: name.into(),
        is_error: false,
        content: LanguageModelToolResultContent::Text(result.into()),
        output: None,
    })
}

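/// A recorded conversation that must end in an `edit_file` tool use, together
/// with the initial file content and the assertion used to grade the edit.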
#[derive(Clone)]
struct EvalInput {
    conversation: Vec<LanguageModelRequestMessage>,
    edit_file_input: EditFileToolInput,
    input_content: Option<String>,
    assertion: EvalAssertion,
}

impl EvalInput {
    fn from_conversation(
        conversation: Vec<LanguageModelRequestMessage>,
        input_content: Option<String>,
        assertion: EvalAssertion,
    ) -> Self {
        let msg = conversation.last().expect("Conversation must not be empty");
        if msg.role != Role::Assistant {
            panic!("Conversation must end with an assistant message");
        }
        let tool_use = msg
            .content
            .iter()
            .flat_map(|content| match content {
                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
                    Some(tool_use)
                }
                _ => None,
            })
            .next()
            .expect("Conversation must end with an edit_file tool use")
            .clone();

        let edit_file_input: EditFileToolInput =
            serde_json::from_value(tool_use.input.clone()).unwrap();

        EvalInput {
            conversation,
            edit_file_input,
            input_content,
            assertion,
        }
    }
}

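/// The outcome of replaying a single eval: the buffer text before and after
/// the edit, the agent's raw output, and a unified diff between the two.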
#[derive(Clone)]
struct EvalSample {
    text_before: String,
    text_after: String,
    edit_output: EditAgentOutput,
    diff: String,
}

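/// Object-safe async assertion over an [`EvalSample`], implemented below for
/// any compatible async closure.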
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}

impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}

#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);

impl EvalAssertion {
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                ..Default::default()
            };
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response.
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output) {
                if let Some(score_match) = captures.get(1) {
                    let score = score_match.as_str().parse().unwrap_or(0);
                    return Ok(EvalAssertionOutcome {
                        score,
                        message: Some(output),
                    });
                }
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}

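/// Runs `eval` up to `iterations` times (at most 32 concurrently) and panics
/// if the pass ratio drops below `expected_pass_ratio` or the ratio of
/// mismatched edit tags exceeds `mismatched_tag_threshold`.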
fn eval(
    iterations: usize,
    expected_pass_ratio: f32,
    mismatched_tag_threshold: f32,
    mut eval: EvalInput,
) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Mark the last message in the conversation as cached, and run one instance
    // of the eval first so that all subsequent runs hit the cache.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    let executor = gpui::background_executor();
    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        let semaphore = semaphore.clone();
        executor
            .spawn(async move {
                let _guard = semaphore.acquire().await;
                run_eval(eval, tx)
            })
            .detach();
    }
    drop(tx);

    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text_after.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > mismatched_tag_threshold {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}

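/// Replays a single eval on a fresh `TestAppContext`, reporting the result over `tx`.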
fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
    let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
    let mut cx = TestAppContext::build(dispatcher, None);
    let output = cx.executor().block_test(async {
        let test = EditAgentTest::new(&mut cx).await;
        test.eval(eval, &mut cx).await
    });
    tx.send(output).unwrap();
}

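/// A graded sample; `Display` prints the score, judge message, diff, parser
/// metrics, and raw edits.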
#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}

impl Display for EvalOutput {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Score: {:?}", self.assertion.score)?;
        if let Some(message) = self.assertion.message.as_ref() {
            writeln!(f, "Message: {}", message)?;
        }

        writeln!(f, "Diff:\n{}", self.sample.diff)?;

        writeln!(
            f,
            "Parser Metrics:\n{:#?}",
            self.sample.edit_output.parser_metrics
        )?;
        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
        Ok(())
    }
}

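/// Rewrites the current terminal line with a running pass-rate summary.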
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_count = evaluated_count - failed_count;
    let passed_ratio = if evaluated_count == 0 {
        0.0
    } else {
        passed_count as f64 / evaluated_count as f64
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}

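/// Test harness wiring an [`EditAgent`] to a fake project. The agent and judge
/// models are selected via the `ZED_AGENT_MODEL` and `ZED_JUDGE_MODEL`
/// environment variables, defaulting to `anthropic/claude-3-7-sonnet-latest`.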
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}

impl EditAgentTest {
    async fn new(cx: &mut TestAppContext) -> Self {
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor().clone());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);

            client::init_settings(cx);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

            Project::init_settings(cx);
            language::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
            crate::init(client.http_client(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        Self {
            agent: EditAgent::new(agent_model, project.clone(), action_log, Templates::new()),
            project,
            judge_model,
        }
    }

    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        let (provider, model) = cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .expect("Model not found");
            let provider = models.provider(&model.provider_id()).unwrap();
            (provider, model)
        })?;
        cx.update(|cx| provider.authenticate(cx))?.await?;
        Ok(model)
    }

    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();
        let tools = cx.update(|cx| {
            ToolRegistry::default_global(cx)
                .tools()
                .into_iter()
                .filter_map(|tool| {
                    let input_schema = tool
                        .input_schema(self.agent.model.tool_input_format())
                        .ok()?;
                    Some(LanguageModelRequestTool {
                        name: tool.name(),
                        description: tool.description(),
                        input_schema,
                    })
                })
                .collect::<Vec<_>>()
        });
        let tool_names = tools
            .iter()
            .map(|tool| tool.name.clone())
            .collect::<Vec<_>>();
        let worktrees = vec![WorktreeContext {
            root_name: "root".to_string(),
            rules_file: None,
        }];
        let prompt_builder = PromptBuilder::new(None)?;
        let project_context = ProjectContext::new(worktrees, Vec::default());
        let system_prompt = prompt_builder.generate_assistant_system_prompt(
            &project_context,
            &ModelContext {
                available_tools: tool_names,
            },
        )?;

        let has_system_prompt = eval
            .conversation
            .first()
            .map_or(false, |msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let conversation = LanguageModelRequest {
            messages,
            tools,
            ..Default::default()
        };

        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            retry_on_rate_limit(async || {
                self.agent
                    .edit(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        } else {
            retry_on_rate_limit(async || {
                self.agent
                    .overwrite(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        };

        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
}

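/// Retries `request` indefinitely on rate-limit errors, sleeping for the
/// server-supplied duration plus up to 50% jitter; any other error propagates.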
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
    let mut attempt = 0;
    loop {
        attempt += 1;
        match request().await {
            Ok(result) => return Ok(result),
            Err(err) => match err.downcast::<LanguageModelCompletionError>() {
                Ok(err) => match err {
                    LanguageModelCompletionError::RateLimit(duration) => {
                        // Wait for the duration supplied, with some jitter to avoid all
                        // requests being made at the same time.
                        let jitter = duration.mul_f64(rand::thread_rng().gen_range(0.0..0.5));
                        eprintln!(
                            "Attempt #{attempt}: Rate limit exceeded. Retry after {duration:?} + jitter of {jitter:?}"
                        );
                        Timer::after(duration + jitter).await;
                        continue;
                    }
                    _ => return Err(err.into()),
                },
                Err(err) => return Err(err),
            },
        }
    }
}

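/// The result of one assertion: a score from 0 to 100 (below 80 counts as a
/// failure) and an optional message from the judge.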
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}

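/// Renders the `diff_judge.hbs` prompt that asks the judge model to score a
/// diff against a set of assertions.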
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}

impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}

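/// Normalizes text for comparison by dropping empty and whitespace-only lines.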
fn strip_empty_lines(text: &str) -> String {
    text.lines()
        .filter(|line| !line.trim().is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}