use super::*;
use crate::{
    ReadFileToolInput,
    edit_file_tool::{EditFileMode, EditFileToolInput},
    grep_tool::GrepToolInput,
    list_directory_tool::ListDirectoryToolInput,
};
use Role::*;
use assistant_tool::ToolRegistry;
use client::{Client, UserStore};
use collections::HashMap;
use fs::FakeFs;
use futures::{FutureExt, future::LocalBoxFuture};
use gpui::{AppContext, TestAppContext, Timer};
use indoc::{formatdoc, indoc};
use language_model::{
    LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
    LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
};
use project::Project;
use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
use rand::prelude::*;
use reqwest_client::ReqwestClient;
use serde_json::json;
use std::{
    cmp::Reverse,
    fmt::{self, Display},
    io::Write as _,
    path::Path,
    str::FromStr,
    sync::mpsc,
    time::Duration,
};
use util::path;

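// These evals are ignored unless the crate is compiled with the `eval`
// feature (see the `cfg_attr` on each test). Each one replays a recorded
// conversation against a live model; the agent and judge models default to
// Claude 3.7 Sonnet and can be overridden via the `ZED_AGENT_MODEL` and
// `ZED_JUDGE_MODEL` environment variables (see `EditAgentTest::new`).
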
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well the agent generates multiple edit hunks.
    //
    // Model                | Pass rate
    // ---------------------|------------------
    // claude-3.7-sonnet    | 0.99 (2025-06-14)
    // claude-sonnet-4      | 0.97 (2025-06-14)
    // gemini-2.5-pro-06-05 | 0.98 (2025-06-16)
    // gemini-2.5-flash     | 0.11 (2025-05-22)
    // gpt-4.1              | 1.00 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Model                | Pass rate
    // ---------------------|------------------
    // claude-3.7-sonnet    | 1.0 (2025-06-14)
    // claude-sonnet-4      | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05 | 1.0 (2025-06-16)
    // gemini-2.5-flash     |
    // gpt-4.1              |
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to Italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.96 (2025-06-14)
    // claude-sonnet-4                | 0.11 (2025-06-14)
    // gemini-2.5-pro-preview-latest  | 0.99 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for the current platform and architecture
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.99 (2025-06-14)
    // claude-sonnet-4                | 0.85 (2025-06-14)
    // gemini-2.5-pro-preview-latest  | 0.97 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comment out everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    // Model                        | Date       | Pass rate
    // =========================================================
    // claude-4.0-sonnet            | 2025-06-14 | 0.99
    // claude-3.7-sonnet            | 2025-06-14 | 0.88
    // gemini-2.5-pro-preview-06-05 | 2025-06-16 | 0.98
    // gpt-4.1                      |            |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing the threshold for now; let's see if we can bring it down.
        0.25,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 | 1.0 (2025-05-22)
    // gpt-4.1                        | 1.0 (2025-05-22)
    let input_file_path = "root/zode.py";
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        50,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.65 (2025-06-14)
    // claude-sonnet-4                | 0.07 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 0.35 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                                tracked_buffers [L15]
                                edited_since_project_diagnostics_check [L17]
                                project [L19]
                            impl ActionLog [L22-498]
                                pub fn new [L24-30]
                                pub fn project [L32-34]
                                pub fn checked_project_diagnostics [L37-39]
                                pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                                fn track_buffer_internal [L46-101]
                                fn handle_buffer_event [L103-116]
                                fn handle_buffer_edited [L118-123]
                                fn handle_buffer_file_changed [L125-158]
                                async fn maintain_diff [L160-264]
                                pub fn buffer_read [L267-269]
                                pub fn buffer_created [L272-276]
                                pub fn buffer_edited [L279-287]
                                pub fn will_delete_buffer [L289-304]
                                pub fn keep_edits_in_range [L306-364]
                                pub fn reject_edits_in_ranges [L366-459]
                                pub fn keep_all_edits [L461-473]
                                pub fn changed_buffers [L476-482]
                                pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                                User [L618]
                                Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                                Created [L624]
                                Modified [L625]
                                Deleted [L626]
                            struct TrackedBuffer [L629-641]
                                buffer [L630]
                                base_text [L631]
                                unreviewed_changes [L632]
                                status [L633]
                                version [L634]
                                diff [L635]
                                snapshot [L636]
                                diff_update [L637]
                                _open_lsp_handle [L638]
                                _maintain_diff [L639]
                                _subscription [L640]
                            impl TrackedBuffer [L643-657]
                                fn has_changes [L644-650]
                                fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                                pub diff [L660]
                            mod tests [L664-1574]
                                fn init_logger [L678-682]
                                fn init_test [L684-691]
                                async fn test_keep_edits [L694-769]
                                async fn test_deletions [L772-854]
                                async fn test_overlapping_user_edits [L857-951]
                                async fn test_creating_files [L954-1010]
                                async fn test_deleting_files [L1013-1120]
                                async fn test_reject_edits [L1123-1255]
                                async fn test_reject_multiple_edits [L1258-1331]
                                async fn test_reject_deleted_file [L1334-1388]
                                async fn test_reject_created_file [L1391-1443]
                                async fn test_random_diffs [L1446-1535]
                                    fn quiesce [L1510-1534]
                                struct HunkStatus [L1538-1542]
                                    range [L1539]
                                    diff_status [L1540]
                                    old_text [L1541]
                                fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that the edit agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.00 (2025-06-14)
    // claude-sonnet-4                | 1.00 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.00 (2025-05-21)
    // gemini-2.5-flash-preview-04-17 | 1.00 (2025-05-21)
    // gpt-4.1                        | 1.00 (2025-05-21)
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        100,
        0.99,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}

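/// Builds a request message with the given role and content parts.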
fn message(
    role: Role,
    contents: impl IntoIterator<Item = MessageContent>,
) -> LanguageModelRequestMessage {
    LanguageModelRequestMessage {
        role,
        content: contents.into_iter().collect(),
        cache: false,
    }
}

fn text(text: impl Into<String>) -> MessageContent {
    MessageContent::Text(text.into())
}

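/// Returns the given zero-based, end-exclusive `range` of lines from `input`,
/// joined with newlines.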
fn lines(input: &str, range: Range<usize>) -> String {
    input
        .lines()
        .skip(range.start)
        .take(range.len())
        .collect::<Vec<_>>()
        .join("\n")
}

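/// Builds a tool-use content part, recording `input` both as pretty-printed
/// JSON text and as a structured value.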
fn tool_use(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    input: impl Serialize,
) -> MessageContent {
    MessageContent::ToolUse(LanguageModelToolUse {
        id: LanguageModelToolUseId::from(id.into()),
        name: name.into(),
        raw_input: serde_json::to_string_pretty(&input).unwrap(),
        input: serde_json::to_value(input).unwrap(),
        is_input_complete: true,
    })
}

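/// Builds a successful (non-error) text tool result for the given tool use.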
fn tool_result(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    result: impl Into<Arc<str>>,
) -> MessageContent {
    MessageContent::ToolResult(LanguageModelToolResult {
        tool_use_id: LanguageModelToolUseId::from(id.into()),
        tool_name: name.into(),
        is_error: false,
        content: LanguageModelToolResultContent::Text(result.into()),
        output: None,
    })
}

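/// A single eval case: a recorded conversation that must end in an
/// `edit_file` tool use, the initial file content (if any), and the
/// assertion used to score the edited buffer.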
#[derive(Clone)]
struct EvalInput {
    conversation: Vec<LanguageModelRequestMessage>,
    edit_file_input: EditFileToolInput,
    input_content: Option<String>,
    assertion: EvalAssertion,
}

impl EvalInput {
    fn from_conversation(
        conversation: Vec<LanguageModelRequestMessage>,
        input_content: Option<String>,
        assertion: EvalAssertion,
    ) -> Self {
        let msg = conversation.last().expect("Conversation must not be empty");
        if msg.role != Role::Assistant {
            panic!("Conversation must end with an assistant message");
        }
        let tool_use = msg
            .content
            .iter()
            .find_map(|content| match content {
                MessageContent::ToolUse(tool_use) if tool_use.name.as_ref() == "edit_file" => {
                    Some(tool_use)
                }
                _ => None,
            })
            .expect("Conversation must end with an edit_file tool use")
            .clone();

        let edit_file_input: EditFileToolInput =
            serde_json::from_value(tool_use.input.clone()).unwrap();

        EvalInput {
            conversation,
            edit_file_input,
            input_content,
            assertion,
        }
    }
}

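/// The result of replaying one eval: the buffer text before and after the
/// edit, the agent's raw output, and a unified diff between the two.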
#[derive(Clone)]
struct EvalSample {
    text_before: String,
    text_after: String,
    edit_output: EditAgentOutput,
    diff: String,
}

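/// Object-safe wrapper around an async assertion function, allowing
/// assertions to be stored in `EvalAssertion` behind a trait object.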
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}

impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}

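/// A scoring strategy for an eval sample. Scores range from 0 to 100;
/// `eval` counts anything below 80 as a failure.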
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);

impl EvalAssertion {
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                ..Default::default()
            };
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output) {
                if let Some(score_match) = captures.get(1) {
                    let score = score_match.as_str().parse().unwrap_or(0);
                    return Ok(EvalAssertionOutcome {
                        score,
                        message: Some(output),
                    });
                }
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}

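/// Runs `eval` `iterations` times (up to 32 iterations concurrently) and
/// panics if the observed pass ratio falls below `expected_pass_ratio`, or if
/// the ratio of mismatched edit-parser tags exceeds
/// `mismatched_tag_threshold`.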
fn eval(
    iterations: usize,
    expected_pass_ratio: f32,
    mismatched_tag_threshold: f32,
    mut eval: EvalInput,
) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Mark the last message in the conversation as cached, and run one eval
    // up front so that all subsequent iterations hit the cache.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    let executor = gpui::background_executor();
    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        let semaphore = semaphore.clone();
        executor
            .spawn(async move {
                let _guard = semaphore.acquire().await;
                run_eval(eval, tx)
            })
            .detach();
    }
    drop(tx);

    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text_after.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > mismatched_tag_threshold {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}

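/// Runs a single eval iteration on a fresh test app context and reports the
/// outcome over `tx`.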
fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
    let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
    let mut cx = TestAppContext::build(dispatcher, None);
    let output = cx.executor().block_test(async {
        let test = EditAgentTest::new(&mut cx).await;
        test.eval(eval, &mut cx).await
    });
    tx.send(output).unwrap();
}

#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}

impl Display for EvalOutput {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Score: {:?}", self.assertion.score)?;
        if let Some(message) = self.assertion.message.as_ref() {
            writeln!(f, "Message: {}", message)?;
        }

        writeln!(f, "Diff:\n{}", self.sample.diff)?;

        writeln!(
            f,
            "Parser Metrics:\n{:#?}",
            self.sample.edit_output.parser_metrics
        )?;
        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
        Ok(())
    }
}

fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_count = evaluated_count - failed_count;
    let passed_ratio = if evaluated_count == 0 {
        0.0
    } else {
        passed_count as f64 / evaluated_count as f64
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}

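/// Test harness holding the edit agent under test, the fake project it edits,
/// and the judge model used by LLM-based assertions.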
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}

impl EditAgentTest {
    async fn new(cx: &mut TestAppContext) -> Self {
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor().clone());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);

            client::init_settings(cx);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

            Project::init_settings(cx);
            language::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store.clone(), client.clone(), cx);
            crate::init(client.http_client(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();

        Self {
            agent: EditAgent::new(
                agent_model,
                project.clone(),
                action_log,
                Templates::new(),
                edit_format,
            ),
            project,
            judge_model,
        }
    }

    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        let (provider, model) = cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .expect("Model not found");
            let provider = models.provider(&model.provider_id()).unwrap();
            (provider, model)
        })?;
        cx.update(|cx| provider.authenticate(cx))?.await?;
        Ok(model)
    }

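    /// Replays the recorded conversation, applies the trailing `edit_file`
    /// tool call via the edit agent, and scores the resulting buffer with the
    /// eval's assertion.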
    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();
        let tools = cx.update(|cx| {
            ToolRegistry::default_global(cx)
                .tools()
                .into_iter()
                .filter_map(|tool| {
                    let input_schema = tool
                        .input_schema(self.agent.model.tool_input_format())
                        .ok()?;
                    Some(LanguageModelRequestTool {
                        name: tool.name(),
                        description: tool.description(),
                        input_schema,
                    })
                })
                .collect::<Vec<_>>()
        });
        let tool_names = tools
            .iter()
            .map(|tool| tool.name.clone())
            .collect::<Vec<_>>();
        let worktrees = vec![WorktreeContext {
            root_name: "root".to_string(),
            abs_path: Path::new("/path/to/root").into(),
            rules_file: None,
        }];
        let prompt_builder = PromptBuilder::new(None)?;
        let project_context = ProjectContext::new(worktrees, Vec::default());
        let system_prompt = prompt_builder.generate_assistant_system_prompt(
            &project_context,
            &ModelContext {
                available_tools: tool_names,
            },
        )?;

        let has_system_prompt = eval
            .conversation
            .first()
            .map_or(false, |msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let conversation = LanguageModelRequest {
            messages,
            tools,
            ..Default::default()
        };

        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            retry_on_rate_limit(async || {
                self.agent
                    .edit(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        } else {
            retry_on_rate_limit(async || {
                self.agent
                    .overwrite(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        };

        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
}

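/// Retries `request` whenever it fails with a rate-limit or server-overloaded
/// error, waiting for the suggested `retry_after` (default 5s) plus random
/// jitter; any other error is returned immediately.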
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
    let mut attempt = 0;
    loop {
        attempt += 1;
        match request().await {
            Ok(result) => return Ok(result),
            Err(err) => match err.downcast::<LanguageModelCompletionError>() {
                Ok(err) => match &err {
                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
                        let retry_after = retry_after.unwrap_or(Duration::from_secs(5));
                        // Wait for the duration supplied, with some jitter to
                        // avoid all requests being made at the same time.
                        let jitter = retry_after.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
                        eprintln!(
                            "Attempt #{attempt}: {err}. Retry after {retry_after:?} + jitter of {jitter:?}"
                        );
                        Timer::after(retry_after + jitter).await;
                        continue;
                    }
                    _ => return Err(err.into()),
                },
                Err(err) => return Err(err),
            },
        }
    }
}

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}

#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}

impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}

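/// Removes empty and whitespace-only lines so that comparisons ignore
/// incidental blank-line differences.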
fn strip_empty_lines(text: &str) -> String {
    text.lines()
        .filter(|line| !line.trim().is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}