1use super::*;
2use crate::{
3 ReadFileToolInput,
4 edit_file_tool::{EditFileMode, EditFileToolInput},
5 grep_tool::GrepToolInput,
6 list_directory_tool::ListDirectoryToolInput,
7};
8use Role::*;
9use assistant_tool::ToolRegistry;
10use client::{Client, UserStore};
11use collections::HashMap;
12use fs::FakeFs;
13use futures::{FutureExt, future::LocalBoxFuture};
14use gpui::{AppContext, TestAppContext, Timer};
15use indoc::{formatdoc, indoc};
16use language_model::{
17 LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
18 LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
19};
20use project::Project;
21use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
22use rand::prelude::*;
23use reqwest_client::ReqwestClient;
24use serde_json::json;
25use std::{
26 cmp::Reverse,
27 fmt::{self, Display},
28 io::Write as _,
29 path::Path,
30 str::FromStr,
31 sync::mpsc,
32};
33use util::path;
34
/// Eval: extract a `handle_command_output` method out of `run_git_blame`,
/// exercising the agent's ability to produce multiple related edit hunks.
/// Passes when the result matches any of several acceptable diffs.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 0.99 (2025-06-14)
    // claude-sonnet-4             | 0.97 (2025-06-14)
    // gemini-2.5-pro-06-05        | 0.98 (2025-06-16)
    // gemini-2.5-flash            | 0.11 (2025-05-22)
    // gpt-4.1                     | 1.00 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // The extraction can be formatted in several equivalent ways, so the
    // assertion accepts any one of these diffs.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        0.05, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
111
/// Eval: delete the `run_git_blame` function (and only that function),
/// asserting the resulting file is exactly equal to the expected fixture.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 1.0  (2025-06-14)
    // claude-sonnet-4             | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05        | 1.0  (2025-06-16)
    // gemini-2.5-flash            |
    // gpt-4.1                     |
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        0.05, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}
173
/// Eval: translate every doc comment in the fixture to Italian, scored by an
/// LLM judge on the resulting diff rather than an exact-match comparison.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200,  // iterations
        1.,   // expected pass ratio
        0.05, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}
235
/// Eval: rewrite `compile_parser_to_wasm` to use wasi-sdk instead of
/// emscripten. The scripted conversation reads the relevant region of the
/// file in three chunked `read_file` calls before the final edit, and the
/// result is scored by an LLM judge on the diff.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.96 (2025-06-14)
    // claude-sonnet-4                | 0.11 (2025-06-14)
    // gemini-2.5-pro-preview-latest  | 0.99 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        0.05, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                // NOTE(review): the asset list below repeats the two linux
                // entries; presumably unintentional, but it is part of the
                // prompt the pass rates were measured against — confirm
                // before changing.
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}
360
/// Eval: comment out BlinkManager interactions. The scripted grep result
/// stitches together several regions of the fixture to simulate a large,
/// fragmented search result before the edit request arrives.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.99 (2025-06-14)
    // claude-sonnet-4                | 0.85 (2025-06-14)
    // gemini-2.5-pro-preview-latest  | 0.97 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        0.05, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        // Join disjoint slices of the fixture so the result
                        // looks like multiple separate grep matches.
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
        ),
    );
}
443
/// Eval: add a `from_pixels` constructor to `Canvas` plus tests for it, after
/// a scripted research phase (two failed greps for a tests module, then a
/// successful grep for `#[test]` across the crate).
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    // Model                        | Date       | Pass rate
    // =========================================================
    // claude-4.0-sonnet            | 2025-06-14 | 0.99
    // claude-3.7-sonnet            | 2025-06-14 | 0.88
    // gemini-2.5-pro-preview-06-05 | 2025-06-16 | 0.98
    // gpt-4.1                      |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,  // iterations
        0.95, // expected pass ratio
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing for now, let's see if we can bring this down.
        0.25, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}
652
/// Eval: create a new Python CLI script from scratch (Create mode). The
/// custom assertion only checks that the created file does not start with
/// whitespace, a backtick, or a newline — i.e. the agent didn't leak
/// markdown fences or leading chatter into the file.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 | 1.0 (2025-05-22)
    // gpt-4.1                        | 1.0 (2025-05-22)
    let input_file_path = "root/zode.py";
    // No pre-existing content: the file is being created.
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        50,   // iterations
        1.,   // expected pass ratio
        0.05, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                // The created file must not begin with any of these: they
                // indicate leaked markdown fences or stray leading chatter.
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}
758
/// Eval: add a new test for overwriting a file in `action_log.rs`. The first
/// `read_file` result simulates an outline view (symbols with line ranges),
/// which the scripted assistant then drills into with ranged reads before
/// making the edit. Scored by an LLM judge on the diff.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.65 (2025-06-14)
    // claude-sonnet-4                | 0.07 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 0.35 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200, // iterations
        0.5, // expected pass ratio — TODO: make this eval better
        0.05, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                             struct HunkStatus [L1538-1542]
                              range [L1539]
                              diff_status [L1540]
                              old_text [L1541]
                             fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
989
/// Eval: create an empty file without the agent writing its "thoughts" into
/// it. Passes only if the created file content is exactly empty.
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.00 (2025-06-14)
    // claude-sonnet-4                | 1.00 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.00 (2025-05-21)
    // gemini-2.5-flash-preview-04-17 | 1.00 (2025-05-21)
    // gpt-4.1                        | 1.00 (2025-05-21)
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        100,  // iterations
        0.99, // expected pass ratio
        0.05, // mismatched-tag threshold
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
1068
1069fn message(
1070 role: Role,
1071 contents: impl IntoIterator<Item = MessageContent>,
1072) -> LanguageModelRequestMessage {
1073 LanguageModelRequestMessage {
1074 role,
1075 content: contents.into_iter().collect(),
1076 cache: false,
1077 }
1078}
1079
1080fn text(text: impl Into<String>) -> MessageContent {
1081 MessageContent::Text(text.into())
1082}
1083
/// Returns the zero-indexed lines of `input` in `range` (end-exclusive),
/// joined with newlines. Ranges extending past the end of the input are
/// silently truncated.
fn lines(input: &str, range: Range<usize>) -> String {
    let mut selected = Vec::new();
    for (index, line) in input.lines().enumerate() {
        if index >= range.end {
            break;
        }
        if index >= range.start {
            selected.push(line);
        }
    }
    selected.join("\n")
}
1092
1093fn tool_use(
1094 id: impl Into<Arc<str>>,
1095 name: impl Into<Arc<str>>,
1096 input: impl Serialize,
1097) -> MessageContent {
1098 MessageContent::ToolUse(LanguageModelToolUse {
1099 id: LanguageModelToolUseId::from(id.into()),
1100 name: name.into(),
1101 raw_input: serde_json::to_string_pretty(&input).unwrap(),
1102 input: serde_json::to_value(input).unwrap(),
1103 is_input_complete: true,
1104 })
1105}
1106
1107fn tool_result(
1108 id: impl Into<Arc<str>>,
1109 name: impl Into<Arc<str>>,
1110 result: impl Into<Arc<str>>,
1111) -> MessageContent {
1112 MessageContent::ToolResult(LanguageModelToolResult {
1113 tool_use_id: LanguageModelToolUseId::from(id.into()),
1114 tool_name: name.into(),
1115 is_error: false,
1116 content: LanguageModelToolResultContent::Text(result.into()),
1117 output: None,
1118 })
1119}
1120
/// A single eval scenario: a scripted conversation ending in an `edit_file`
/// tool use, the content being edited, and the assertion used to score the
/// agent's output.
#[derive(Clone)]
struct EvalInput {
    // Full request history replayed to the model; must end with the
    // assistant's `edit_file` tool use.
    conversation: Vec<LanguageModelRequestMessage>,
    // Parsed input of that final `edit_file` tool use.
    edit_file_input: EditFileToolInput,
    // File content before editing; `None` when the file is being created.
    input_content: Option<String>,
    // How the resulting edit is scored.
    assertion: EvalAssertion,
}
1128
1129impl EvalInput {
1130 fn from_conversation(
1131 conversation: Vec<LanguageModelRequestMessage>,
1132 input_content: Option<String>,
1133 assertion: EvalAssertion,
1134 ) -> Self {
1135 let msg = conversation.last().expect("Conversation must not be empty");
1136 if msg.role != Role::Assistant {
1137 panic!("Conversation must end with an assistant message");
1138 }
1139 let tool_use = msg
1140 .content
1141 .iter()
1142 .flat_map(|content| match content {
1143 MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1144 Some(tool_use)
1145 }
1146 _ => None,
1147 })
1148 .next()
1149 .expect("Conversation must end with an edit_file tool use")
1150 .clone();
1151
1152 let edit_file_input: EditFileToolInput =
1153 serde_json::from_value(tool_use.input.clone()).unwrap();
1154
1155 EvalInput {
1156 conversation,
1157 edit_file_input,
1158 input_content,
1159 assertion,
1160 }
1161 }
1162}
1163
/// The outcome of one eval iteration: the file before and after the agent's
/// edit, plus the raw agent output and the resulting diff.
#[derive(Clone)]
struct EvalSample {
    // File content before the agent's edit.
    text_before: String,
    // File content after the agent's edit.
    text_after: String,
    // Raw output reported by the edit agent for this run.
    edit_output: EditAgentOutput,
    // Diff between `text_before` and `text_after`.
    diff: String,
}
1171
/// Object-safe wrapper around an async assertion function, so that
/// `EvalAssertion` can hold arbitrary async closures behind an `Arc<dyn _>`.
trait AssertionFn: 'static + Send + Sync {
    // Scores `sample`, optionally consulting `judge_model` (e.g. for
    // LLM-judged diffs). Returns a boxed non-Send future borrowed from self.
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1180
// Blanket impl: any suitable async closure is an `AssertionFn`, so callers
// can pass plain closures to `EvalAssertion::new` without boilerplate.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1201
/// A cloneable, shareable assertion used to score an `EvalSample`.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1204
impl EvalAssertion {
    /// Wraps an async closure as an assertion.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Assertion that scores 100 iff the edited text equals `expected`,
    /// after stripping empty lines from both sides.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Assertion that scores 100 iff applying any one of `expected_diffs`
    /// to the original text yields the edited text (modulo empty lines).
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// Assertion that asks the judge model to score the sample's diff
    /// against a natural-language list of expectations. The judge's reply
    /// must contain a `<score>N</score>` tag.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                ..Default::default()
            };
            // Judges can be rate-limited under eval load; retry the whole
            // completion rather than failing the iteration.
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output) {
                if let Some(score_match) = captures.get(1) {
                    // Unparseable digits degrade to a score of 0 rather
                    // than an error.
                    let score = score_match.as_str().parse().unwrap_or(0);
                    return Ok(EvalAssertionOutcome {
                        score,
                        message: Some(output),
                    });
                }
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Runs this assertion against `input`, forwarding the judge model and
    /// test context to the wrapped closure.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1304
1305fn eval(
1306 iterations: usize,
1307 expected_pass_ratio: f32,
1308 mismatched_tag_threshold: f32,
1309 mut eval: EvalInput,
1310) {
1311 let mut evaluated_count = 0;
1312 let mut failed_count = 0;
1313 report_progress(evaluated_count, failed_count, iterations);
1314
1315 let (tx, rx) = mpsc::channel();
1316
1317 // Cache the last message in the conversation, and run one instance of the eval so that
1318 // all the next ones are cached.
1319 eval.conversation.last_mut().unwrap().cache = true;
1320 run_eval(eval.clone(), tx.clone());
1321
1322 let executor = gpui::background_executor();
1323 let semaphore = Arc::new(smol::lock::Semaphore::new(32));
1324 for _ in 1..iterations {
1325 let eval = eval.clone();
1326 let tx = tx.clone();
1327 let semaphore = semaphore.clone();
1328 executor
1329 .spawn(async move {
1330 let _guard = semaphore.acquire().await;
1331 run_eval(eval, tx)
1332 })
1333 .detach();
1334 }
1335 drop(tx);
1336
1337 let mut failed_evals = HashMap::default();
1338 let mut errored_evals = HashMap::default();
1339 let mut eval_outputs = Vec::new();
1340 let mut cumulative_parser_metrics = EditParserMetrics::default();
1341 while let Ok(output) = rx.recv() {
1342 match output {
1343 Ok(output) => {
1344 cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
1345 eval_outputs.push(output.clone());
1346 if output.assertion.score < 80 {
1347 failed_count += 1;
1348 failed_evals
1349 .entry(output.sample.text_after.clone())
1350 .or_insert(Vec::new())
1351 .push(output);
1352 }
1353 }
1354 Err(error) => {
1355 failed_count += 1;
1356 *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
1357 }
1358 }
1359
1360 evaluated_count += 1;
1361 report_progress(evaluated_count, failed_count, iterations);
1362 }
1363
1364 let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
1365 println!("Actual pass ratio: {}\n", actual_pass_ratio);
1366 if actual_pass_ratio < expected_pass_ratio {
1367 let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
1368 errored_evals.sort_by_key(|(_, count)| Reverse(*count));
1369 for (error, count) in errored_evals {
1370 println!("Eval errored {} times. Error: {}", count, error);
1371 }
1372
1373 let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
1374 failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
1375 for (_buffer_output, failed_evals) in failed_evals {
1376 let eval_output = failed_evals.first().unwrap();
1377 println!("Eval failed {} times", failed_evals.len());
1378 println!("{}", eval_output);
1379 }
1380
1381 panic!(
1382 "Actual pass ratio: {}\nExpected pass ratio: {}",
1383 actual_pass_ratio, expected_pass_ratio
1384 );
1385 }
1386
1387 let mismatched_tag_ratio =
1388 cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
1389 if mismatched_tag_ratio > mismatched_tag_threshold {
1390 for eval_output in eval_outputs {
1391 println!("{}", eval_output);
1392 }
1393 panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
1394 }
1395}
1396
1397fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
1398 let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
1399 let mut cx = TestAppContext::build(dispatcher, None);
1400 let output = cx.executor().block_test(async {
1401 let test = EditAgentTest::new(&mut cx).await;
1402 test.eval(eval, &mut cx).await
1403 });
1404 tx.send(output).unwrap();
1405}
1406
/// One eval run's sample paired with the outcome of its assertion.
#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}
1412
1413impl Display for EvalOutput {
1414 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1415 writeln!(f, "Score: {:?}", self.assertion.score)?;
1416 if let Some(message) = self.assertion.message.as_ref() {
1417 writeln!(f, "Message: {}", message)?;
1418 }
1419
1420 writeln!(f, "Diff:\n{}", self.sample.diff)?;
1421
1422 writeln!(
1423 f,
1424 "Parser Metrics:\n{:#?}",
1425 self.sample.edit_output.parser_metrics
1426 )?;
1427 writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1428 Ok(())
1429 }
1430}
1431
/// Overwrites the current terminal line with a progress counter of the form
/// `Evaluated N/M (P% passed)`.
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    // Fraction of completed evals that passed; guard the 0/0 case before any
    // result has arrived.
    let passed_ratio = if evaluated_count == 0 {
        0.0
    } else {
        (evaluated_count - failed_count) as f64 / evaluated_count as f64
    };
    // "\r\x1b[K" returns to column zero and clears the line, so the counter
    // updates in place instead of scrolling.
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}
1447
/// Test harness bundling the edit agent under test, the fake project it
/// edits, and the model used to judge results.
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}
1453
1454impl EditAgentTest {
1455 async fn new(cx: &mut TestAppContext) -> Self {
1456 cx.executor().allow_parking();
1457
1458 let fs = FakeFs::new(cx.executor().clone());
1459 cx.update(|cx| {
1460 settings::init(cx);
1461 gpui_tokio::init(cx);
1462 let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1463 cx.set_http_client(http_client);
1464
1465 client::init_settings(cx);
1466 let client = Client::production(cx);
1467 let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1468
1469 settings::init(cx);
1470 Project::init_settings(cx);
1471 language::init(cx);
1472 language_model::init(client.clone(), cx);
1473 language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
1474 crate::init(client.http_client(), cx);
1475 });
1476
1477 fs.insert_tree("/root", json!({})).await;
1478 let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1479 let agent_model = SelectedModel::from_str(
1480 &std::env::var("ZED_AGENT_MODEL")
1481 .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1482 )
1483 .unwrap();
1484 let judge_model = SelectedModel::from_str(
1485 &std::env::var("ZED_JUDGE_MODEL")
1486 .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1487 )
1488 .unwrap();
1489 let (agent_model, judge_model) = cx
1490 .update(|cx| {
1491 cx.spawn(async move |cx| {
1492 let agent_model = Self::load_model(&agent_model, cx).await;
1493 let judge_model = Self::load_model(&judge_model, cx).await;
1494 (agent_model.unwrap(), judge_model.unwrap())
1495 })
1496 })
1497 .await;
1498 let action_log = cx.new(|_| ActionLog::new(project.clone()));
1499
1500 let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();
1501
1502 Self {
1503 agent: EditAgent::new(
1504 agent_model,
1505 project.clone(),
1506 action_log,
1507 Templates::new(),
1508 edit_format,
1509 ),
1510 project,
1511 judge_model,
1512 }
1513 }
1514
1515 async fn load_model(
1516 selected_model: &SelectedModel,
1517 cx: &mut AsyncApp,
1518 ) -> Result<Arc<dyn LanguageModel>> {
1519 let (provider, model) = cx.update(|cx| {
1520 let models = LanguageModelRegistry::read_global(cx);
1521 let model = models
1522 .available_models(cx)
1523 .find(|model| {
1524 model.provider_id() == selected_model.provider
1525 && model.id() == selected_model.model
1526 })
1527 .expect("Model not found");
1528 let provider = models.provider(&model.provider_id()).unwrap();
1529 (provider, model)
1530 })?;
1531 cx.update(|cx| provider.authenticate(cx))?.await?;
1532 Ok(model)
1533 }
1534
1535 async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
1536 let path = self
1537 .project
1538 .read_with(cx, |project, cx| {
1539 project.find_project_path(eval.edit_file_input.path, cx)
1540 })
1541 .unwrap();
1542 let buffer = self
1543 .project
1544 .update(cx, |project, cx| project.open_buffer(path, cx))
1545 .await
1546 .unwrap();
1547 let tools = cx.update(|cx| {
1548 ToolRegistry::default_global(cx)
1549 .tools()
1550 .into_iter()
1551 .filter_map(|tool| {
1552 let input_schema = tool
1553 .input_schema(self.agent.model.tool_input_format())
1554 .ok()?;
1555 Some(LanguageModelRequestTool {
1556 name: tool.name(),
1557 description: tool.description(),
1558 input_schema,
1559 })
1560 })
1561 .collect::<Vec<_>>()
1562 });
1563 let tool_names = tools
1564 .iter()
1565 .map(|tool| tool.name.clone())
1566 .collect::<Vec<_>>();
1567 let worktrees = vec![WorktreeContext {
1568 root_name: "root".to_string(),
1569 abs_path: Path::new("/path/to/root").into(),
1570 rules_file: None,
1571 }];
1572 let prompt_builder = PromptBuilder::new(None)?;
1573 let project_context = ProjectContext::new(worktrees, Vec::default());
1574 let system_prompt = prompt_builder.generate_assistant_system_prompt(
1575 &project_context,
1576 &ModelContext {
1577 available_tools: tool_names,
1578 },
1579 )?;
1580
1581 let has_system_prompt = eval
1582 .conversation
1583 .first()
1584 .map_or(false, |msg| msg.role == Role::System);
1585 let messages = if has_system_prompt {
1586 eval.conversation
1587 } else {
1588 [LanguageModelRequestMessage {
1589 role: Role::System,
1590 content: vec![MessageContent::Text(system_prompt)],
1591 cache: true,
1592 }]
1593 .into_iter()
1594 .chain(eval.conversation)
1595 .collect::<Vec<_>>()
1596 };
1597
1598 let conversation = LanguageModelRequest {
1599 messages,
1600 tools,
1601 ..Default::default()
1602 };
1603
1604 let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1605 if let Some(input_content) = eval.input_content.as_deref() {
1606 buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1607 }
1608 retry_on_rate_limit(async || {
1609 self.agent
1610 .edit(
1611 buffer.clone(),
1612 eval.edit_file_input.display_description.clone(),
1613 &conversation,
1614 &mut cx.to_async(),
1615 )
1616 .0
1617 .await
1618 })
1619 .await?
1620 } else {
1621 retry_on_rate_limit(async || {
1622 self.agent
1623 .overwrite(
1624 buffer.clone(),
1625 eval.edit_file_input.display_description.clone(),
1626 &conversation,
1627 &mut cx.to_async(),
1628 )
1629 .0
1630 .await
1631 })
1632 .await?
1633 };
1634
1635 let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1636 let sample = EvalSample {
1637 edit_output,
1638 diff: language::unified_diff(
1639 eval.input_content.as_deref().unwrap_or_default(),
1640 &buffer_text,
1641 ),
1642 text_before: eval.input_content.unwrap_or_default(),
1643 text_after: buffer_text,
1644 };
1645 let assertion = eval
1646 .assertion
1647 .run(&sample, self.judge_model.clone(), cx)
1648 .await?;
1649
1650 Ok(EvalOutput { assertion, sample })
1651 }
1652}
1653
1654async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
1655 let mut attempt = 0;
1656 loop {
1657 attempt += 1;
1658 match request().await {
1659 Ok(result) => return Ok(result),
1660 Err(err) => match err.downcast::<LanguageModelCompletionError>() {
1661 Ok(err) => match err {
1662 LanguageModelCompletionError::RateLimitExceeded { retry_after } => {
1663 // Wait for the duration supplied, with some jitter to avoid all requests being made at the same time.
1664 let jitter = retry_after.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
1665 eprintln!(
1666 "Attempt #{attempt}: Rate limit exceeded. Retry after {retry_after:?} + jitter of {jitter:?}"
1667 );
1668 Timer::after(retry_after + jitter).await;
1669 continue;
1670 }
1671 _ => return Err(err.into()),
1672 },
1673 Err(err) => return Err(err),
1674 },
1675 }
1676 }
1677}
1678
/// Result of running an assertion: a 0-100 score (scores below 80 are
/// treated as failures by `eval`) and an optional judge message.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}
1684
/// Template inputs for the LLM diff judge (rendered via `diff_judge.hbs`).
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    // Unified diff the judge is asked to grade.
    diff: String,
    // Free-form assertion text the diff is graded against.
    assertions: &'static str,
}
1690
impl Template for DiffJudgeTemplate {
    /// Handlebars template file used to render the judge prompt.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1694
/// Returns `text` with every blank (empty or whitespace-only) line removed,
/// joining the remaining lines with `\n` and no trailing newline.
fn strip_empty_lines(text: &str) -> String {
    let mut result = String::new();
    for line in text.lines() {
        if line.trim().is_empty() {
            continue;
        }
        if !result.is_empty() {
            result.push('\n');
        }
        result.push_str(line);
    }
    result
}