1use super::*;
2use crate::{
3 ReadFileToolInput,
4 edit_file_tool::{EditFileMode, EditFileToolInput},
5 grep_tool::GrepToolInput,
6 list_directory_tool::ListDirectoryToolInput,
7};
8use Role::*;
9use assistant_tool::ToolRegistry;
10use client::{Client, UserStore};
11use collections::HashMap;
12use fs::FakeFs;
13use futures::{FutureExt, future::LocalBoxFuture};
14use gpui::{AppContext, TestAppContext, Timer};
15use indoc::{formatdoc, indoc};
16use language_model::{
17 LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
18 LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
19};
20use project::Project;
21use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
22use rand::prelude::*;
23use reqwest_client::ReqwestClient;
24use serde_json::json;
25use std::{
26 cmp::Reverse,
27 fmt::{self, Display},
28 io::Write as _,
29 path::Path,
30 str::FromStr,
31 sync::mpsc,
32 time::Duration,
33};
34use util::path;
35
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // Eval: the agent is asked to extract a `handle_command_output` method out
    // of `run_git_blame`; the resulting edit must match one of several
    // known-good diffs (multiple acceptable phrasings of the same refactor).
    //
    // Model | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet | 0.99 (2025-06-14)
    // claude-sonnet-4 | 0.97 (2025-06-14)
    // gemini-2.5-pro-06-05 | 0.98 (2025-06-16)
    // gemini-2.5-flash | 0.11 (2025-05-22)
    // gpt-4.1 | 1.00 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // Any one of these diffs counts as a pass.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    // NOTE(review): positional args appear to be (samples, required pass rate,
    // tolerated mismatched-tag rate) — confirm against `eval`'s signature.
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
112
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Eval: the agent must delete exactly the `run_git_blame` function
    // (leaving its call sites alone); the edited file must equal the
    // `after.rs` fixture (ignoring empty lines — see `assert_eq`).
    //
    // Model | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05 | 1.0 (2025-06-16)
    // gemini-2.5-flash |
    // gpt-4.1 |
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}
174
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    // Eval: the agent edits a file in place, translating all doc comments to
    // Italian. Graded by an LLM judge on the diff rather than exact output.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    // 200 samples with a required pass rate of 1.0.
    eval(
        200,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}
236
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Eval: a larger refactor spread across a window of the file, read in
    // three chunks before editing. Graded by an LLM judge on the diff.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.96 (2025-06-14)
    // claude-sonnet-4 | 0.11 (2025-06-14)
    // gemini-2.5-pro-preview-latest | 0.99 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    // `{{language_name}}` is escaped so the literal braces survive
                    // `formatdoc!` interpolation.
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                // The file is read in three consecutive line ranges, mimicking
                // how an agent pages through a large file before editing.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}
361
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Eval: the agent sees scattered grep context (not the whole file) and must
    // comment out the BlinkManager interactions; the result must match one of
    // the known-good diffs.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.59 (2025-07-14)
    // claude-sonnet-4 | 0.81 (2025-07-14)
    // gemini-2.5-pro | 0.95 (2025-07-14)
    // gemini-2.5-flash-preview-04-17 | 0.78 (2025-07-14)
    // gpt-4.1 | 0.00 (2025-07-14) (follows edit_description too literally)
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    let possible_diffs = vec![
        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
    ];
    // Deliberately low bar (0.51): several models struggle with this one.
    eval(
        100,
        0.51,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    // NOTE(review): `join` inserts the separator *between*
                    // chunks, so the first chunk carries no "Match found:"
                    // header — presumably close enough for the eval; confirm
                    // if exact grep-output fidelity ever matters.
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
446
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Eval: the agent must add a `from_pixels` constructor to `Canvas` and
    // tests for it in the same file, after a realistic research phase (two
    // empty grep results, then a grep for existing `#[test]` patterns).
    // Graded by an LLM judge on the diff.
    //
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    // Model | Date | Pass rate
    // =========================================================
    // claude-4.0-sonnet | 2025-06-14 | 0.99
    // claude-3.7-sonnet | 2025-06-14 | 0.88
    // gemini-2.5-pro-preview-06-05 | 2025-06-16 | 0.98
    // gpt-4.1 |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing for now, let's see if we can bring this down.
        0.25,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    // Canned grep output showing how other files in the repo
                    // structure their test modules.
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_weight() {
                                // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_stretch() {
                                // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                            }

                            #[test]
                            fn test_css_to_core_text_font_stretch() {
                                // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}
655
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    // Eval: the agent creates a brand-new file (no prior content). The custom
    // assertion only checks the file does not start with junk the model might
    // emit (leading whitespace, a markdown code fence, or a blank line).
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 | 1.0 (2025-05-22)
    // gpt-4.1 | 1.0 (2025-05-22)
    let input_file_path = "root/zode.py";
    // New file: there is no pre-existing content.
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        50,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                // A single assistant message may carry multiple tool uses.
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            // Custom assertion: fail (score 0) if the created file starts with
            // a character that indicates the model leaked prose or a fence.
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}
761
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    // Eval: the agent first sees only a symbol outline of `action_log.rs`,
    // then reads targeted ranges before adding a new test. An LLM judge checks
    // that a new test was added without modifying existing ones.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.65 (2025-06-14)
    // claude-sonnet-4 | 0.07 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 0.35 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    // Canned `read_file` response: a symbol outline instead of
                    // full contents, as the tool returns for very large files.
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                                tracked_buffers [L15]
                                edited_since_project_diagnostics_check [L17]
                                project [L19]
                            impl ActionLog [L22-498]
                                pub fn new [L24-30]
                                pub fn project [L32-34]
                                pub fn checked_project_diagnostics [L37-39]
                                pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                                fn track_buffer_internal [L46-101]
                                fn handle_buffer_event [L103-116]
                                fn handle_buffer_edited [L118-123]
                                fn handle_buffer_file_changed [L125-158]
                                async fn maintain_diff [L160-264]
                                pub fn buffer_read [L267-269]
                                pub fn buffer_created [L272-276]
                                pub fn buffer_edited [L279-287]
                                pub fn will_delete_buffer [L289-304]
                                pub fn keep_edits_in_range [L306-364]
                                pub fn reject_edits_in_ranges [L366-459]
                                pub fn keep_all_edits [L461-473]
                                pub fn changed_buffers [L476-482]
                                pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                                User [L618]
                                Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                                Created [L624]
                                Modified [L625]
                                Deleted [L626]
                            struct TrackedBuffer [L629-641]
                                buffer [L630]
                                base_text [L631]
                                unreviewed_changes [L632]
                                status [L633]
                                version [L634]
                                diff [L635]
                                snapshot [L636]
                                diff_update [L637]
                                _open_lsp_handle [L638]
                                _maintain_diff [L639]
                                _subscription [L640]
                            impl TrackedBuffer [L643-657]
                                fn has_changes [L644-650]
                                fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                                pub diff [L660]
                            mod tests [L664-1574]
                                fn init_logger [L678-682]
                                fn init_test [L684-691]
                                async fn test_keep_edits [L694-769]
                                async fn test_deletions [L772-854]
                                async fn test_overlapping_user_edits [L857-951]
                                async fn test_creating_files [L954-1010]
                                async fn test_deleting_files [L1013-1120]
                                async fn test_reject_edits [L1123-1255]
                                async fn test_reject_multiple_edits [L1258-1331]
                                async fn test_reject_deleted_file [L1334-1388]
                                async fn test_reject_created_file [L1391-1443]
                                async fn test_random_diffs [L1446-1535]
                                    fn quiesce [L1510-1534]
                                struct HunkStatus [L1538-1542]
                                    range [L1539]
                                    diff_status [L1540]
                                    old_text [L1541]
                                fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
992
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.00 (2025-06-14)
    // claude-sonnet-4 | 1.00 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.00 (2025-05-21)
    // gemini-2.5-flash-preview-04-17 | 1.00 (2025-05-21)
    // gpt-4.1 | 1.00 (2025-05-21)
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    // New file: no pre-existing content, and the created file must be empty.
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        100,
        0.99,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
1071
1072fn message(
1073 role: Role,
1074 contents: impl IntoIterator<Item = MessageContent>,
1075) -> LanguageModelRequestMessage {
1076 LanguageModelRequestMessage {
1077 role,
1078 content: contents.into_iter().collect(),
1079 cache: false,
1080 }
1081}
1082
1083fn text(text: impl Into<String>) -> MessageContent {
1084 MessageContent::Text(text.into())
1085}
1086
/// Returns the lines of `input` whose 0-based indices fall within `range`,
/// joined with `\n` (no trailing newline). Out-of-bounds or empty ranges
/// yield an empty string.
fn lines(input: &str, range: Range<usize>) -> String {
    let mut selected = Vec::new();
    for (index, line) in input.lines().enumerate() {
        if range.contains(&index) {
            selected.push(line);
        }
    }
    selected.join("\n")
}
1095
1096fn tool_use(
1097 id: impl Into<Arc<str>>,
1098 name: impl Into<Arc<str>>,
1099 input: impl Serialize,
1100) -> MessageContent {
1101 MessageContent::ToolUse(LanguageModelToolUse {
1102 id: LanguageModelToolUseId::from(id.into()),
1103 name: name.into(),
1104 raw_input: serde_json::to_string_pretty(&input).unwrap(),
1105 input: serde_json::to_value(input).unwrap(),
1106 is_input_complete: true,
1107 })
1108}
1109
1110fn tool_result(
1111 id: impl Into<Arc<str>>,
1112 name: impl Into<Arc<str>>,
1113 result: impl Into<Arc<str>>,
1114) -> MessageContent {
1115 MessageContent::ToolResult(LanguageModelToolResult {
1116 tool_use_id: LanguageModelToolUseId::from(id.into()),
1117 tool_name: name.into(),
1118 is_error: false,
1119 content: LanguageModelToolResultContent::Text(result.into()),
1120 output: None,
1121 })
1122}
1123
/// Everything needed to run one eval: the recorded conversation plus how to
/// grade the edit the agent produces.
#[derive(Clone)]
struct EvalInput {
    /// Full request conversation, ending with the assistant's `edit_file` tool use.
    conversation: Vec<LanguageModelRequestMessage>,
    /// Parsed input of that final `edit_file` tool use.
    edit_file_input: EditFileToolInput,
    /// File contents before the edit; `None` when the eval creates a new file.
    input_content: Option<String>,
    /// Grading function applied to each sampled edit.
    assertion: EvalAssertion,
}
1131
1132impl EvalInput {
1133 fn from_conversation(
1134 conversation: Vec<LanguageModelRequestMessage>,
1135 input_content: Option<String>,
1136 assertion: EvalAssertion,
1137 ) -> Self {
1138 let msg = conversation.last().expect("Conversation must not be empty");
1139 if msg.role != Role::Assistant {
1140 panic!("Conversation must end with an assistant message");
1141 }
1142 let tool_use = msg
1143 .content
1144 .iter()
1145 .flat_map(|content| match content {
1146 MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1147 Some(tool_use)
1148 }
1149 _ => None,
1150 })
1151 .next()
1152 .expect("Conversation must end with an edit_file tool use")
1153 .clone();
1154
1155 let edit_file_input: EditFileToolInput =
1156 serde_json::from_value(tool_use.input.clone()).unwrap();
1157
1158 EvalInput {
1159 conversation,
1160 edit_file_input,
1161 input_content,
1162 assertion,
1163 }
1164 }
1165}
1166
/// One sampled run of the edit agent, captured for grading.
#[derive(Clone)]
struct EvalSample {
    /// File contents before the agent's edit.
    text_before: String,
    /// File contents after the agent's edit.
    text_after: String,
    /// Raw output produced by the edit agent for this run.
    edit_output: EditAgentOutput,
    /// Rendered diff of the edit (presumably `text_before` → `text_after`;
    /// produced by the eval harness — confirm there).
    diff: String,
}
1174
/// Object-safe adapter that lets [`EvalAssertion`] store any async grading
/// closure behind a single `Arc<dyn AssertionFn>`.
trait AssertionFn: 'static + Send + Sync {
    /// Grades `sample`, optionally consulting `judge_model`, resolving to an
    /// `EvalAssertionOutcome`. Returns a boxed, non-`Send` future so the
    /// trait remains object-safe.
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1183
/// Blanket implementation: any async closure with the matching signature is
/// usable as an [`AssertionFn`].
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        // Box the closure's future locally to satisfy the trait's
        // object-safe return type.
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1204
/// A cloneable, type-erased grading function applied to each [`EvalSample`].
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1207
1208impl EvalAssertion {
    /// Wraps an arbitrary async grading closure in an [`EvalAssertion`].
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }
1222
1223 fn assert_eq(expected: impl Into<String>) -> Self {
1224 let expected = expected.into();
1225 Self::new(async move |sample, _judge, _cx| {
1226 Ok(EvalAssertionOutcome {
1227 score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
1228 100
1229 } else {
1230 0
1231 },
1232 message: None,
1233 })
1234 })
1235 }
1236
1237 fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
1238 let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
1239 Self::new(async move |sample, _judge, _cx| {
1240 let matches = expected_diffs.iter().any(|possible_diff| {
1241 let expected =
1242 language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
1243 strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
1244 });
1245
1246 Ok(EvalAssertionOutcome {
1247 score: if matches { 100 } else { 0 },
1248 message: None,
1249 })
1250 })
1251 }
1252
1253 fn judge_diff(assertions: &'static str) -> Self {
1254 Self::new(async move |sample, judge, cx| {
1255 let prompt = DiffJudgeTemplate {
1256 diff: sample.diff.clone(),
1257 assertions,
1258 }
1259 .render(&Templates::new())
1260 .unwrap();
1261
1262 let request = LanguageModelRequest {
1263 messages: vec![LanguageModelRequestMessage {
1264 role: Role::User,
1265 content: vec![prompt.into()],
1266 cache: false,
1267 }],
1268 thinking_allowed: true,
1269 ..Default::default()
1270 };
1271 let mut response = retry_on_rate_limit(async || {
1272 Ok(judge
1273 .stream_completion_text(request.clone(), &cx.to_async())
1274 .await?)
1275 })
1276 .await?;
1277 let mut output = String::new();
1278 while let Some(chunk) = response.stream.next().await {
1279 let chunk = chunk?;
1280 output.push_str(&chunk);
1281 }
1282
1283 // Parse the score from the response
1284 let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
1285 if let Some(captures) = re.captures(&output) {
1286 if let Some(score_match) = captures.get(1) {
1287 let score = score_match.as_str().parse().unwrap_or(0);
1288 return Ok(EvalAssertionOutcome {
1289 score,
1290 message: Some(output),
1291 });
1292 }
1293 }
1294
1295 anyhow::bail!("No score found in response. Raw output: {output}");
1296 })
1297 }
1298
1299 async fn run(
1300 &self,
1301 input: &EvalSample,
1302 judge_model: Arc<dyn LanguageModel>,
1303 cx: &mut TestAppContext,
1304 ) -> Result<EvalAssertionOutcome> {
1305 self.0.assert(input, judge_model, cx).await
1306 }
1307}
1308
1309fn eval(
1310 iterations: usize,
1311 expected_pass_ratio: f32,
1312 mismatched_tag_threshold: f32,
1313 mut eval: EvalInput,
1314) {
1315 let mut evaluated_count = 0;
1316 let mut failed_count = 0;
1317 report_progress(evaluated_count, failed_count, iterations);
1318
1319 let (tx, rx) = mpsc::channel();
1320
1321 // Cache the last message in the conversation, and run one instance of the eval so that
1322 // all the next ones are cached.
1323 eval.conversation.last_mut().unwrap().cache = true;
1324 run_eval(eval.clone(), tx.clone());
1325
1326 let executor = gpui::background_executor();
1327 let semaphore = Arc::new(smol::lock::Semaphore::new(32));
1328 for _ in 1..iterations {
1329 let eval = eval.clone();
1330 let tx = tx.clone();
1331 let semaphore = semaphore.clone();
1332 executor
1333 .spawn(async move {
1334 let _guard = semaphore.acquire().await;
1335 run_eval(eval, tx)
1336 })
1337 .detach();
1338 }
1339 drop(tx);
1340
1341 let mut failed_evals = HashMap::default();
1342 let mut errored_evals = HashMap::default();
1343 let mut eval_outputs = Vec::new();
1344 let mut cumulative_parser_metrics = EditParserMetrics::default();
1345 while let Ok(output) = rx.recv() {
1346 match output {
1347 Ok(output) => {
1348 cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
1349 eval_outputs.push(output.clone());
1350 if output.assertion.score < 80 {
1351 failed_count += 1;
1352 failed_evals
1353 .entry(output.sample.text_after.clone())
1354 .or_insert(Vec::new())
1355 .push(output);
1356 }
1357 }
1358 Err(error) => {
1359 failed_count += 1;
1360 *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
1361 }
1362 }
1363
1364 evaluated_count += 1;
1365 report_progress(evaluated_count, failed_count, iterations);
1366 }
1367
1368 let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
1369 println!("Actual pass ratio: {}\n", actual_pass_ratio);
1370 if actual_pass_ratio < expected_pass_ratio {
1371 let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
1372 errored_evals.sort_by_key(|(_, count)| Reverse(*count));
1373 for (error, count) in errored_evals {
1374 println!("Eval errored {} times. Error: {}", count, error);
1375 }
1376
1377 let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
1378 failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
1379 for (_buffer_output, failed_evals) in failed_evals {
1380 let eval_output = failed_evals.first().unwrap();
1381 println!("Eval failed {} times", failed_evals.len());
1382 println!("{}", eval_output);
1383 }
1384
1385 panic!(
1386 "Actual pass ratio: {}\nExpected pass ratio: {}",
1387 actual_pass_ratio, expected_pass_ratio
1388 );
1389 }
1390
1391 let mismatched_tag_ratio =
1392 cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
1393 if mismatched_tag_ratio > mismatched_tag_threshold {
1394 for eval_output in eval_outputs {
1395 println!("{}", eval_output);
1396 }
1397 panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
1398 }
1399}
1400
1401fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
1402 let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
1403 let mut cx = TestAppContext::build(dispatcher, None);
1404 let output = cx.executor().block_test(async {
1405 let test = EditAgentTest::new(&mut cx).await;
1406 test.eval(eval, &mut cx).await
1407 });
1408 tx.send(output).unwrap();
1409}
1410
/// The result of one eval run: the sample the agent produced together with
/// the assertion's scored outcome for it.
#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}
1416
1417impl Display for EvalOutput {
1418 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1419 writeln!(f, "Score: {:?}", self.assertion.score)?;
1420 if let Some(message) = self.assertion.message.as_ref() {
1421 writeln!(f, "Message: {}", message)?;
1422 }
1423
1424 writeln!(f, "Diff:\n{}", self.sample.diff)?;
1425
1426 writeln!(
1427 f,
1428 "Parser Metrics:\n{:#?}",
1429 self.sample.edit_output.parser_metrics
1430 )?;
1431 writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1432 Ok(())
1433 }
1434}
1435
/// Rewrites a single stdout progress line showing how many evals have
/// completed and what fraction of them passed so far.
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_count = evaluated_count - failed_count;
    // Avoid dividing by zero before the first eval completes.
    let passed_ratio = match evaluated_count {
        0 => 0.0,
        completed => passed_count as f64 / completed as f64,
    };
    // `\r` plus clear-to-end-of-line keeps the report on one line.
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}
1451
/// Test harness bundling the edit agent under evaluation, the project it
/// edits, and the model used to judge its output.
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}
1457
impl EditAgentTest {
    /// Builds a harness with a fake filesystem rooted at `/root`, a
    /// production client, and agent/judge models selected via the
    /// `ZED_AGENT_MODEL` / `ZED_JUDGE_MODEL` env vars (both defaulting to
    /// `anthropic/claude-3-7-sonnet-latest`).
    async fn new(cx: &mut TestAppContext) -> Self {
        // Evals hit real model providers over the network, so the test
        // executor must be allowed to park while waiting on I/O.
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor().clone());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);

            client::init_settings(cx);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

            // NOTE(review): `settings::init` was already called above —
            // presumably a harmless re-init, but confirm one call can go.
            settings::init(cx);
            Project::init_settings(cx);
            language::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store.clone(), client.clone(), cx);
            crate::init(client.http_client(), cx);
        });

        // Empty worktree; each eval seeds its own input file content.
        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        // Resolve both models in the registry and authenticate their
        // providers before any eval runs.
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();

        Self {
            agent: EditAgent::new(
                agent_model,
                project.clone(),
                action_log,
                Templates::new(),
                edit_format,
            ),
            project,
            judge_model,
        }
    }

    /// Looks up `selected_model` in the global [`LanguageModelRegistry`]
    /// and authenticates its provider before returning it.
    ///
    /// Panics if no registered model matches the selection.
    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        let (provider, model) = cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .expect("Model not found");
            let provider = models.provider(&model.provider_id()).unwrap();
            (provider, model)
        })?;
        cx.update(|cx| provider.authenticate(cx))?.await?;
        Ok(model)
    }

    /// Executes one eval: replays the recorded conversation, lets the agent
    /// edit (or overwrite) the target buffer, and scores the result with
    /// the eval's assertion.
    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        // Open the buffer targeted by the recorded `edit_file` tool call.
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();
        // Advertise every registered tool to the model, mirroring the real
        // assistant request; tools whose schema fails to build are skipped.
        let tools = cx.update(|cx| {
            ToolRegistry::default_global(cx)
                .tools()
                .into_iter()
                .filter_map(|tool| {
                    let input_schema = tool
                        .input_schema(self.agent.model.tool_input_format())
                        .ok()?;
                    Some(LanguageModelRequestTool {
                        name: tool.name(),
                        description: tool.description(),
                        input_schema,
                    })
                })
                .collect::<Vec<_>>()
        });
        let tool_names = tools
            .iter()
            .map(|tool| tool.name.clone())
            .collect::<Vec<_>>();
        let worktrees = vec![WorktreeContext {
            root_name: "root".to_string(),
            abs_path: Path::new("/path/to/root").into(),
            rules_file: None,
        }];
        let prompt_builder = PromptBuilder::new(None)?;
        let project_context = ProjectContext::new(worktrees, Vec::default());
        let system_prompt = prompt_builder.generate_assistant_system_prompt(
            &project_context,
            &ModelContext {
                available_tools: tool_names,
            },
        )?;

        // Prepend the generated system prompt unless the recorded
        // conversation already begins with one.
        let has_system_prompt = eval
            .conversation
            .first()
            .map_or(false, |msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let conversation = LanguageModelRequest {
            messages,
            tools,
            thinking_allowed: true,
            ..Default::default()
        };

        // `Edit` mode seeds the buffer with the recorded input content and
        // streams edits into it; any other mode overwrites the whole buffer.
        // Both paths retry transparently on rate limits.
        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            retry_on_rate_limit(async || {
                self.agent
                    .edit(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        } else {
            retry_on_rate_limit(async || {
                self.agent
                    .overwrite(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        };

        // Snapshot the edited buffer and diff it against the original input.
        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
}
1658
1659async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
1660 let mut attempt = 0;
1661 loop {
1662 attempt += 1;
1663 match request().await {
1664 Ok(result) => return Ok(result),
1665 Err(err) => match err.downcast::<LanguageModelCompletionError>() {
1666 Ok(err) => match &err {
1667 LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
1668 | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
1669 let retry_after = retry_after.unwrap_or(Duration::from_secs(5));
1670 // Wait for the duration supplied, with some jitter to avoid all requests being made at the same time.
1671 let jitter = retry_after.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
1672 eprintln!(
1673 "Attempt #{attempt}: {err}. Retry after {retry_after:?} + jitter of {jitter:?}"
1674 );
1675 Timer::after(retry_after + jitter).await;
1676 continue;
1677 }
1678 _ => return Err(err.into()),
1679 },
1680 Err(err) => return Err(err),
1681 },
1682 }
1683 }
1684}
1685
/// The scored outcome of an [`EvalAssertion`]: a numeric score (compared
/// against a pass threshold of 80 in `eval`) plus an optional explanatory
/// message, e.g. the judge model's raw response.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}
1691
/// Template data for the diff-judge prompt: the diff to evaluate and the
/// natural-language assertions to check it against.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}
1697
// Renders via the `diff_judge.hbs` Handlebars template.
impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1701
/// Returns `text` with all blank (empty or whitespace-only) lines removed,
/// joining the remaining lines with `\n` and no trailing newline.
fn strip_empty_lines(text: &str) -> String {
    let mut stripped = String::new();
    for line in text.lines() {
        if line.trim().is_empty() {
            continue;
        }
        if !stripped.is_empty() {
            stripped.push('\n');
        }
        stripped.push_str(line);
    }
    stripped
}