use super::*;
use crate::{
    ReadFileToolInput,
    edit_file_tool::{EditFileMode, EditFileToolInput},
    grep_tool::GrepToolInput,
    list_directory_tool::ListDirectoryToolInput,
};
use Role::*;
use assistant_tool::ToolRegistry;
use client::{Client, UserStore};
use collections::HashMap;
use fs::FakeFs;
use futures::{FutureExt, future::LocalBoxFuture};
use gpui::{AppContext, TestAppContext, Timer};
use http_client::StatusCode;
use indoc::{formatdoc, indoc};
use language_model::{
    LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
    LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
};
use project::Project;
use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
use rand::prelude::*;
use reqwest_client::ReqwestClient;
use serde_json::json;
use std::{
    cmp::Reverse,
    fmt::{self, Display},
    io::Write as _,
    path::Path,
    str::FromStr,
    sync::mpsc,
    time::Duration,
};
use util::path;

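// These evals hit real model providers, so they are compiled in but ignored
// unless the crate is built with the `eval` feature (note the
// `#[cfg_attr(not(feature = "eval"), ignore)]` on every test). The agent and
// judge models default to `anthropic/claude-3-7-sonnet-latest` and can be
// overridden via the `ZED_AGENT_MODEL` and `ZED_JUDGE_MODEL` environment
// variables (see `EditAgentTest::new`).
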
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well the agent generates multiple edit hunks.
    //
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 0.99 (2025-06-14)
    // claude-sonnet-4             | 0.97 (2025-06-14)
    // gemini-2.5-pro-06-05        | 0.98 (2025-06-16)
    // gemini-2.5-flash            | 0.11 (2025-05-22)
    // gpt-4.1                     | 1.00 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 1.0 (2025-06-14)
    // claude-sonnet-4             | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05        | 1.0 (2025-06-16)
    // gemini-2.5-flash            |
    // gpt-4.1                     |
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to Italian.
201 "})],
202 ),
203 message(
204 Assistant,
205 [tool_use(
206 "tool_1",
207 "read_file",
208 ReadFileToolInput {
209 path: input_file_path.into(),
210 start_line: None,
211 end_line: None,
212 },
213 )],
214 ),
215 message(
216 User,
217 [tool_result("tool_1", "read_file", input_file_content)],
218 ),
219 message(
220 Assistant,
221 [tool_use(
222 "tool_2",
223 "edit_file",
224 EditFileToolInput {
225 display_description: edit_description.into(),
226 path: input_file_path.into(),
227 mode: EditFileMode::Edit,
228 },
229 )],
230 ),
231 ],
232 Some(input_file_content.into()),
233 EvalAssertion::judge_diff("Doc comments were translated to Italian"),
234 ),
235 );
236}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.96 (2025-06-14)
    // claude-sonnet-4                | 0.11 (2025-06-14)
    // gemini-2.5-pro-preview-latest  | 0.99 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.99 (2025-06-14)
    // claude-sonnet-4                | 0.85 (2025-06-14)
    // gemini-2.5-pro-preview-latest  | 0.97 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comment out everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    // Model                        | Date       | Pass rate
    // =========================================================
    // claude-4.0-sonnet            | 2025-06-14 | 0.99
    // claude-3.7-sonnet            | 2025-06-14 | 0.88
    // gemini-2.5-pro-preview-06-05 | 2025-06-16 | 0.98
    // gpt-4.1                      |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing for now, let's see if we can bring this down.
        0.25,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_weight() {
                                // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_stretch() {
                                // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                            }

                            #[test]
                            fn test_css_to_core_text_font_stretch() {
                                // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 | 1.0 (2025-05-22)
    // gpt-4.1                        | 1.0 (2025-05-22)
    let input_file_path = "root/zode.py";
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        50,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.65 (2025-06-14)
    // claude-sonnet-4                | 0.07 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 0.35 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                              tracked_buffers [L15]
                              edited_since_project_diagnostics_check [L17]
                              project [L19]
                            impl ActionLog [L22-498]
                              pub fn new [L24-30]
                              pub fn project [L32-34]
                              pub fn checked_project_diagnostics [L37-39]
                              pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                              fn track_buffer_internal [L46-101]
                              fn handle_buffer_event [L103-116]
                              fn handle_buffer_edited [L118-123]
                              fn handle_buffer_file_changed [L125-158]
                              async fn maintain_diff [L160-264]
                              pub fn buffer_read [L267-269]
                              pub fn buffer_created [L272-276]
                              pub fn buffer_edited [L279-287]
                              pub fn will_delete_buffer [L289-304]
                              pub fn keep_edits_in_range [L306-364]
                              pub fn reject_edits_in_ranges [L366-459]
                              pub fn keep_all_edits [L461-473]
                              pub fn changed_buffers [L476-482]
                              pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                              User [L618]
                              Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                              Created [L624]
                              Modified [L625]
                              Deleted [L626]
                            struct TrackedBuffer [L629-641]
                              buffer [L630]
                              base_text [L631]
                              unreviewed_changes [L632]
                              status [L633]
                              version [L634]
                              diff [L635]
                              snapshot [L636]
                              diff_update [L637]
                              _open_lsp_handle [L638]
                              _maintain_diff [L639]
                              _subscription [L640]
                            impl TrackedBuffer [L643-657]
                              fn has_changes [L644-650]
                              fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                              pub diff [L660]
                            mod tests [L664-1574]
                              fn init_logger [L678-682]
                              fn init_test [L684-691]
                              async fn test_keep_edits [L694-769]
                              async fn test_deletions [L772-854]
                              async fn test_overlapping_user_edits [L857-951]
                              async fn test_creating_files [L954-1010]
                              async fn test_deleting_files [L1013-1120]
                              async fn test_reject_edits [L1123-1255]
                              async fn test_reject_multiple_edits [L1258-1331]
                              async fn test_reject_deleted_file [L1334-1388]
                              async fn test_reject_created_file [L1391-1443]
                              async fn test_random_diffs [L1446-1535]
                                fn quiesce [L1510-1534]
                              struct HunkStatus [L1538-1542]
                                range [L1539]
                                diff_status [L1540]
                                old_text [L1541]
                              fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that the Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.00 (2025-06-14)
    // claude-sonnet-4                | 1.00 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.00 (2025-05-21)
    // gemini-2.5-flash-preview-04-17 | 1.00 (2025-05-21)
    // gpt-4.1                        | 1.00 (2025-05-21)
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        100,
        0.99,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}

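/// Builds a `LanguageModelRequestMessage` with the given role and contents.
/// Caching is off by default; `eval` turns it on for the final message.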
fn message(
    role: Role,
    contents: impl IntoIterator<Item = MessageContent>,
) -> LanguageModelRequestMessage {
    LanguageModelRequestMessage {
        role,
        content: contents.into_iter().collect(),
        cache: false,
    }
}

fn text(text: impl Into<String>) -> MessageContent {
    MessageContent::Text(text.into())
}

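/// Returns the half-open, zero-based `range` of lines from `input`, joined
/// with newlines. Used to simulate tool results that cover only part of a file.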
fn lines(input: &str, range: Range<usize>) -> String {
    input
        .lines()
        .skip(range.start)
        .take(range.len())
        .collect::<Vec<_>>()
        .join("\n")
}

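/// Builds a `MessageContent::ToolUse` from a serializable tool input, storing
/// both the pretty-printed raw JSON and the parsed value.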
fn tool_use(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    input: impl Serialize,
) -> MessageContent {
    MessageContent::ToolUse(LanguageModelToolUse {
        id: LanguageModelToolUseId::from(id.into()),
        name: name.into(),
        raw_input: serde_json::to_string_pretty(&input).unwrap(),
        input: serde_json::to_value(input).unwrap(),
        is_input_complete: true,
    })
}

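/// Builds a successful `MessageContent::ToolResult` with the given text content.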
fn tool_result(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    result: impl Into<Arc<str>>,
) -> MessageContent {
    MessageContent::ToolResult(LanguageModelToolResult {
        tool_use_id: LanguageModelToolUseId::from(id.into()),
        tool_name: name.into(),
        is_error: false,
        content: LanguageModelToolResultContent::Text(result.into()),
        output: None,
    })
}

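/// A scripted conversation that ends in an `edit_file` tool use, plus the
/// initial buffer contents and the assertion to run against the edited buffer.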
#[derive(Clone)]
struct EvalInput {
    conversation: Vec<LanguageModelRequestMessage>,
    edit_file_input: EditFileToolInput,
    input_content: Option<String>,
    assertion: EvalAssertion,
}

impl EvalInput {
    fn from_conversation(
        conversation: Vec<LanguageModelRequestMessage>,
        input_content: Option<String>,
        assertion: EvalAssertion,
    ) -> Self {
        let msg = conversation.last().expect("Conversation must not be empty");
        if msg.role != Role::Assistant {
            panic!("Conversation must end with an assistant message");
        }
        let tool_use = msg
            .content
            .iter()
            .flat_map(|content| match content {
                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
                    Some(tool_use)
                }
                _ => None,
            })
            .next()
            .expect("Conversation must end with an edit_file tool use")
            .clone();

        let edit_file_input: EditFileToolInput =
            serde_json::from_value(tool_use.input.clone()).unwrap();

        EvalInput {
            conversation,
            edit_file_input,
            input_content,
            assertion,
        }
    }
}

#[derive(Clone)]
struct EvalSample {
    text_before: String,
    text_after: String,
    edit_output: EditAgentOutput,
    diff: String,
}

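/// Object-safe wrapper around an async assertion function, so that closures
/// can be stored in `EvalAssertion` behind an `Arc<dyn AssertionFn>`.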
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}

impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}

#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);

impl EvalAssertion {
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

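    /// Asks the judge model to score the sample's diff against the given
    /// assertions, expecting a `<score>0-100</score>` tag in the response.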
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                ..Default::default()
            };
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output) {
                if let Some(score_match) = captures.get(1) {
                    let score = score_match.as_str().parse().unwrap_or(0);
                    return Ok(EvalAssertionOutcome {
                        score,
                        message: Some(output),
                    });
                }
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}

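/// Runs `eval` `iterations` times (up to 32 samples concurrently) and panics
/// if the pass ratio falls below `expected_pass_ratio`, or if the ratio of
/// mismatched edit-parser tags exceeds `mismatched_tag_threshold`. A sample
/// passes when its assertion scores at least 80.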
fn eval(
    iterations: usize,
    expected_pass_ratio: f32,
    mismatched_tag_threshold: f32,
    mut eval: EvalInput,
) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval so that
    // all the next ones are cached.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    let executor = gpui::background_executor();
    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        let semaphore = semaphore.clone();
        executor
            .spawn(async move {
                let _guard = semaphore.acquire().await;
                run_eval(eval, tx)
            })
            .detach();
    }
    drop(tx);

    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text_after.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > mismatched_tag_threshold {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}

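/// Runs a single eval sample on a fresh, randomly-seeded `TestAppContext` and
/// sends the result over `tx`.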
fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
    let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
    let mut cx = TestAppContext::build(dispatcher, None);
    let output = cx.executor().block_test(async {
        let test = EditAgentTest::new(&mut cx).await;
        test.eval(eval, &mut cx).await
    });
    tx.send(output).unwrap();
}

#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}

impl Display for EvalOutput {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Score: {:?}", self.assertion.score)?;
        if let Some(message) = self.assertion.message.as_ref() {
            writeln!(f, "Message: {}", message)?;
        }

        writeln!(f, "Diff:\n{}", self.sample.diff)?;

        writeln!(
            f,
            "Parser Metrics:\n{:#?}",
            self.sample.edit_output.parser_metrics
        )?;
        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
        Ok(())
    }
}

fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_count = evaluated_count - failed_count;
    let passed_ratio = if evaluated_count == 0 {
        0.0
    } else {
        passed_count as f64 / evaluated_count as f64
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}

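/// Test harness that wires an `EditAgent` to a fake project and real model
/// providers (selected via `ZED_AGENT_MODEL` and `ZED_JUDGE_MODEL`).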
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}

impl EditAgentTest {
    async fn new(cx: &mut TestAppContext) -> Self {
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor().clone());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);

            client::init_settings(cx);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

            Project::init_settings(cx);
            language::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store.clone(), client.clone(), cx);
            crate::init(client.http_client(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();

        Self {
            agent: EditAgent::new(
                agent_model,
                project.clone(),
                action_log,
                Templates::new(),
                edit_format,
            ),
            project,
            judge_model,
        }
    }

    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        let (provider, model) = cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .expect("Model not found");
            let provider = models.provider(&model.provider_id()).unwrap();
            (provider, model)
        })?;
        cx.update(|cx| provider.authenticate(cx))?.await?;
        Ok(model)
    }

    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();
        let tools = cx.update(|cx| {
            ToolRegistry::default_global(cx)
                .tools()
                .into_iter()
                .filter_map(|tool| {
                    let input_schema = tool
                        .input_schema(self.agent.model.tool_input_format())
                        .ok()?;
                    Some(LanguageModelRequestTool {
                        name: tool.name(),
                        description: tool.description(),
                        input_schema,
                    })
                })
                .collect::<Vec<_>>()
        });
        let tool_names = tools
            .iter()
            .map(|tool| tool.name.clone())
            .collect::<Vec<_>>();
        let worktrees = vec![WorktreeContext {
            root_name: "root".to_string(),
            abs_path: Path::new("/path/to/root").into(),
            rules_file: None,
        }];
        let prompt_builder = PromptBuilder::new(None)?;
        let project_context = ProjectContext::new(worktrees, Vec::default());
        let system_prompt = prompt_builder.generate_assistant_system_prompt(
            &project_context,
            &ModelContext {
                available_tools: tool_names,
            },
        )?;

        let has_system_prompt = eval
            .conversation
            .first()
            .map_or(false, |msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let conversation = LanguageModelRequest {
            messages,
            tools,
            ..Default::default()
        };

        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            retry_on_rate_limit(async || {
                self.agent
                    .edit(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        } else {
            retry_on_rate_limit(async || {
                self.agent
                    .overwrite(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        };

        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
}

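/// Repeatedly invokes `request`, retrying (with jitter) on rate limits, server
/// overload, and retryable upstream provider errors; all other errors bubble up.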
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
    let mut attempt = 0;
    loop {
        attempt += 1;
        match request().await {
            Ok(result) => return Ok(result),
            Err(err) => match err.downcast::<LanguageModelCompletionError>() {
                Ok(err) => match &err {
                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
                        let retry_after = retry_after.unwrap_or(Duration::from_secs(5));
                        // Wait for the duration supplied, with some jitter to avoid all requests being made at the same time.
                        let jitter = retry_after.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
                        eprintln!(
                            "Attempt #{attempt}: {err}. Retry after {retry_after:?} + jitter of {jitter:?}"
                        );
                        Timer::after(retry_after + jitter).await;
                        continue;
                    }
                    LanguageModelCompletionError::UpstreamProviderError {
                        status,
                        retry_after,
                        ..
                    } => {
                        // Only retry for specific status codes
                        let should_retry = matches!(
                            *status,
                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
                        ) || status.as_u16() == 529;

                        if !should_retry {
                            return Err(err.into());
                        }

                        // Use server-provided retry_after if available, otherwise use default
                        let retry_after = retry_after.unwrap_or(Duration::from_secs(5));
                        let jitter = retry_after.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
                        eprintln!(
                            "Attempt #{attempt}: {err}. Retry after {retry_after:?} + jitter of {jitter:?}"
                        );
                        Timer::after(retry_after + jitter).await;
                        continue;
                    }
                    _ => return Err(err.into()),
                },
                Err(err) => return Err(err),
            },
        }
    }
}

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}

#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}

impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}

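/// Removes all empty (or whitespace-only) lines so that comparisons ignore
/// differences in blank-line placement.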
fn strip_empty_lines(text: &str) -> String {
    text.lines()
        .filter(|line| !line.trim().is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}