1use super::*;
2use crate::{
3 EditFileMode, EditFileToolInput, GrepToolInput, ListDirectoryToolInput, ReadFileToolInput,
4};
5use Role::*;
6use client::{Client, UserStore};
7use collections::HashMap;
8use fs::FakeFs;
9use futures::{FutureExt, future::LocalBoxFuture};
10use gpui::{AppContext, TestAppContext, Timer};
11use http_client::StatusCode;
12use indoc::{formatdoc, indoc};
13use language_model::{
14 LanguageModelRegistry, LanguageModelToolResult, LanguageModelToolResultContent,
15 LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
16};
17use project::Project;
18use prompt_store::{ProjectContext, WorktreeContext};
19use rand::prelude::*;
20use reqwest_client::ReqwestClient;
21use serde_json::json;
22use std::{
23 cmp::Reverse,
24 fmt::{self, Display},
25 io::Write as _,
26 path::Path,
27 str::FromStr,
28 sync::mpsc,
29 time::Duration,
30};
31use util::path;
32
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // Model | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet | 0.99 (2025-06-14)
    // claude-sonnet-4 | 0.97 (2025-06-14)
    // gemini-2.5-pro-06-05 | 0.98 (2025-06-16)
    // gemini-2.5-flash | 0.11 (2025-05-22)
    // gpt-4.1 | 1.00 (2025-05-22)
    // claude-sonnet-4.5 | 0.79 (2025-11-10)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // Any one of these diffs counts as a passing edit — there are several
    // acceptable ways for the model to perform the extraction.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    // NOTE(review): eval(100, 0.95, 0.05, ...) — presumably sample count,
    // required pass rate, and a tolerance; confirm against `eval`'s signature
    // (defined elsewhere in this file).
    eval(
        100,
        0.95,
        0.05,
        // Scripted conversation: user asks for the refactor, assistant reads
        // the file, then issues the `edit_file` tool call that gets scored.
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
110
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Model | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05 | 1.0 (2025-06-16)
    // gemini-2.5-flash |
    // gpt-4.1 |

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    // Unlike the extraction eval above, deletion has exactly one correct
    // result, so this asserts equality with a single `after.rs` fixture.
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}
173
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_translate_doc_comments() {
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    // There is no single correct output for a translation, so the result is
    // scored by an LLM judge over the produced diff rather than by equality.
    eval(
        200,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}
236
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.96 (2025-06-14)
    // claude-sonnet-4 | 0.11 (2025-06-14)
    // gemini-2.5-pro-preview-latest | 0.99 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |
    // claude-sonnet-4.5 | 0.25 (2025-11-10)

    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    // NOTE(review): the x86_64-linux and arm64-linux assets are listed twice
    // in the prompt below. This looks unintentional, but the string is part of
    // the recorded conversation and must not be changed without re-running the
    // pass-rate measurements above.
    eval(
        100,
        0.95,
        0.05,
        // The assistant reads the relevant region of the file in three
        // chunked `read_file` calls before issuing the scored edit; `lines`
        // slices the fixture to simulate each chunked tool result.
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}
363
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.59 (2025-07-14)
    // claude-sonnet-4 | 0.81 (2025-07-14)
    // gemini-2.5-pro | 0.95 (2025-07-14)
    // gemini-2.5-flash-preview-04-17 | 0.78 (2025-07-14)
    // gpt-4.1 | 0.00 (2025-07-14) (follows edit_description too literally)
    // claude-sonnet-4.5 | 0.20 (2025-11-10)

    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    let possible_diffs = vec![
        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
    ];
    eval(
        100,
        0.51,
        0.05,
        EvalInput::from_conversation(
            vec![
                // NOTE(review): the user prompts below contain typos ("how to
                // cursor blinking works", "but comments everything") — they
                // are part of the recorded conversation; do not "fix" them
                // without re-measuring the pass rates above.
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        // Simulated grep output: several disjoint regions of
                        // the fixture, joined by the separator `grep` emits
                        // between matches.
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
450
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    // Model | Date | Pass rate
    // =========================================================
    // claude-4.0-sonnet | 2025-06-14 | 0.99
    // claude-3.7-sonnet | 2025-06-14 | 0.88
    // gemini-2.5-pro-preview-06-05 | 2025-06-16 | 0.98
    // gpt-4.1 |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing for now, let's see if we can bring this down.
        0.25,
        // Scripted conversation: the assistant reads the file, probes for an
        // existing test module with three grep calls (the first two find
        // nothing), then issues the scored `edit_file` call.
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}
660
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_zode() {
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 | 1.0 (2025-05-22)
    // gpt-4.1 | 1.0 (2025-05-22)

    let input_file_path = "root/zode.py";
    // The file is created from scratch (`EditFileMode::Create`), so there is
    // no pre-existing content.
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        50,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            // Custom assertion: the created file must not begin with
            // whitespace, a code fence, or a blank line — symptoms of the
            // model leaking its "thoughts" or markdown into the file.
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                // An empty message means no invalid start was found: pass.
                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}
767
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_add_overwrite_test() {
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.65 (2025-06-14)
    // claude-sonnet-4 | 0.07 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 0.35 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |
    // claude-sonnet-4.5 | 0.34 (2025-11-10)

    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        0.05,
        // Scripted conversation: the first `read_file` result is an outline
        // (symbols + line ranges) rather than file text; the assistant then
        // reads three specific ranges before issuing the scored edit.
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                              tracked_buffers [L15]
                              edited_since_project_diagnostics_check [L17]
                              project [L19]
                            impl ActionLog [L22-498]
                              pub fn new [L24-30]
                              pub fn project [L32-34]
                              pub fn checked_project_diagnostics [L37-39]
                              pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                              fn track_buffer_internal [L46-101]
                              fn handle_buffer_event [L103-116]
                              fn handle_buffer_edited [L118-123]
                              fn handle_buffer_file_changed [L125-158]
                              async fn maintain_diff [L160-264]
                              pub fn buffer_read [L267-269]
                              pub fn buffer_created [L272-276]
                              pub fn buffer_edited [L279-287]
                              pub fn will_delete_buffer [L289-304]
                              pub fn keep_edits_in_range [L306-364]
                              pub fn reject_edits_in_ranges [L366-459]
                              pub fn keep_all_edits [L461-473]
                              pub fn changed_buffers [L476-482]
                              pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                              User [L618]
                              Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                              Created [L624]
                              Modified [L625]
                              Deleted [L626]
                            struct TrackedBuffer [L629-641]
                              buffer [L630]
                              base_text [L631]
                              unreviewed_changes [L632]
                              status [L633]
                              version [L634]
                              diff [L635]
                              snapshot [L636]
                              diff_update [L637]
                              _open_lsp_handle [L638]
                              _maintain_diff [L639]
                              _subscription [L640]
                            impl TrackedBuffer [L643-657]
                              fn has_changes [L644-650]
                              fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                              pub diff [L660]
                            mod tests [L664-1574]
                              fn init_logger [L678-682]
                              fn init_test [L684-691]
                              async fn test_keep_edits [L694-769]
                              async fn test_deletions [L772-854]
                              async fn test_overlapping_user_edits [L857-951]
                              async fn test_creating_files [L954-1010]
                              async fn test_deleting_files [L1013-1120]
                              async fn test_reject_edits [L1123-1255]
                              async fn test_reject_multiple_edits [L1258-1331]
                              async fn test_reject_deleted_file [L1334-1388]
                              async fn test_reject_created_file [L1391-1443]
                              async fn test_random_diffs [L1446-1535]
                                fn quiesce [L1510-1534]
                            struct HunkStatus [L1538-1542]
                              range [L1539]
                              diff_status [L1540]
                              old_text [L1541]
                            fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
1000
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.00 (2025-06-14)
    // claude-sonnet-4 | 1.00 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.00 (2025-05-21)
    // gemini-2.5-flash-preview-04-17 | 1.00 (2025-05-21)
    // gpt-4.1 | 1.00 (2025-05-21)
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"

    // No pre-existing file (Create mode); the expected result is a file with
    // no content at all.
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        100,
        0.99,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
1079
1080fn message(
1081 role: Role,
1082 contents: impl IntoIterator<Item = MessageContent>,
1083) -> LanguageModelRequestMessage {
1084 LanguageModelRequestMessage {
1085 role,
1086 content: contents.into_iter().collect(),
1087 cache: false,
1088 }
1089}
1090
1091fn text(text: impl Into<String>) -> MessageContent {
1092 MessageContent::Text(text.into())
1093}
1094
/// Returns the lines of `input` whose **zero-based** indices fall inside the
/// half-open `range`, joined with `\n` (no trailing newline).
///
/// Indices past the end of the input simply yield fewer (or no) lines.
fn lines(input: &str, range: Range<usize>) -> String {
    let selected: Vec<&str> = input
        .lines()
        .enumerate()
        .filter(|(ix, _)| range.contains(ix))
        .map(|(_, line)| line)
        .collect();
    selected.join("\n")
}
1103
1104fn tool_use(
1105 id: impl Into<Arc<str>>,
1106 name: impl Into<Arc<str>>,
1107 input: impl Serialize,
1108) -> MessageContent {
1109 MessageContent::ToolUse(LanguageModelToolUse {
1110 id: LanguageModelToolUseId::from(id.into()),
1111 name: name.into(),
1112 raw_input: serde_json::to_string_pretty(&input).unwrap(),
1113 input: serde_json::to_value(input).unwrap(),
1114 is_input_complete: true,
1115 })
1116}
1117
1118fn tool_result(
1119 id: impl Into<Arc<str>>,
1120 name: impl Into<Arc<str>>,
1121 result: impl Into<Arc<str>>,
1122) -> MessageContent {
1123 MessageContent::ToolResult(LanguageModelToolResult {
1124 tool_use_id: LanguageModelToolUseId::from(id.into()),
1125 tool_name: name.into(),
1126 is_error: false,
1127 content: LanguageModelToolResultContent::Text(result.into()),
1128 output: None,
1129 })
1130}
1131
/// One eval case: a scripted conversation ending in an `edit_file` call, the
/// file content the edit applies to, and the assertion that scores the result.
#[derive(Clone)]
struct EvalInput {
    // Full scripted conversation, ending with an assistant message.
    conversation: Vec<LanguageModelRequestMessage>,
    // Input of the final `edit_file` tool use, extracted from `conversation`.
    edit_file_input: EditFileToolInput,
    // Content of the file before the edit; `None` when creating a new file.
    input_content: Option<String>,
    // Scores the edited output.
    assertion: EvalAssertion,
}
1139
1140impl EvalInput {
1141 fn from_conversation(
1142 conversation: Vec<LanguageModelRequestMessage>,
1143 input_content: Option<String>,
1144 assertion: EvalAssertion,
1145 ) -> Self {
1146 let msg = conversation.last().expect("Conversation must not be empty");
1147 if msg.role != Role::Assistant {
1148 panic!("Conversation must end with an assistant message");
1149 }
1150 let tool_use = msg
1151 .content
1152 .iter()
1153 .flat_map(|content| match content {
1154 MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1155 Some(tool_use)
1156 }
1157 _ => None,
1158 })
1159 .next()
1160 .expect("Conversation must end with an edit_file tool use")
1161 .clone();
1162
1163 let edit_file_input: EditFileToolInput = serde_json::from_value(tool_use.input).unwrap();
1164
1165 EvalInput {
1166 conversation,
1167 edit_file_input,
1168 input_content,
1169 assertion,
1170 }
1171 }
1172}
1173
/// The outcome of running one eval sample: the file before and after the
/// agent's edit, plus the raw agent output and a unified diff of the change.
#[derive(Clone)]
struct EvalSample {
    // File content before the edit was applied.
    text_before: String,
    // File content after the edit was applied.
    text_after: String,
    // Raw output produced by the edit agent.
    edit_output: EditAgentOutput,
    // Diff between `text_before` and `text_after`, as shown to judges.
    diff: String,
}
1181
/// Object-safe assertion interface: scores an [`EvalSample`], optionally
/// consulting a judge model, and returns a boxed (non-`Send`) future so it can
/// be stored behind `Arc<dyn AssertionFn>` in [`EvalAssertion`].
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1190
/// Blanket impl: any matching async closure can be used directly as an
/// assertion, which is what `EvalAssertion::new` relies on.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        // Invoke the closure and box the resulting future on the local task.
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1211
/// Cheaply-clonable handle to an assertion function; constructed via the
/// helpers on `impl EvalAssertion` (e.g. `assert_eq`, `judge_diff`).
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1214
impl EvalAssertion {
    /// Wraps an async closure as a reusable, shareable assertion.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Scores 100 when the post-edit buffer equals `expected` after
    /// stripping blank/whitespace-only lines; otherwise scores 0.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Scores 100 when applying any one of `expected_diffs` to the pre-edit
    /// text reproduces the post-edit text (blank lines ignored).
    /// Panics if a candidate diff fails to apply to `text_before`.
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// Asks the judge model to grade the sample's diff against the given
    /// natural-language assertions. The judge's reply must contain a
    /// `<score>N</score>` tag; otherwise this errors with the raw output.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                thinking_allowed: true,
                ..Default::default()
            };
            // Retry the judge call itself on rate limits, then collect the
            // streamed response into a single string.
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output)
                && let Some(score_match) = captures.get(1)
            {
                // `\d+` always parses unless it overflows usize, in which
                // case the score falls back to 0.
                let score = score_match.as_str().parse().unwrap_or(0);
                return Ok(EvalAssertionOutcome {
                    score,
                    message: Some(output),
                });
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Evaluates this assertion against `input`, returning its outcome.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1315
/// Runs `eval` `iterations` times — once synchronously up front to warm the
/// prompt cache, then the rest concurrently (at most 32 in flight) — and
/// panics when the pass ratio falls below `expected_pass_ratio`, or when the
/// edit parser's mismatched-tag ratio exceeds `mismatched_tag_threshold`.
fn eval(
    iterations: usize,
    expected_pass_ratio: f32,
    mismatched_tag_threshold: f32,
    mut eval: EvalInput,
) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval so that
    // all the next ones are cached.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    let executor = gpui::background_executor();
    // Bound concurrency so we don't hammer the model provider.
    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        let semaphore = semaphore.clone();
        executor
            .spawn(async move {
                let _guard = semaphore.acquire().await;
                run_eval(eval, tx)
            })
            .detach();
    }
    // Drop our sender so `rx.recv()` below disconnects (and the loop ends)
    // once every spawned worker has finished and dropped its clone.
    drop(tx);

    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                // Any score below 80 counts as a failure.
                if output.assertion.score < 80 {
                    failed_count += 1;
                    // Group failures by resulting buffer text so repeated
                    // identical failures are reported once with a count.
                    failed_evals
                        .entry(output.sample.text_after.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    // `failed_count` includes both low scores and hard errors.
    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        // Print the most frequent errors and failures first.
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    // NOTE(review): when no tags were parsed at all this ratio is NaN, and
    // NaN comparisons are false, so the check below is skipped — presumably
    // intended; confirm.
    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > mismatched_tag_threshold {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}
1407
1408fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
1409 let dispatcher = gpui::TestDispatcher::new(StdRng::from_os_rng());
1410 let mut cx = TestAppContext::build(dispatcher, None);
1411 let output = cx.executor().block_test(async {
1412 let test = EditAgentTest::new(&mut cx).await;
1413 test.eval(eval, &mut cx).await
1414 });
1415 tx.send(output).unwrap();
1416}
1417
/// The outcome of one eval iteration: the sample plus its assertion result.
#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}
1423
1424impl Display for EvalOutput {
1425 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1426 writeln!(f, "Score: {:?}", self.assertion.score)?;
1427 if let Some(message) = self.assertion.message.as_ref() {
1428 writeln!(f, "Message: {}", message)?;
1429 }
1430
1431 writeln!(f, "Diff:\n{}", self.sample.diff)?;
1432
1433 writeln!(
1434 f,
1435 "Parser Metrics:\n{:#?}",
1436 self.sample.edit_output.parser_metrics
1437 )?;
1438 writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1439 Ok(())
1440 }
1441}
1442
/// Overwrites the current terminal line with a running pass-rate summary
/// (`\r` + clear-to-end-of-line escape) and flushes stdout so it shows
/// immediately.
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed = evaluated_count - failed_count;
    // Guard the 0/0 case before any evals have completed.
    let percent = match evaluated_count {
        0 => 0.0,
        total => passed as f64 / total as f64 * 100.0,
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count, iterations, percent
    );
    std::io::stdout().flush().unwrap();
}
1458
/// Harness wiring an [`EditAgent`] to a test project and a judge model.
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}
1464
1465impl EditAgentTest {
1466 async fn new(cx: &mut TestAppContext) -> Self {
1467 cx.executor().allow_parking();
1468
1469 let fs = FakeFs::new(cx.executor());
1470 cx.update(|cx| {
1471 settings::init(cx);
1472 gpui_tokio::init(cx);
1473 let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1474 cx.set_http_client(http_client);
1475 let client = Client::production(cx);
1476 let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1477 settings::init(cx);
1478 language_model::init(client.clone(), cx);
1479 language_models::init(user_store, client.clone(), cx);
1480 });
1481
1482 fs.insert_tree("/root", json!({})).await;
1483 let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1484 let agent_model = SelectedModel::from_str(
1485 &std::env::var("ZED_AGENT_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
1486 )
1487 .unwrap();
1488 let judge_model = SelectedModel::from_str(
1489 &std::env::var("ZED_JUDGE_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
1490 )
1491 .unwrap();
1492
1493 let authenticate_provider_tasks = cx.update(|cx| {
1494 LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
1495 registry
1496 .providers()
1497 .iter()
1498 .map(|p| p.authenticate(cx))
1499 .collect::<Vec<_>>()
1500 })
1501 });
1502 let (agent_model, judge_model) = cx
1503 .update(|cx| {
1504 cx.spawn(async move |cx| {
1505 futures::future::join_all(authenticate_provider_tasks).await;
1506 let agent_model = Self::load_model(&agent_model, cx).await;
1507 let judge_model = Self::load_model(&judge_model, cx).await;
1508 (agent_model.unwrap(), judge_model.unwrap())
1509 })
1510 })
1511 .await;
1512 let action_log = cx.new(|_| ActionLog::new(project.clone()));
1513
1514 let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();
1515
1516 Self {
1517 agent: EditAgent::new(
1518 agent_model,
1519 project.clone(),
1520 action_log,
1521 Templates::new(),
1522 edit_format,
1523 ),
1524 project,
1525 judge_model,
1526 }
1527 }
1528
1529 async fn load_model(
1530 selected_model: &SelectedModel,
1531 cx: &mut AsyncApp,
1532 ) -> Result<Arc<dyn LanguageModel>> {
1533 cx.update(|cx| {
1534 let registry = LanguageModelRegistry::read_global(cx);
1535 let provider = registry
1536 .provider(&selected_model.provider)
1537 .expect("Provider not found");
1538 provider.authenticate(cx)
1539 })?
1540 .await?;
1541 cx.update(|cx| {
1542 let models = LanguageModelRegistry::read_global(cx);
1543 let model = models
1544 .available_models(cx)
1545 .find(|model| {
1546 model.provider_id() == selected_model.provider
1547 && model.id() == selected_model.model
1548 })
1549 .unwrap_or_else(|| panic!("Model {} not found", selected_model.model.0));
1550 model
1551 })
1552 }
1553
1554 async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
1555 let path = self
1556 .project
1557 .read_with(cx, |project, cx| {
1558 project.find_project_path(eval.edit_file_input.path, cx)
1559 })
1560 .unwrap();
1561 let buffer = self
1562 .project
1563 .update(cx, |project, cx| project.open_buffer(path, cx))
1564 .await
1565 .unwrap();
1566
1567 let tools = crate::built_in_tools().collect::<Vec<_>>();
1568
1569 let system_prompt = {
1570 let worktrees = vec![WorktreeContext {
1571 root_name: "root".to_string(),
1572 abs_path: Path::new("/path/to/root").into(),
1573 rules_file: None,
1574 }];
1575 let project_context = ProjectContext::new(worktrees, Vec::default());
1576 let tool_names = tools
1577 .iter()
1578 .map(|tool| tool.name.clone().into())
1579 .collect::<Vec<_>>();
1580 let template = crate::SystemPromptTemplate {
1581 project: &project_context,
1582 available_tools: tool_names,
1583 model_name: None,
1584 };
1585 let templates = Templates::new();
1586 template.render(&templates).unwrap()
1587 };
1588
1589 let has_system_prompt = eval
1590 .conversation
1591 .first()
1592 .is_some_and(|msg| msg.role == Role::System);
1593 let messages = if has_system_prompt {
1594 eval.conversation
1595 } else {
1596 [LanguageModelRequestMessage {
1597 role: Role::System,
1598 content: vec![MessageContent::Text(system_prompt)],
1599 cache: true,
1600 }]
1601 .into_iter()
1602 .chain(eval.conversation)
1603 .collect::<Vec<_>>()
1604 };
1605
1606 let conversation = LanguageModelRequest {
1607 messages,
1608 tools,
1609 thinking_allowed: true,
1610 ..Default::default()
1611 };
1612
1613 let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1614 if let Some(input_content) = eval.input_content.as_deref() {
1615 buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1616 }
1617 retry_on_rate_limit(async || {
1618 self.agent
1619 .edit(
1620 buffer.clone(),
1621 eval.edit_file_input.display_description.clone(),
1622 &conversation,
1623 &mut cx.to_async(),
1624 )
1625 .0
1626 .await
1627 })
1628 .await?
1629 } else {
1630 retry_on_rate_limit(async || {
1631 self.agent
1632 .overwrite(
1633 buffer.clone(),
1634 eval.edit_file_input.display_description.clone(),
1635 &conversation,
1636 &mut cx.to_async(),
1637 )
1638 .0
1639 .await
1640 })
1641 .await?
1642 };
1643
1644 let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1645 let sample = EvalSample {
1646 edit_output,
1647 diff: language::unified_diff(
1648 eval.input_content.as_deref().unwrap_or_default(),
1649 &buffer_text,
1650 ),
1651 text_before: eval.input_content.unwrap_or_default(),
1652 text_after: buffer_text,
1653 };
1654 let assertion = eval
1655 .assertion
1656 .run(&sample, self.judge_model.clone(), cx)
1657 .await?;
1658
1659 Ok(EvalOutput { assertion, sample })
1660 }
1661}
1662
1663async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
1664 const MAX_RETRIES: usize = 20;
1665 let mut attempt = 0;
1666
1667 loop {
1668 attempt += 1;
1669 let response = request().await;
1670
1671 if attempt >= MAX_RETRIES {
1672 return response;
1673 }
1674
1675 let retry_delay = match &response {
1676 Ok(_) => None,
1677 Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
1678 Some(err) => match &err {
1679 LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
1680 | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
1681 Some(retry_after.unwrap_or(Duration::from_secs(5)))
1682 }
1683 LanguageModelCompletionError::UpstreamProviderError {
1684 status,
1685 retry_after,
1686 ..
1687 } => {
1688 // Only retry for specific status codes
1689 let should_retry = matches!(
1690 *status,
1691 StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
1692 ) || status.as_u16() == 529;
1693
1694 if should_retry {
1695 // Use server-provided retry_after if available, otherwise use default
1696 Some(retry_after.unwrap_or(Duration::from_secs(5)))
1697 } else {
1698 None
1699 }
1700 }
1701 LanguageModelCompletionError::ApiReadResponseError { .. }
1702 | LanguageModelCompletionError::ApiInternalServerError { .. }
1703 | LanguageModelCompletionError::HttpSend { .. } => {
1704 // Exponential backoff for transient I/O and internal server errors
1705 Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
1706 }
1707 _ => None,
1708 },
1709 _ => None,
1710 },
1711 };
1712
1713 if let Some(retry_after) = retry_delay {
1714 let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
1715 eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
1716 Timer::after(retry_after + jitter).await;
1717 } else {
1718 return response;
1719 }
1720 }
1721}
1722
/// Result of running an [`EvalAssertion`].
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    // 0-100; `eval` treats anything below 80 as a failure.
    score: usize,
    // Raw judge output when a judge model produced the score.
    message: Option<String>,
}
1728
/// Template data rendered into `diff_judge.hbs` to prompt a judge model to
/// score a diff against natural-language assertions.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}

impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1738
/// Drops every line that is empty or whitespace-only and rejoins the rest
/// with single newlines, so buffer comparisons ignore blank-line differences.
fn strip_empty_lines(text: &str) -> String {
    let mut kept = Vec::new();
    for line in text.lines() {
        if line.trim().is_empty() {
            continue;
        }
        kept.push(line);
    }
    kept.join("\n")
}