1use super::*;
2use crate::{
3 AgentTool, EditFileMode, EditFileTool, EditFileToolInput, GrepTool, GrepToolInput,
4 ListDirectoryTool, ListDirectoryToolInput, ReadFileTool, ReadFileToolInput,
5};
6use Role::*;
7use client::{Client, UserStore};
8use eval_utils::{EvalOutput, EvalOutputProcessor, OutcomeKind};
9use fs::FakeFs;
10use futures::{FutureExt, future::LocalBoxFuture};
11use gpui::{AppContext, TestAppContext};
12use http_client::StatusCode;
13use indoc::{formatdoc, indoc};
14use language_model::{
15 LanguageModelRegistry, LanguageModelToolResult, LanguageModelToolResultContent,
16 LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
17};
18use project::Project;
19use prompt_store::{ProjectContext, WorktreeContext};
20use rand::prelude::*;
21use reqwest_client::ReqwestClient;
22use serde_json::json;
23use std::{
24 fmt::{self, Display},
25 path::Path,
26 str::FromStr,
27 time::Duration,
28};
29use util::path;
30
/// Aggregates per-run eval statistics across an entire eval batch so the
/// suite can fail when the agent emits too many malformed edit tags overall.
#[derive(Default, Clone, Debug)]
struct EditAgentOutputProcessor {
    // Maximum allowed ratio of mismatched tags to total tags (e.g. 0.05 = 5%).
    mismatched_tag_threshold: f32,
    // Total tags seen across all processed runs.
    cumulative_tags: usize,
    // Total mismatched tags seen across all processed runs.
    cumulative_mismatched_tags: usize,
    // Every processed output, retained so failures can dump full data.
    eval_outputs: Vec<EvalOutput<EditEvalMetadata>>,
}
38
39fn mismatched_tag_threshold(mismatched_tag_threshold: f32) -> EditAgentOutputProcessor {
40 EditAgentOutputProcessor {
41 mismatched_tag_threshold,
42 cumulative_tags: 0,
43 cumulative_mismatched_tags: 0,
44 eval_outputs: Vec::new(),
45 }
46}
47
/// Per-run tag statistics attached to each eval output.
#[derive(Clone, Debug)]
struct EditEvalMetadata {
    // Total tags emitted during the run.
    tags: usize,
    // Tags that failed to match up (open/close mismatch) during the run.
    mismatched_tags: usize,
}
53
54impl EvalOutputProcessor for EditAgentOutputProcessor {
55 type Metadata = EditEvalMetadata;
56
57 fn process(&mut self, output: &EvalOutput<Self::Metadata>) {
58 if matches!(output.outcome, OutcomeKind::Passed | OutcomeKind::Failed) {
59 self.cumulative_mismatched_tags += output.metadata.mismatched_tags;
60 self.cumulative_tags += output.metadata.tags;
61 self.eval_outputs.push(output.clone());
62 }
63 }
64
65 fn assert(&mut self) {
66 let mismatched_tag_ratio =
67 self.cumulative_mismatched_tags as f32 / self.cumulative_tags as f32;
68 if mismatched_tag_ratio > self.mismatched_tag_threshold {
69 for eval_output in &self.eval_outputs {
70 println!("{}", eval_output.data);
71 }
72 panic!(
73 "Too many mismatched tags: {:?}",
74 self.cumulative_mismatched_tags
75 );
76 }
77 }
78}
79
/// Eval: ask the agent to extract a `handle_command_output` method out of
/// `run_git_blame`, exercising its ability to generate multiple edit hunks.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 0.99 (2025-06-14)
    // claude-sonnet-4             | 0.97 (2025-06-14)
    // gemini-2.5-pro-06-05        | 0.98 (2025-06-16)
    // gemini-2.5-flash            | 0.11 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // Several equivalent extractions are acceptable; the run passes if the
    // agent's edit matches any one of these recorded diffs.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    // 100 iterations, 0.95 required pass rate, at most 5% mismatched tags.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        // The replayed conversation: user request -> read_file tool round
        // trip -> the edit_file tool use under evaluation.
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
156
/// Eval: ask the agent to delete the `run_git_blame` function (and nothing
/// else), then compare the result against a single expected fixture.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 1.0 (2025-06-14)
    // claude-sonnet-4             | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05        | 1.0 (2025-06-16)
    // gemini-2.5-flash            |

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";

    // 100 iterations, 0.95 required pass rate, at most 5% mismatched tags.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            // Deletion has one canonical result, so exact equality suffices.
            EvalAssertion::assert_eq(output_file_content),
        ))
    });
}
220
/// Eval: translate every doc comment in a file to Italian; graded by an LLM
/// judge on the resulting diff rather than exact text comparison.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_translate_doc_comments() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";

    // 200 iterations with a required pass rate of 1.0 (every run must pass).
    eval_utils::eval(200, 1., mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ))
    });
}
284
/// Eval: rewrite `compile_parser_to_wasm` to use wasi-sdk instead of
/// emscripten. Unlike the simpler evals, the agent only ever saw three
/// windowed reads of the file (via `lines`), not the whole thing.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.96 (2025-06-14)
    // claude-sonnet-4                | 0.11 (2025-06-14)
    // gemini-2.5-pro-preview-latest  | 0.99 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |

    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";

    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    // NOTE(review): the linux asset names appear twice in this
                    // prompt — presumably intentional recorded input; verify
                    // before "fixing", since changing it would alter the eval.
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                // Three successive windowed reads of the file; the tool
                // results are sliced from the fixture with `lines` so the
                // conversation matches what a real agent would have seen.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        ReadFileTool::NAME,
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        ReadFileTool::NAME,
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ))
    });
}
407
/// Eval: comment out BlinkManager interactions scattered across a very large
/// file, where the agent only saw disjoint grep windows of the file.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.59 (2025-07-14)
    // claude-sonnet-4                | 0.81 (2025-07-14)
    // gemini-2.5-pro                 | 0.95 (2025-07-14)
    // gemini-2.5-flash-preview-04-17 | 0.78 (2025-07-14)

    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    // Multiple commenting styles are acceptable; pass on any matching diff.
    let possible_diffs = vec![
        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
    ];
    // Note the lower required pass rate (0.51): per the table above, some
    // models do poorly here.
    eval_utils::eval(100, 0.51, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    // Simulated grep output: disjoint slices of the fixture,
                    // joined with a "Match found" separator between them.
                    [tool_result(
                        "tool_1",
                        GrepTool::NAME,
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
489
/// Eval: add a `from_pixels` constructor to `Canvas` plus tests for it, with
/// a multi-step research conversation (read + several greps) preceding the
/// edit. Graded by an LLM judge on the diff.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    // Model                        | Date       | Pass rate
    // =========================================================
    // claude-4.0-sonnet            | 2025-06-14 | 0.99
    // claude-3.7-sonnet            | 2025-06-14 | 0.88
    // gemini-2.5-pro-preview-06-05 | 2025-06-16 | 0.98

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";

    // Note the mismatched-tag threshold is relaxed to 25% for this eval.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.25), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
                // Two fruitless greps for an existing test module, then a
                // wider grep for `#[test]` that yields examples elsewhere.
                // NOTE(review): the include patterns use `font-kit/src/...`
                // while the file under edit is `root/canvas.rs` — this is
                // replayed recorded data; confirm before changing.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_2", GrepTool::NAME, "No matches found")],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_3", GrepTool::NAME, "No matches found")],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        GrepTool::NAME,
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ))
    });
}
704
/// Eval: create a brand-new Python CLI script from scratch (`Create` mode),
/// asserting only that the generated file does not begin with whitespace,
/// a backtick fence, or a newline — common file-creation artifacts.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_zode() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 | 1.0 (2025-05-22)

    let input_file_path = "root/zode.py";
    // The file does not exist yet, so there is no pre-edit content.
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";

    eval_utils::eval(50, 1., mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            ReadFileTool::NAME,
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            ReadFileTool::NAME,
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            EditFileTool::NAME,
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content.clone(),
            // Custom assertion: fail (score 0) if the created file starts with
            // any of the invalid characters; pass (score 100) otherwise.
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ))
    });
}
808
/// Eval: add a new test for file overwriting to `action_log.rs`. The first
/// read returns a symbol outline rather than file contents, so the agent must
/// issue follow-up windowed reads before editing. Judged on the diff.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_add_overwrite_test() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.65 (2025-06-14)
    // claude-sonnet-4                | 0.07 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 0.35 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |

    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";

    // Low bar (0.5) — per the table above this is a hard eval for some models.
    eval_utils::eval(200, 0.5, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    // Recorded outline response (what the tool returns for a
                    // file too large to show in full).
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                             struct HunkStatus [L1538-1542]
                              range [L1539]
                              diff_status [L1540]
                              old_text [L1541]
                             fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        ReadFileTool::NAME,
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        ReadFileTool::NAME,
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        ReadFileTool::NAME,
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            EditFileTool::NAME,
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ))
    });
}
1037
/// Eval: create an empty file without leaking the model's "thoughts" into
/// its content.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.00 (2025-06-14)
    // claude-sonnet-4                | 1.00 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.00 (2025-05-21)
    // gemini-2.5-flash-preview-04-17 | 1.00 (2025-05-21)

    // The file does not exist yet, and the expected result is empty content.
    let input_file_content = None;
    let expected_output_content = String::new();

    eval_utils::eval(100, 0.99, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            ListDirectoryTool::NAME,
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        ListDirectoryTool::NAME,
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            EditFileTool::NAME,
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content.clone(),
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content.clone()),
        ))
    });
}
1109
/// Builds a request message with the given role and content items.
/// Caching and reasoning details are always disabled for replayed evals.
fn message(
    role: Role,
    contents: impl IntoIterator<Item = MessageContent>,
) -> LanguageModelRequestMessage {
    LanguageModelRequestMessage {
        role,
        content: contents.into_iter().collect(),
        cache: false,
        reasoning_details: None,
    }
}
1121
/// Wraps a plain string as a text content item.
fn text(text: impl Into<String>) -> MessageContent {
    MessageContent::Text(text.into())
}
1125
/// Returns the 0-indexed lines of `input` in `range` (half-open), joined with
/// `\n`. Out-of-bounds or empty ranges simply yield fewer (or zero) lines.
fn lines(input: &str, range: Range<usize>) -> String {
    let mut selected = Vec::new();
    for (index, line) in input.lines().enumerate() {
        // Stop as soon as we pass the end of the requested window.
        if index >= range.end {
            break;
        }
        if index >= range.start {
            selected.push(line);
        }
    }
    selected.join("\n")
}
1134
/// Builds a tool-use content item with the given id, tool name, and input.
/// The input is serialized both as pretty JSON text (`raw_input`) and as a
/// JSON value (`input`), mirroring what a real model response carries.
fn tool_use(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    input: impl Serialize,
) -> MessageContent {
    MessageContent::ToolUse(LanguageModelToolUse {
        id: LanguageModelToolUseId::from(id.into()),
        name: name.into(),
        raw_input: serde_json::to_string_pretty(&input).unwrap(),
        input: serde_json::to_value(input).unwrap(),
        is_input_complete: true,
        thought_signature: None,
    })
}
1149
/// Builds a successful (non-error) text tool-result content item matching a
/// prior tool use by id and name.
fn tool_result(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    result: impl Into<Arc<str>>,
) -> MessageContent {
    MessageContent::ToolResult(LanguageModelToolResult {
        tool_use_id: LanguageModelToolUseId::from(id.into()),
        tool_name: name.into(),
        is_error: false,
        content: LanguageModelToolResultContent::Text(result.into()),
        output: None,
    })
}
1163
/// Everything one eval run needs: the replayed conversation, the edit the
/// agent is being asked to perform, the pre-edit file content (if the file
/// exists), and the assertion that grades the result.
#[derive(Clone)]
struct EvalInput {
    conversation: Vec<LanguageModelRequestMessage>,
    // Parsed input of the final `edit_file` tool use in the conversation.
    edit_file_input: EditFileToolInput,
    // `None` when the file does not exist yet (creation evals).
    input_content: Option<String>,
    assertion: EvalAssertion,
}
1171
1172impl EvalInput {
1173 fn from_conversation(
1174 conversation: Vec<LanguageModelRequestMessage>,
1175 input_content: Option<String>,
1176 assertion: EvalAssertion,
1177 ) -> Self {
1178 let msg = conversation.last().expect("Conversation must not be empty");
1179 if msg.role != Role::Assistant {
1180 panic!("Conversation must end with an assistant message");
1181 }
1182 let tool_use = msg
1183 .content
1184 .iter()
1185 .flat_map(|content| match content {
1186 MessageContent::ToolUse(tool_use) if tool_use.name == EditFileTool::NAME.into() => {
1187 Some(tool_use)
1188 }
1189 _ => None,
1190 })
1191 .next()
1192 .expect("Conversation must end with an edit_file tool use")
1193 .clone();
1194
1195 let edit_file_input: EditFileToolInput = serde_json::from_value(tool_use.input).unwrap();
1196
1197 EvalInput {
1198 conversation,
1199 edit_file_input,
1200 input_content,
1201 assertion,
1202 }
1203 }
1204}
1205
/// The observable result of one edit run, handed to assertions for grading.
#[derive(Clone)]
struct EvalSample {
    /// Buffer text before the agent edited it (empty for created files).
    text_before: String,
    /// Buffer text after the agent's edits were applied.
    text_after: String,
    /// Raw edits and parser metrics reported by the edit agent.
    edit_output: EditAgentOutput,
    /// Unified diff between `text_before` and `text_after`.
    diff: String,
}
1213
/// Object-safe form of an async assertion over an [`EvalSample`], allowing
/// assertions to be stored behind `Arc<dyn AssertionFn>`. A blanket impl
/// adapts any matching `AsyncFn` closure to this trait.
trait AssertionFn: 'static + Send + Sync {
    /// Grades `sample`, optionally consulting `judge_model`, and resolves to
    /// an [`EvalAssertionOutcome`].
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1222
/// Blanket impl: any `'static + Send + Sync` async closure with the right
/// signature is an [`AssertionFn`]. The returned future is boxed locally so
/// the trait stays object-safe.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1243
/// Cheaply-cloneable handle to an assertion function used to grade an eval.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1246
impl EvalAssertion {
    /// Wraps an async assertion closure in an `EvalAssertion`.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Scores 100 when the edited text equals `expected` — blank lines are
    /// stripped from both sides before comparing — and 0 otherwise. Never
    /// consults the judge model.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Scores 100 when applying any one of `expected_diffs` to the original
    /// text reproduces the edited text (ignoring blank lines); 0 otherwise.
    ///
    /// NOTE(review): `apply_diff_patch(..).unwrap()` panics on a malformed
    /// expected diff rather than failing the assertion — presumably the
    /// expected diffs are hand-written and trusted; confirm.
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// Asks the judge model to grade the produced diff against the free-form
    /// `assertions`, expecting a `<score>N</score>` tag somewhere in its
    /// streamed response. Errors if no score tag is found; a score that
    /// fails to parse as a number is treated as 0.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                    reasoning_details: None,
                }],
                thinking_allowed: true,
                ..Default::default()
            };
            // The judge call itself is retried on rate limiting; the stream
            // below is consumed without retry.
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output)
                && let Some(score_match) = captures.get(1)
            {
                let score = score_match.as_str().parse().unwrap_or(0);
                return Ok(EvalAssertionOutcome {
                    score,
                    message: Some(output),
                });
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Runs the wrapped assertion against `input`.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1348
/// Runs a single eval end-to-end on a fresh `TestAppContext` and converts
/// the result into an `eval_utils::EvalOutput`.
///
/// Scores below 80 count as `Failed`; any error during setup or the edit
/// itself is reported as `Error` with zeroed parser metrics.
fn run_eval(eval: EvalInput) -> eval_utils::EvalOutput<EditEvalMetadata> {
    // Fresh dispatcher with a random seed per run.
    let dispatcher = gpui::TestDispatcher::new(rand::random());
    let mut cx = TestAppContext::build(dispatcher, None);
    let foreground_executor = cx.foreground_executor().clone();
    // Drive the whole eval to completion on the foreground executor.
    let result = foreground_executor.block_test(async {
        let test = EditAgentTest::new(&mut cx).await;
        test.eval(eval, &mut cx).await
    });
    cx.quit();
    match result {
        Ok(output) => eval_utils::EvalOutput {
            data: output.to_string(),
            // Passing threshold: assertion scores >= 80 pass.
            outcome: if output.assertion.score < 80 {
                eval_utils::OutcomeKind::Failed
            } else {
                eval_utils::OutcomeKind::Passed
            },
            metadata: EditEvalMetadata {
                tags: output.sample.edit_output.parser_metrics.tags,
                mismatched_tags: output.sample.edit_output.parser_metrics.mismatched_tags,
            },
        },
        Err(e) => eval_utils::EvalOutput {
            data: format!("{e:?}"),
            outcome: eval_utils::OutcomeKind::Error,
            metadata: EditEvalMetadata {
                tags: 0,
                mismatched_tags: 0,
            },
        },
    }
}
1381
/// A completed eval run: the sample that was produced plus how the
/// assertion graded it.
#[derive(Clone)]
struct EditEvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}
1387
1388impl Display for EditEvalOutput {
1389 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1390 writeln!(f, "Score: {:?}", self.assertion.score)?;
1391 if let Some(message) = self.assertion.message.as_ref() {
1392 writeln!(f, "Message: {}", message)?;
1393 }
1394
1395 writeln!(f, "Diff:\n{}", self.sample.diff)?;
1396
1397 writeln!(
1398 f,
1399 "Parser Metrics:\n{:#?}",
1400 self.sample.edit_output.parser_metrics
1401 )?;
1402 writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1403 Ok(())
1404 }
1405}
1406
/// Test fixture wiring an [`EditAgent`] to a test project, plus the judge
/// model used to grade its output.
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}
1412
impl EditAgentTest {
    /// Builds the fixture: a fake FS project rooted at `/root`, a production
    /// client, and authenticated agent/judge models selected via the
    /// `ZED_AGENT_MODEL` / `ZED_JUDGE_MODEL` env vars (both defaulting to
    /// `anthropic/claude-sonnet-4-latest`).
    async fn new(cx: &mut TestAppContext) -> Self {
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
            // NOTE(review): `settings::init` is also called at the top of
            // this closure — this second call looks redundant; confirm it is
            // intentional.
            settings::init(cx);
            language_model::init(user_store.clone(), client.clone(), cx);
            language_models::init(user_store, client.clone(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
        )
        .unwrap();

        // Kick off authentication for every registered provider up front,
        // then resolve both models once authentication has completed.
        let authenticate_provider_tasks = cx.update(|cx| {
            LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
                registry
                    .providers()
                    .iter()
                    .map(|p| p.authenticate(cx))
                    .collect::<Vec<_>>()
            })
        });
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    futures::future::join_all(authenticate_provider_tasks).await;
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();

        Self {
            // NOTE(review): the two trailing boolean flags are unlabeled at
            // this call site; check `EditAgent::new` for their meaning.
            agent: EditAgent::new(
                agent_model,
                project.clone(),
                action_log,
                Templates::new(),
                edit_format,
                true,
                true,
            ),
            project,
            judge_model,
        }
    }

    /// Authenticates `selected_model`'s provider, then resolves the model
    /// from the global registry.
    ///
    /// # Panics
    /// Panics if the provider or the model is not registered.
    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        cx.update(|cx| {
            let registry = LanguageModelRegistry::read_global(cx);
            let provider = registry
                .provider(&selected_model.provider)
                .expect("Provider not found");
            provider.authenticate(cx)
        })
        .await?;
        Ok(cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .unwrap_or_else(|| panic!("Model {} not found", selected_model.model.0));
            model
        }))
    }

    /// Runs one eval: seeds the buffer (only in `Edit` mode), sends the
    /// conversation to the edit agent, and grades the resulting buffer text
    /// with the eval's assertion.
    async fn eval(&self, mut eval: EvalInput, cx: &mut TestAppContext) -> Result<EditEvalOutput> {
        // Make sure the last message in the conversation is cached.
        eval.conversation.last_mut().unwrap().cache = true;

        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();

        let tools = crate::built_in_tools().collect::<Vec<_>>();

        // Render the standard system prompt; it is only used when the
        // conversation doesn't already start with a system message.
        let system_prompt = {
            let worktrees = vec![WorktreeContext {
                root_name: "root".to_string(),
                abs_path: Path::new("/path/to/root").into(),
                rules_file: None,
            }];
            let project_context = ProjectContext::new(worktrees, Vec::default());
            let tool_names = tools
                .iter()
                .map(|tool| tool.name.clone().into())
                .collect::<Vec<_>>();
            let template = crate::SystemPromptTemplate {
                project: &project_context,
                available_tools: tool_names,
                model_name: None,
            };
            let templates = Templates::new();
            template.render(&templates).unwrap()
        };

        let has_system_prompt = eval
            .conversation
            .first()
            .is_some_and(|msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
                reasoning_details: None,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let conversation = LanguageModelRequest {
            messages,
            tools,
            thinking_allowed: true,
            ..Default::default()
        };

        // `Edit` mode edits the seeded buffer in place; any other mode
        // overwrites/creates the file from scratch (note `input_content` is
        // not written to the buffer in that branch).
        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            retry_on_rate_limit(async || {
                self.agent
                    .edit(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        } else {
            retry_on_rate_limit(async || {
                self.agent
                    .overwrite(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        };

        // The diff is computed against the original input content (empty
        // for newly created files).
        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EditEvalOutput { assertion, sample })
    }
}
1616
/// Invokes `request` repeatedly, retrying on errors the model provider
/// signals as transient.
///
/// Retries (up to `MAX_RETRIES` attempts total) on rate limiting, server
/// overload, retryable upstream statuses (429, 503, 529), and transient
/// I/O / internal-server errors — the latter with exponential backoff
/// capped at 30 seconds. A random jitter of up to 100% of the delay is
/// added before each sleep. Success, a non-retryable error, or exhausting
/// the attempt budget returns the last response as-is.
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
    const MAX_RETRIES: usize = 20;
    let mut attempt = 0;

    loop {
        attempt += 1;
        let response = request().await;

        // Out of budget: return whatever we got, success or error.
        if attempt >= MAX_RETRIES {
            return response;
        }

        // `Some(delay)` means "sleep then retry"; `None` means return now.
        let retry_delay = match &response {
            Ok(_) => None,
            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
                Some(err) => match &err {
                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
                    }
                    LanguageModelCompletionError::UpstreamProviderError {
                        status,
                        retry_after,
                        ..
                    } => {
                        // Only retry for specific status codes
                        let should_retry = matches!(
                            *status,
                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
                        ) || status.as_u16() == 529;

                        if should_retry {
                            // Use server-provided retry_after if available, otherwise use default
                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
                        } else {
                            None
                        }
                    }
                    LanguageModelCompletionError::ApiReadResponseError { .. }
                    | LanguageModelCompletionError::ApiInternalServerError { .. }
                    | LanguageModelCompletionError::HttpSend { .. } => {
                        // Exponential backoff for transient I/O and internal server errors
                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
                    }
                    _ => None,
                },
                _ => None,
            },
        };

        if let Some(retry_after) = retry_delay {
            let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
            // This code does not use the gpui::executor
            #[allow(clippy::disallowed_methods)]
            smol::Timer::after(retry_after + jitter).await;
        } else {
            return response;
        }
    }
}
1678
/// Result of grading one eval sample: a numeric score (scores >= 80 count
/// as passing in `run_eval`) and an optional explanation, e.g. the judge
/// model's raw response.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}
1684
/// Template variables for the `diff_judge.hbs` prompt used by
/// `EvalAssertion::judge_diff`.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}
1690
impl Template for DiffJudgeTemplate {
    // Handlebars template file the renderer looks up by name.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1694
/// Drops blank (empty or whitespace-only) lines from `text` and rejoins the
/// remainder with `\n` (no trailing newline).
fn strip_empty_lines(text: &str) -> String {
    let mut kept = Vec::new();
    for line in text.lines() {
        if !line.trim().is_empty() {
            kept.push(line);
        }
    }
    kept.join("\n")
}