1use super::*;
2use crate::{
3 EditFileMode, EditFileToolInput, GrepToolInput, ListDirectoryToolInput, ReadFileToolInput,
4};
5use Role::*;
6use client::{Client, UserStore};
7use eval_utils::{EvalOutput, EvalOutputProcessor, OutcomeKind};
8use fs::FakeFs;
9use futures::{FutureExt, future::LocalBoxFuture};
10use gpui::{AppContext, TestAppContext, Timer};
11use http_client::StatusCode;
12use indoc::{formatdoc, indoc};
13use language_model::{
14 LanguageModelRegistry, LanguageModelToolResult, LanguageModelToolResultContent,
15 LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
16};
17use project::Project;
18use prompt_store::{ProjectContext, WorktreeContext};
19use rand::prelude::*;
20use reqwest_client::ReqwestClient;
21use serde_json::json;
22use std::{
23 fmt::{self, Display},
24 path::Path,
25 str::FromStr,
26 time::Duration,
27};
28use util::path;
29
/// Aggregates per-run tag statistics across all samples of an eval and fails
/// the whole eval if too large a fraction of the tags reported by the runs
/// were mismatched.
#[derive(Default, Clone, Debug)]
struct EditAgentOutputProcessor {
    // Maximum tolerated ratio (0.0–1.0) of mismatched tags to total tags.
    mismatched_tag_threshold: f32,
    // Total number of tags observed across all completed runs.
    cumulative_tags: usize,
    // How many of those tags were mismatched.
    cumulative_mismatched_tags: usize,
    // Outputs of completed runs, retained so they can be dumped on failure.
    eval_outputs: Vec<EvalOutput<EditEvalMetadata>>,
}
37
38fn mismatched_tag_threshold(mismatched_tag_threshold: f32) -> EditAgentOutputProcessor {
39 EditAgentOutputProcessor {
40 mismatched_tag_threshold,
41 cumulative_tags: 0,
42 cumulative_mismatched_tags: 0,
43 eval_outputs: Vec::new(),
44 }
45}
46
/// Per-run tag statistics attached to each eval output.
#[derive(Clone, Debug)]
struct EditEvalMetadata {
    // Total tags observed in this run.
    tags: usize,
    // Tags that did not match in this run.
    mismatched_tags: usize,
}
52
53impl EvalOutputProcessor for EditAgentOutputProcessor {
54 type Metadata = EditEvalMetadata;
55
56 fn process(&mut self, output: &EvalOutput<Self::Metadata>) {
57 if matches!(output.outcome, OutcomeKind::Passed | OutcomeKind::Failed) {
58 self.cumulative_mismatched_tags += output.metadata.mismatched_tags;
59 self.cumulative_tags += output.metadata.tags;
60 self.eval_outputs.push(output.clone());
61 }
62 }
63
64 fn assert(&mut self) {
65 let mismatched_tag_ratio =
66 self.cumulative_mismatched_tags as f32 / self.cumulative_tags as f32;
67 if mismatched_tag_ratio > self.mismatched_tag_threshold {
68 for eval_output in &self.eval_outputs {
69 println!("{}", eval_output.data);
70 }
71 panic!(
72 "Too many mismatched tags: {:?}",
73 self.cumulative_mismatched_tags
74 );
75 }
76 }
77}
78
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // Model | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet | 0.99 (2025-06-14)
    // claude-sonnet-4 | 0.97 (2025-06-14)
    // gemini-2.5-pro-06-05 | 0.98 (2025-06-16)
    // gemini-2.5-flash | 0.11 (2025-05-22)
    // gpt-4.1 | 1.00 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // Several placements of the extracted method are acceptable, so the
    // assertion passes if the produced diff matches any of these fixtures.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    // 100 samples; require a 0.95 pass rate and at most 5% mismatched tags.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
152
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Tests that the agent can delete exactly one function without touching
    // its call sites; the result is compared against a single "after" fixture.
    //
    // Model | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05 | 1.0 (2025-06-16)
    // gemini-2.5-flash |
    // gpt-4.1 |

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";

    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            // Exact-match assertion: the edited file must equal the fixture.
            EvalAssertion::assert_eq(output_file_content),
        ))
    });
}
213
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_translate_doc_comments() {
    // Tests a file-wide comment rewrite; since no exact output can be pinned
    // down, an LLM judge grades the diff instead of a fixture comparison.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";

    // Note the required pass rate of 1.0 over 200 samples.
    eval_utils::eval(200, 1., mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ))
    });
}
274
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Tests editing a region of a large file that the agent has only read in
    // three windowed chunks (lines ~971-1150), never in full.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.96 (2025-06-14)
    // claude-sonnet-4 | 0.11 (2025-06-14)
    // gemini-2.5-pro-preview-latest | 0.99 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |

    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";

    // NOTE(review): the wasi-sdk asset list in the prompt below repeats the
    // x86_64/arm64 Linux entries — presumably unintentional; confirm whether
    // the duplication is part of the scenario before relying on it.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ))
    });
}
398
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Tests editing scattered regions of a very large file: the agent only
    // ever sees disjoint grep-result windows, never the whole file.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.59 (2025-07-14)
    // claude-sonnet-4 | 0.81 (2025-07-14)
    // gemini-2.5-pro | 0.95 (2025-07-14)
    // gemini-2.5-flash-preview-04-17 | 0.78 (2025-07-14)
    // gpt-4.1 | 0.00 (2025-07-14) (follows edit_description too literally)

    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    let possible_diffs = vec![
        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
    ];
    // Note the comparatively low required pass rate (0.51) for this eval.
    eval_utils::eval(100, 0.51, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        // Simulated grep output: several disjoint windows of
                        // the fixture, stitched together with a separator.
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    // NOTE(review): the id jumps from "tool_1" to "tool_4";
                    // ids only need to be unique, so this is harmless.
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
481
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    // Model | Date | Pass rate
    // =========================================================
    // claude-4.0-sonnet | 2025-06-14 | 0.99
    // claude-3.7-sonnet | 2025-06-14 | 0.88
    // gemini-2.5-pro-preview-06-05 | 2025-06-16 | 0.98
    // gpt-4.1 |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";

    // This eval tolerates a higher mismatched-tag ratio (0.25) than the rest.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.25), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // Two grep probes that deliberately find nothing, followed by
                // one that returns test-module matches from sibling files.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_weight() {
                                // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_stretch() {
                                // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                            }

                            #[test]
                            fn test_css_to_core_text_font_stretch() {
                                // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ))
    });
}
687
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_zode() {
    // Tests creating a brand-new file (`EditFileMode::Create`, no prior
    // content) and checks only that the created file doesn't start with
    // stray whitespace, backticks, or a blank line.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 | 1.0 (2025-05-22)
    // gpt-4.1 | 1.0 (2025-05-22)

    let input_file_path = "root/zode.py";
    // No prior file content: this eval creates the file from scratch.
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";

    eval_utils::eval(50, 1., mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content.clone(),
            // Custom assertion: fail if the created file begins with any of
            // the characters models tend to prepend by mistake.
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ))
    });
}
792
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_add_overwrite_test() {
    // Tests adding a new test to a large file where the agent initially sees
    // only a symbol outline and then reads a few targeted line ranges.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.65 (2025-06-14)
    // claude-sonnet-4 | 0.07 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 0.35 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |

    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";

    // 200 samples with a 0.5 required pass rate — this is a hard eval.
    eval_utils::eval(200, 0.5, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    // Simulated outline-mode read_file result: symbols with
                    // their line ranges rather than file contents.
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                            struct HunkStatus [L1538-1542]
                             range [L1539]
                             diff_status [L1540]
                             old_text [L1541]
                            fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ))
    });
}
1022
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.00 (2025-06-14)
    // claude-sonnet-4 | 1.00 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.00 (2025-05-21)
    // gemini-2.5-flash-preview-04-17 | 1.00 (2025-05-21)
    // gpt-4.1 | 1.00 (2025-05-21)
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"

    // New-file creation: no prior content, and the expected result is an
    // empty file.
    let input_file_content = None;
    let expected_output_content = String::new();

    eval_utils::eval(100, 0.99, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content.clone(),
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content.clone()),
        ))
    });
}
1099
/// Builds a [`LanguageModelRequestMessage`] with the given role and contents.
///
/// Caching and reasoning details are intentionally left unset: these
/// synthetic conversations are replayed from scratch for every eval sample.
fn message(
    role: Role,
    contents: impl IntoIterator<Item = MessageContent>,
) -> LanguageModelRequestMessage {
    LanguageModelRequestMessage {
        role,
        content: contents.into_iter().collect(),
        cache: false,
        reasoning_details: None,
    }
}
1111
1112fn text(text: impl Into<String>) -> MessageContent {
1113 MessageContent::Text(text.into())
1114}
1115
/// Returns the zero-indexed half-open `range` of lines from `input`,
/// joined with `\n` (no trailing newline). An empty or out-of-bounds
/// range yields an empty string.
fn lines(input: &str, range: Range<usize>) -> String {
    let mut selection = String::new();
    for (index, line) in input.lines().enumerate() {
        if index < range.start {
            continue;
        }
        if index >= range.end {
            break;
        }
        if !selection.is_empty() {
            selection.push('\n');
        }
        selection.push_str(line);
    }
    selection
}
1124
1125fn tool_use(
1126 id: impl Into<Arc<str>>,
1127 name: impl Into<Arc<str>>,
1128 input: impl Serialize,
1129) -> MessageContent {
1130 MessageContent::ToolUse(LanguageModelToolUse {
1131 id: LanguageModelToolUseId::from(id.into()),
1132 name: name.into(),
1133 raw_input: serde_json::to_string_pretty(&input).unwrap(),
1134 input: serde_json::to_value(input).unwrap(),
1135 is_input_complete: true,
1136 thought_signature: None,
1137 })
1138}
1139
/// Builds a successful (`is_error: false`) tool-result message content entry
/// that echoes the given text back to the model for the tool call `id`.
fn tool_result(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    result: impl Into<Arc<str>>,
) -> MessageContent {
    MessageContent::ToolResult(LanguageModelToolResult {
        tool_use_id: LanguageModelToolUseId::from(id.into()),
        tool_name: name.into(),
        is_error: false,
        content: LanguageModelToolResultContent::Text(result.into()),
        output: None,
    })
}
1153
/// A single eval case: a recorded conversation that ends with an `edit_file`
/// tool use, the file content the edit applies to, and the assertion that
/// grades the resulting edit.
#[derive(Clone)]
struct EvalInput {
    // Messages replayed to the model; the last one is the assistant's
    // edit_file tool call.
    conversation: Vec<LanguageModelRequestMessage>,
    // Parsed input of that trailing edit_file tool use.
    edit_file_input: EditFileToolInput,
    // File content before the edit; `None` when a new file is being created.
    input_content: Option<String>,
    // Grader applied to the edited result.
    assertion: EvalAssertion,
}
1161
1162impl EvalInput {
1163 fn from_conversation(
1164 conversation: Vec<LanguageModelRequestMessage>,
1165 input_content: Option<String>,
1166 assertion: EvalAssertion,
1167 ) -> Self {
1168 let msg = conversation.last().expect("Conversation must not be empty");
1169 if msg.role != Role::Assistant {
1170 panic!("Conversation must end with an assistant message");
1171 }
1172 let tool_use = msg
1173 .content
1174 .iter()
1175 .flat_map(|content| match content {
1176 MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1177 Some(tool_use)
1178 }
1179 _ => None,
1180 })
1181 .next()
1182 .expect("Conversation must end with an edit_file tool use")
1183 .clone();
1184
1185 let edit_file_input: EditFileToolInput = serde_json::from_value(tool_use.input).unwrap();
1186
1187 EvalInput {
1188 conversation,
1189 edit_file_input,
1190 input_content,
1191 assertion,
1192 }
1193 }
1194}
1195
/// The concrete result of running one agent edit.
#[derive(Clone)]
struct EvalSample {
    // Buffer text before the edit (empty when no input content was given).
    text_before: String,
    // Buffer text after the agent finished editing.
    text_after: String,
    // The agent's output, including parser metrics and the raw edit stream.
    edit_output: EditAgentOutput,
    // Unified diff from `text_before` to `text_after`.
    diff: String,
}
1203
/// Object-safe assertion interface used by `EvalAssertion`.
///
/// The `'a` lifetime lets implementations borrow both `sample` and the test
/// context for the duration of the returned (boxed, non-`Send`) future.
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1212
/// Blanket implementation so any matching `AsyncFn` closure can be used as an
/// assertion without a hand-written wrapper type.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        // Box the closure's future so it can be returned through the
        // object-safe trait method.
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1233
/// A cloneable, type-erased assertion that scores an `EvalSample` (see the
/// constructors in the `impl` below for the available assertion styles).
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1236
impl EvalAssertion {
    /// Wraps an async closure as an assertion.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Scores 100 when the edited text equals `expected`, 0 otherwise.
    /// Blank/whitespace-only lines are stripped from both sides before
    /// comparison.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Scores 100 when applying ANY of `expected_diffs` to the original text
    /// reproduces the edited text (ignoring blank lines), 0 otherwise.
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// Asks the judge model to grade the sample's diff against free-form
    /// `assertions`, expecting a `<score>N</score>` tag in the reply.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                    reasoning_details: None,
                }],
                thinking_allowed: true,
                ..Default::default()
            };
            // The judge call itself may be rate-limited, so retry it like the
            // agent requests.
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output)
                && let Some(score_match) = captures.get(1)
            {
                // A score that fails to parse (e.g. overflows) counts as 0.
                let score = score_match.as_str().parse().unwrap_or(0);
                return Ok(EvalAssertionOutcome {
                    score,
                    message: Some(output),
                });
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Runs this assertion against `input`, using `judge_model` for any
    /// LLM-backed grading.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1338
1339fn run_eval(eval: EvalInput) -> eval_utils::EvalOutput<EditEvalMetadata> {
1340 let dispatcher = gpui::TestDispatcher::new(StdRng::from_os_rng());
1341 let mut cx = TestAppContext::build(dispatcher, None);
1342 let result = cx.executor().block_test(async {
1343 let test = EditAgentTest::new(&mut cx).await;
1344 test.eval(eval, &mut cx).await
1345 });
1346 match result {
1347 Ok(output) => eval_utils::EvalOutput {
1348 data: output.to_string(),
1349 outcome: if output.assertion.score < 80 {
1350 eval_utils::OutcomeKind::Failed
1351 } else {
1352 eval_utils::OutcomeKind::Passed
1353 },
1354 metadata: EditEvalMetadata {
1355 tags: output.sample.edit_output.parser_metrics.tags,
1356 mismatched_tags: output.sample.edit_output.parser_metrics.mismatched_tags,
1357 },
1358 },
1359 Err(e) => eval_utils::EvalOutput {
1360 data: format!("{e:?}"),
1361 outcome: eval_utils::OutcomeKind::Error,
1362 metadata: EditEvalMetadata {
1363 tags: 0,
1364 mismatched_tags: 0,
1365 },
1366 },
1367 }
1368}
1369
/// The outcome of one eval run: the sample that was produced and the graded
/// assertion result.
#[derive(Clone)]
struct EditEvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}
1375
/// Human-readable report: score, optional judge message, the diff, the
/// parser metrics, and the raw edits emitted by the model.
impl Display for EditEvalOutput {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Score: {:?}", self.assertion.score)?;
        // Only judge-based assertions carry a message.
        if let Some(message) = self.assertion.message.as_ref() {
            writeln!(f, "Message: {}", message)?;
        }

        writeln!(f, "Diff:\n{}", self.sample.diff)?;

        writeln!(
            f,
            "Parser Metrics:\n{:#?}",
            self.sample.edit_output.parser_metrics
        )?;
        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
        Ok(())
    }
}
1394
/// Shared fixture for edit evals: the agent under test, the project it edits,
/// and the model used to judge results.
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}
1400
1401impl EditAgentTest {
1402 async fn new(cx: &mut TestAppContext) -> Self {
1403 cx.executor().allow_parking();
1404
1405 let fs = FakeFs::new(cx.executor());
1406 cx.update(|cx| {
1407 settings::init(cx);
1408 gpui_tokio::init(cx);
1409 let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1410 cx.set_http_client(http_client);
1411 let client = Client::production(cx);
1412 let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1413 settings::init(cx);
1414 language_model::init(client.clone(), cx);
1415 language_models::init(user_store, client.clone(), cx);
1416 });
1417
1418 fs.insert_tree("/root", json!({})).await;
1419 let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1420 let agent_model = SelectedModel::from_str(
1421 &std::env::var("ZED_AGENT_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
1422 )
1423 .unwrap();
1424 let judge_model = SelectedModel::from_str(
1425 &std::env::var("ZED_JUDGE_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
1426 )
1427 .unwrap();
1428
1429 let authenticate_provider_tasks = cx.update(|cx| {
1430 LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
1431 registry
1432 .providers()
1433 .iter()
1434 .map(|p| p.authenticate(cx))
1435 .collect::<Vec<_>>()
1436 })
1437 });
1438 let (agent_model, judge_model) = cx
1439 .update(|cx| {
1440 cx.spawn(async move |cx| {
1441 futures::future::join_all(authenticate_provider_tasks).await;
1442 let agent_model = Self::load_model(&agent_model, cx).await;
1443 let judge_model = Self::load_model(&judge_model, cx).await;
1444 (agent_model.unwrap(), judge_model.unwrap())
1445 })
1446 })
1447 .await;
1448 let action_log = cx.new(|_| ActionLog::new(project.clone()));
1449
1450 let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();
1451
1452 Self {
1453 agent: EditAgent::new(
1454 agent_model,
1455 project.clone(),
1456 action_log,
1457 Templates::new(),
1458 edit_format,
1459 ),
1460 project,
1461 judge_model,
1462 }
1463 }
1464
1465 async fn load_model(
1466 selected_model: &SelectedModel,
1467 cx: &mut AsyncApp,
1468 ) -> Result<Arc<dyn LanguageModel>> {
1469 cx.update(|cx| {
1470 let registry = LanguageModelRegistry::read_global(cx);
1471 let provider = registry
1472 .provider(&selected_model.provider)
1473 .expect("Provider not found");
1474 provider.authenticate(cx)
1475 })?
1476 .await?;
1477 cx.update(|cx| {
1478 let models = LanguageModelRegistry::read_global(cx);
1479 let model = models
1480 .available_models(cx)
1481 .find(|model| {
1482 model.provider_id() == selected_model.provider
1483 && model.id() == selected_model.model
1484 })
1485 .unwrap_or_else(|| panic!("Model {} not found", selected_model.model.0));
1486 model
1487 })
1488 }
1489
1490 async fn eval(&self, mut eval: EvalInput, cx: &mut TestAppContext) -> Result<EditEvalOutput> {
1491 // Make sure the last message in the conversation is cached.
1492 eval.conversation.last_mut().unwrap().cache = true;
1493
1494 let path = self
1495 .project
1496 .read_with(cx, |project, cx| {
1497 project.find_project_path(eval.edit_file_input.path, cx)
1498 })
1499 .unwrap();
1500 let buffer = self
1501 .project
1502 .update(cx, |project, cx| project.open_buffer(path, cx))
1503 .await
1504 .unwrap();
1505
1506 let tools = crate::built_in_tools().collect::<Vec<_>>();
1507
1508 let system_prompt = {
1509 let worktrees = vec![WorktreeContext {
1510 root_name: "root".to_string(),
1511 abs_path: Path::new("/path/to/root").into(),
1512 rules_file: None,
1513 }];
1514 let project_context = ProjectContext::new(worktrees, Vec::default());
1515 let tool_names = tools
1516 .iter()
1517 .map(|tool| tool.name.clone().into())
1518 .collect::<Vec<_>>();
1519 let template = crate::SystemPromptTemplate {
1520 project: &project_context,
1521 available_tools: tool_names,
1522 model_name: None,
1523 };
1524 let templates = Templates::new();
1525 template.render(&templates).unwrap()
1526 };
1527
1528 let has_system_prompt = eval
1529 .conversation
1530 .first()
1531 .is_some_and(|msg| msg.role == Role::System);
1532 let messages = if has_system_prompt {
1533 eval.conversation
1534 } else {
1535 [LanguageModelRequestMessage {
1536 role: Role::System,
1537 content: vec![MessageContent::Text(system_prompt)],
1538 cache: true,
1539 reasoning_details: None,
1540 }]
1541 .into_iter()
1542 .chain(eval.conversation)
1543 .collect::<Vec<_>>()
1544 };
1545
1546 let conversation = LanguageModelRequest {
1547 messages,
1548 tools,
1549 thinking_allowed: true,
1550 ..Default::default()
1551 };
1552
1553 let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1554 if let Some(input_content) = eval.input_content.as_deref() {
1555 buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1556 }
1557 retry_on_rate_limit(async || {
1558 self.agent
1559 .edit(
1560 buffer.clone(),
1561 eval.edit_file_input.display_description.clone(),
1562 &conversation,
1563 &mut cx.to_async(),
1564 )
1565 .0
1566 .await
1567 })
1568 .await?
1569 } else {
1570 retry_on_rate_limit(async || {
1571 self.agent
1572 .overwrite(
1573 buffer.clone(),
1574 eval.edit_file_input.display_description.clone(),
1575 &conversation,
1576 &mut cx.to_async(),
1577 )
1578 .0
1579 .await
1580 })
1581 .await?
1582 };
1583
1584 let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1585 let sample = EvalSample {
1586 edit_output,
1587 diff: language::unified_diff(
1588 eval.input_content.as_deref().unwrap_or_default(),
1589 &buffer_text,
1590 ),
1591 text_before: eval.input_content.unwrap_or_default(),
1592 text_after: buffer_text,
1593 };
1594 let assertion = eval
1595 .assertion
1596 .run(&sample, self.judge_model.clone(), cx)
1597 .await?;
1598
1599 Ok(EditEvalOutput { assertion, sample })
1600 }
1601}
1602
/// Invokes `request` repeatedly (up to 20 attempts), retrying on rate limits,
/// server overload, selected upstream HTTP statuses, and transient I/O or
/// internal-server errors. Any other outcome — success or a non-retryable
/// error — is returned immediately. Each retry delay gets random jitter of up
/// to 100% added on top.
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
    const MAX_RETRIES: usize = 20;
    let mut attempt = 0;

    loop {
        attempt += 1;
        let response = request().await;

        // On the final attempt, return whatever we got without inspecting it.
        if attempt >= MAX_RETRIES {
            return response;
        }

        let retry_delay = match &response {
            Ok(_) => None,
            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
                Some(err) => match &err {
                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
                        // Honor the server-provided delay; fall back to 5s.
                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
                    }
                    LanguageModelCompletionError::UpstreamProviderError {
                        status,
                        retry_after,
                        ..
                    } => {
                        // Only retry for specific status codes
                        // (429, 503, and Anthropic's non-standard 529).
                        let should_retry = matches!(
                            *status,
                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
                        ) || status.as_u16() == 529;

                        if should_retry {
                            // Use server-provided retry_after if available, otherwise use default
                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
                        } else {
                            None
                        }
                    }
                    LanguageModelCompletionError::ApiReadResponseError { .. }
                    | LanguageModelCompletionError::ApiInternalServerError { .. }
                    | LanguageModelCompletionError::HttpSend { .. } => {
                        // Exponential backoff for transient I/O and internal server errors
                        // (1s, 2s, 4s, ... capped at 30s before jitter).
                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
                    }
                    _ => None,
                },
                _ => None,
            },
        };

        if let Some(retry_after) = retry_delay {
            let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
            Timer::after(retry_after + jitter).await;
        } else {
            return response;
        }
    }
}
1662
/// The graded result of an assertion.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    // Programmatic assertions emit 0 or 100; judge-based assertions emit
    // whatever number the model reports in its `<score>` tag. `run_eval`
    // treats scores below 80 as failures.
    score: usize,
    // Raw judge output when an LLM produced the score; `None` for
    // programmatic assertions.
    message: Option<String>,
}
1668
/// Inputs for the diff-judging prompt rendered from `diff_judge.hbs`.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    // Unified diff of the agent's edit, shown to the judge model.
    diff: String,
    // Free-form assertions the judge checks the diff against.
    assertions: &'static str,
}
1674
// Binds the struct to its handlebars template file for rendering.
impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1678
/// Drops lines that are empty or whitespace-only and rejoins the remainder
/// with `\n`, so text comparisons ignore incidental blank-line differences.
fn strip_empty_lines(text: &str) -> String {
    let mut kept: Vec<&str> = Vec::new();
    for line in text.lines() {
        if !line.trim().is_empty() {
            kept.push(line);
        }
    }
    kept.join("\n")
}