1use super::*;
2use crate::{
3 AgentTool, EditFileMode, EditFileTool, EditFileToolInput, GrepTool, GrepToolInput,
4 ListDirectoryTool, ListDirectoryToolInput, ReadFileTool, ReadFileToolInput,
5};
6use Role::*;
7use client::{Client, UserStore};
8use eval_utils::{EvalOutput, EvalOutputProcessor, OutcomeKind};
9use fs::FakeFs;
10use futures::{FutureExt, future::LocalBoxFuture};
11use gpui::{AppContext, TestAppContext};
12use http_client::StatusCode;
13use indoc::{formatdoc, indoc};
14use language_model::{
15 LanguageModelRegistry, LanguageModelToolResult, LanguageModelToolResultContent,
16 LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
17};
18use project::Project;
19use prompt_store::{ProjectContext, WorktreeContext};
20use rand::prelude::*;
21use reqwest_client::ReqwestClient;
22use serde_json::json;
23use std::{
24 fmt::{self, Display},
25 path::Path,
26 str::FromStr,
27 time::Duration,
28};
29use util::path;
30
/// Aggregates eval outputs across many runs and tracks how many edit tags the
/// agent emitted versus how many were mismatched, so a flakiness budget can be
/// enforced at the end of an eval (see `EvalOutputProcessor::assert`).
#[derive(Default, Clone, Debug)]
struct EditAgentOutputProcessor {
    // Maximum tolerated ratio of mismatched tags to total tags (0.0..=1.0).
    mismatched_tag_threshold: f32,
    // Running totals accumulated over all processed eval outputs.
    cumulative_tags: usize,
    cumulative_mismatched_tags: usize,
    // Outputs retained so they can be printed for diagnosis if the
    // mismatched-tag threshold is exceeded.
    eval_outputs: Vec<EvalOutput<EditEvalMetadata>>,
}
38
39fn mismatched_tag_threshold(mismatched_tag_threshold: f32) -> EditAgentOutputProcessor {
40 EditAgentOutputProcessor {
41 mismatched_tag_threshold,
42 cumulative_tags: 0,
43 cumulative_mismatched_tags: 0,
44 eval_outputs: Vec::new(),
45 }
46}
47
/// Per-run tag statistics attached to each eval output.
#[derive(Clone, Debug)]
struct EditEvalMetadata {
    // Total number of edit tags the agent emitted in this run.
    tags: usize,
    // How many of those tags were mismatched.
    mismatched_tags: usize,
}
53
54impl EvalOutputProcessor for EditAgentOutputProcessor {
55 type Metadata = EditEvalMetadata;
56
57 fn process(&mut self, output: &EvalOutput<Self::Metadata>) {
58 if matches!(output.outcome, OutcomeKind::Passed | OutcomeKind::Failed) {
59 self.cumulative_mismatched_tags += output.metadata.mismatched_tags;
60 self.cumulative_tags += output.metadata.tags;
61 self.eval_outputs.push(output.clone());
62 }
63 }
64
65 fn assert(&mut self) {
66 let mismatched_tag_ratio =
67 self.cumulative_mismatched_tags as f32 / self.cumulative_tags as f32;
68 if mismatched_tag_ratio > self.mismatched_tag_threshold {
69 for eval_output in &self.eval_outputs {
70 println!("{}", eval_output.data);
71 }
72 panic!(
73 "Too many mismatched tags: {:?}",
74 self.cumulative_mismatched_tags
75 );
76 }
77 }
78}
79
/// Eval: extracting a `handle_command_output` method out of `run_git_blame`,
/// exercising the agent's ability to generate multiple edit hunks.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 0.99 (2025-06-14)
    // claude-sonnet-4             | 0.97 (2025-06-14)
    // gemini-2.5-pro-06-05        | 0.98 (2025-06-16)
    // gemini-2.5-flash            | 0.11 (2025-05-22)
    // gpt-4.1                     | 1.00 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // Any one of these diffs counts as a pass; the extraction has several
    // acceptable placements/shapes.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    // 100 iterations; require a >= 0.95 pass rate and <= 5% mismatched tags.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        // Replay a canned conversation ending in an edit_file tool use; the
        // agent's resulting edit is graded against the possible diffs.
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
157
/// Eval: deleting a single function (`run_git_blame`) while leaving its
/// callers untouched; graded by exact equality with the `after.rs` fixture.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 1.0  (2025-06-14)
    // claude-sonnet-4             | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05        | 1.0  (2025-06-16)
    // gemini-2.5-flash            |
    // gpt-4.1                     |

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";

    // 100 iterations; require a >= 0.95 pass rate and <= 5% mismatched tags.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            // Exact-match assertion: the edited file must equal the fixture.
            EvalAssertion::assert_eq(output_file_content),
        ))
    });
}
222
/// Eval: translating all doc comments in a file to Italian; graded by an
/// LLM judge on the resulting diff rather than exact equality.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_translate_doc_comments() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";

    // 200 iterations with a 1.0 (perfect) pass-rate requirement.
    eval_utils::eval(200, 1., mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ))
    });
}
287
/// Eval: a larger refactor (switch `compile_parser_to_wasm` from emscripten
/// to wasi-sdk) where the agent only ever saw chunked reads of the file, not
/// its whole contents; graded by an LLM judge on the diff.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.96 (2025-06-14)
    // claude-sonnet-4                | 0.11 (2025-06-14)
    // gemini-2.5-pro-preview-latest  | 0.99 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |

    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";

    // 100 iterations; require a >= 0.95 pass rate and <= 5% mismatched tags.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                // NOTE(review): the x86_64-linux and arm64-linux assets each
                // appear twice in the list below — presumably accidental
                // duplication in the prompt; confirm whether it is deliberate.
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                // The assistant reads the relevant region in three consecutive
                // chunks; each tool result echoes the matching fixture lines.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        ReadFileTool::NAME,
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        ReadFileTool::NAME,
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ))
    });
}
411
/// Eval: commenting out BlinkManager interactions scattered across a very
/// large file, where the agent only saw grep excerpts rather than the whole
/// file; graded against a set of acceptable diffs.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.59 (2025-07-14)
    // claude-sonnet-4                | 0.81 (2025-07-14)
    // gemini-2.5-pro                 | 0.95 (2025-07-14)
    // gemini-2.5-flash-preview-04-17 | 0.78 (2025-07-14)
    // gpt-4.1                        | 0.00 (2025-07-14) (follows edit_description too literally)

    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    let possible_diffs = vec![
        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
    ];
    // Note the low 0.51 pass-rate bar — this eval is known to be hard.
    eval_utils::eval(100, 0.51, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                // Simulated grep output: several disjoint line ranges of the
                // fixture, stitched together with a "Match found:" separator
                // *between* chunks (join does not prepend it to the first).
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        GrepTool::NAME,
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                // NOTE(review): the tool id jumps from tool_1 to tool_4 —
                // presumably this transcript was trimmed from a longer one.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
494
/// Eval: adding a `from_pixels` constructor plus tests, after a transcript of
/// grep probes that mostly came back empty; graded by an LLM judge.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    // Model                        | Date       | Pass rate
    // =========================================================
    // claude-4.0-sonnet            | 2025-06-14 | 0.99
    // claude-3.7-sonnet            | 2025-06-14 | 0.88
    // gemini-2.5-pro-preview-06-05 | 2025-06-16 | 0.98
    // gpt-4.1                      |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";

    // Note the looser 0.25 mismatched-tag budget for this eval.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.25), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
                // Two grep probes for an existing tests module come back empty…
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_2", GrepTool::NAME, "No matches found")],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_3", GrepTool::NAME, "No matches found")],
                ),
                // …then a broader `#[test]` grep returns canned matches from
                // other files to show the project's test conventions.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        GrepTool::NAME,
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                            use super::Font;
                            use crate::properties::{Stretch, Weight};

                            #[cfg(feature = \"source\")]
                            use crate::source::SystemSource;

                            static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                            #[cfg(feature = \"source\")]
                            #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_weight() {
                            // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_stretch() {
                            // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                            use crate::loaders::freetype::Font;

                            static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                            static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                            #[test]
                            fn get_pcf_postscript_name() {
                            let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                            assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                            }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                            use crate::properties::{Stretch, Weight};

                            #[test]
                            fn test_css_to_core_text_font_weight() {
                            // Exact matches
                            assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                            assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                            assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                            assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                            }

                            #[test]
                            fn test_css_to_core_text_font_stretch() {
                            // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        EditFileTool::NAME,
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ))
    });
}
710
/// Eval: creating a brand-new Python script (`EditFileMode::Create`), graded
/// by a custom assertion that the created file doesn't start with stray
/// whitespace, backticks, or a newline (i.e. the agent's "thoughts").
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_zode() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 | 1.0 (2025-05-22)
    // gpt-4.1                        | 1.0 (2025-05-22)

    let input_file_path = "root/zode.py";
    // No pre-existing file content: this eval creates the file from scratch.
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";

    eval_utils::eval(50, 1., mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                // The assistant reads both reference scripts in one turn
                // (two parallel tool uses, answered by two tool results).
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            ReadFileTool::NAME,
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            ReadFileTool::NAME,
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            EditFileTool::NAME,
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content.clone(),
            // Custom assertion: fail (score 0) if the created file begins
            // with any of the invalid start characters.
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ))
    });
}
815
/// Eval: adding a new test to a large file the agent only saw as an outline
/// plus a few targeted chunk reads; graded by an LLM judge that also checks
/// no existing test was modified.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_add_overwrite_test() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.65 (2025-06-14)
    // claude-sonnet-4                | 0.07 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 0.35 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |

    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";

    // 200 iterations with a deliberately low 0.5 pass-rate bar.
    eval_utils::eval(200, 0.5, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                // Simulated read_file response for a too-large file: a symbol
                // outline with line ranges instead of the file's contents.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        indoc! {"
                            pub struct ActionLog [L13-20]
                            tracked_buffers [L15]
                            edited_since_project_diagnostics_check [L17]
                            project [L19]
                            impl ActionLog [L22-498]
                            pub fn new [L24-30]
                            pub fn project [L32-34]
                            pub fn checked_project_diagnostics [L37-39]
                            pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                            fn track_buffer_internal [L46-101]
                            fn handle_buffer_event [L103-116]
                            fn handle_buffer_edited [L118-123]
                            fn handle_buffer_file_changed [L125-158]
                            async fn maintain_diff [L160-264]
                            pub fn buffer_read [L267-269]
                            pub fn buffer_created [L272-276]
                            pub fn buffer_edited [L279-287]
                            pub fn will_delete_buffer [L289-304]
                            pub fn keep_edits_in_range [L306-364]
                            pub fn reject_edits_in_ranges [L366-459]
                            pub fn keep_all_edits [L461-473]
                            pub fn changed_buffers [L476-482]
                            pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                            User [L618]
                            Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                            Created [L624]
                            Modified [L625]
                            Deleted [L626]
                            struct TrackedBuffer [L629-641]
                            buffer [L630]
                            base_text [L631]
                            unreviewed_changes [L632]
                            status [L633]
                            version [L634]
                            diff [L635]
                            snapshot [L636]
                            diff_update [L637]
                            _open_lsp_handle [L638]
                            _maintain_diff [L639]
                            _subscription [L640]
                            impl TrackedBuffer [L643-657]
                            fn has_changes [L644-650]
                            fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                            pub diff [L660]
                            mod tests [L664-1574]
                            fn init_logger [L678-682]
                            fn init_test [L684-691]
                            async fn test_keep_edits [L694-769]
                            async fn test_deletions [L772-854]
                            async fn test_overlapping_user_edits [L857-951]
                            async fn test_creating_files [L954-1010]
                            async fn test_deleting_files [L1013-1120]
                            async fn test_reject_edits [L1123-1255]
                            async fn test_reject_multiple_edits [L1258-1331]
                            async fn test_reject_deleted_file [L1334-1388]
                            async fn test_reject_created_file [L1391-1443]
                            async fn test_random_diffs [L1446-1535]
                            fn quiesce [L1510-1534]
                            struct HunkStatus [L1538-1542]
                            range [L1539]
                            diff_status [L1540]
                            old_text [L1541]
                            fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                // Targeted follow-up reads of the regions the outline pointed
                // at: test_creating_files, test_deleting_files, and the
                // buffer_created implementation.
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        ReadFileTool::NAME,
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        ReadFileTool::NAME,
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        ReadFileTool::NAME,
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            EditFileTool::NAME,
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ))
    });
}
1045
/// Eval: creating an empty file without the agent writing its "thoughts"
/// into it; graded by exact equality with the empty string.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.00 (2025-06-14)
    // claude-sonnet-4                | 1.00 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.00 (2025-05-21)
    // gemini-2.5-flash-preview-04-17 | 1.00 (2025-05-21)
    // gpt-4.1                        | 1.00 (2025-05-21)
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"

    // No pre-existing file; the created file must come out exactly empty.
    let input_file_content = None;
    let expected_output_content = String::new();

    eval_utils::eval(100, 0.99, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            ListDirectoryTool::NAME,
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        ListDirectoryTool::NAME,
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            EditFileTool::NAME,
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content.clone(),
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content.clone()),
        ))
    });
}
1122
1123fn message(
1124 role: Role,
1125 contents: impl IntoIterator<Item = MessageContent>,
1126) -> LanguageModelRequestMessage {
1127 LanguageModelRequestMessage {
1128 role,
1129 content: contents.into_iter().collect(),
1130 cache: false,
1131 reasoning_details: None,
1132 }
1133}
1134
1135fn text(text: impl Into<String>) -> MessageContent {
1136 MessageContent::Text(text.into())
1137}
1138
/// Returns the lines of `input` whose zero-based indices fall within `range`,
/// joined with `\n` (no trailing newline). Out-of-bounds indices are simply
/// skipped, so an over-long range yields whatever lines exist.
fn lines(input: &str, range: Range<usize>) -> String {
    let mut selected = Vec::with_capacity(range.len());
    for (index, line) in input.lines().enumerate() {
        if index >= range.end {
            break;
        }
        if index >= range.start {
            selected.push(line);
        }
    }
    selected.join("\n")
}
1147
1148fn tool_use(
1149 id: impl Into<Arc<str>>,
1150 name: impl Into<Arc<str>>,
1151 input: impl Serialize,
1152) -> MessageContent {
1153 MessageContent::ToolUse(LanguageModelToolUse {
1154 id: LanguageModelToolUseId::from(id.into()),
1155 name: name.into(),
1156 raw_input: serde_json::to_string_pretty(&input).unwrap(),
1157 input: serde_json::to_value(input).unwrap(),
1158 is_input_complete: true,
1159 thought_signature: None,
1160 })
1161}
1162
1163fn tool_result(
1164 id: impl Into<Arc<str>>,
1165 name: impl Into<Arc<str>>,
1166 result: impl Into<Arc<str>>,
1167) -> MessageContent {
1168 MessageContent::ToolResult(LanguageModelToolResult {
1169 tool_use_id: LanguageModelToolUseId::from(id.into()),
1170 tool_name: name.into(),
1171 is_error: false,
1172 content: LanguageModelToolResultContent::Text(result.into()),
1173 output: None,
1174 })
1175}
1176
/// Inputs for a single eval run: the recorded conversation, the edit-tool
/// invocation extracted from its final assistant message, the initial file
/// content (if any), and the assertion used to grade the result.
#[derive(Clone)]
struct EvalInput {
    conversation: Vec<LanguageModelRequestMessage>,
    edit_file_input: EditFileToolInput,
    // `None` when the eval starts from a nonexistent file (creation cases).
    input_content: Option<String>,
    assertion: EvalAssertion,
}
1184
1185impl EvalInput {
1186 fn from_conversation(
1187 conversation: Vec<LanguageModelRequestMessage>,
1188 input_content: Option<String>,
1189 assertion: EvalAssertion,
1190 ) -> Self {
1191 let msg = conversation.last().expect("Conversation must not be empty");
1192 if msg.role != Role::Assistant {
1193 panic!("Conversation must end with an assistant message");
1194 }
1195 let tool_use = msg
1196 .content
1197 .iter()
1198 .flat_map(|content| match content {
1199 MessageContent::ToolUse(tool_use) if tool_use.name == EditFileTool::NAME.into() => {
1200 Some(tool_use)
1201 }
1202 _ => None,
1203 })
1204 .next()
1205 .expect("Conversation must end with an edit_file tool use")
1206 .clone();
1207
1208 let edit_file_input: EditFileToolInput = serde_json::from_value(tool_use.input).unwrap();
1209
1210 EvalInput {
1211 conversation,
1212 edit_file_input,
1213 input_content,
1214 assertion,
1215 }
1216 }
1217}
1218
/// The observable result of running one edit through the agent.
#[derive(Clone)]
struct EvalSample {
    // Buffer text before the agent ran (empty when no input content was given).
    text_before: String,
    // Buffer text after the agent ran.
    text_after: String,
    // Raw edits and parser metrics reported by the edit agent.
    edit_output: EditAgentOutput,
    // Unified diff from `text_before` to `text_after`.
    diff: String,
}
1226
/// Object-safe interface for assertions that score an [`EvalSample`],
/// optionally consulting a judge model. Returned futures are local (non-Send)
/// since assertions run on the test foreground executor.
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1235
/// Blanket implementation so that any matching async closure can be used as
/// an [`AssertionFn`] without a dedicated wrapper type.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        // Invoke the closure and box the future locally; it need not be Send.
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1256
/// A cloneable, shareable assertion used to score eval results.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1259
impl EvalAssertion {
    /// Wraps an arbitrary async assertion closure.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Scores 100 when the edited text equals `expected` (ignoring empty
    /// lines), 0 otherwise. Does not consult the judge model.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                // Empty lines are stripped on both sides so purely
                // blank-line formatting differences don't fail the eval.
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Scores 100 when applying ANY of `expected_diffs` to the original
    /// text reproduces the edited text (ignoring empty lines), 0 otherwise.
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                // Panics if a candidate diff does not apply cleanly to the
                // original text.
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// Asks the judge model to evaluate the sample's diff against free-form
    /// `assertions`, expecting a `<score>NN</score>` tag in its reply.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                    reasoning_details: None,
                }],
                thinking_allowed: true,
                ..Default::default()
            };
            // Judge requests can hit provider rate limits just like agent
            // requests, so they get the same retry treatment.
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            // Accumulate the streamed completion into a single string.
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output)
                && let Some(score_match) = captures.get(1)
            {
                // The regex only captures digits; `unwrap_or(0)` guards the
                // overflow case rather than panicking.
                let score = score_match.as_str().parse().unwrap_or(0);
                return Ok(EvalAssertionOutcome {
                    score,
                    message: Some(output),
                });
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Runs the wrapped assertion against `input`.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1361
1362fn run_eval(eval: EvalInput) -> eval_utils::EvalOutput<EditEvalMetadata> {
1363 let dispatcher = gpui::TestDispatcher::new(rand::random());
1364 let mut cx = TestAppContext::build(dispatcher, None);
1365 let foreground_executor = cx.foreground_executor().clone();
1366 let result = foreground_executor.block_test(async {
1367 let test = EditAgentTest::new(&mut cx).await;
1368 test.eval(eval, &mut cx).await
1369 });
1370 cx.quit();
1371 match result {
1372 Ok(output) => eval_utils::EvalOutput {
1373 data: output.to_string(),
1374 outcome: if output.assertion.score < 80 {
1375 eval_utils::OutcomeKind::Failed
1376 } else {
1377 eval_utils::OutcomeKind::Passed
1378 },
1379 metadata: EditEvalMetadata {
1380 tags: output.sample.edit_output.parser_metrics.tags,
1381 mismatched_tags: output.sample.edit_output.parser_metrics.mismatched_tags,
1382 },
1383 },
1384 Err(e) => eval_utils::EvalOutput {
1385 data: format!("{e:?}"),
1386 outcome: eval_utils::OutcomeKind::Error,
1387 metadata: EditEvalMetadata {
1388 tags: 0,
1389 mismatched_tags: 0,
1390 },
1391 },
1392 }
1393}
1394
/// Result of a completed eval: the sample that was produced plus the
/// assertion's score and optional explanatory message.
#[derive(Clone)]
struct EditEvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}
1400
impl Display for EditEvalOutput {
    /// Renders a human-readable report: score, optional judge message, the
    /// unified diff, parser metrics, and the raw edits emitted by the model.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Score: {:?}", self.assertion.score)?;
        if let Some(message) = self.assertion.message.as_ref() {
            writeln!(f, "Message: {}", message)?;
        }

        writeln!(f, "Diff:\n{}", self.sample.diff)?;

        writeln!(
            f,
            "Parser Metrics:\n{:#?}",
            self.sample.edit_output.parser_metrics
        )?;
        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
        Ok(())
    }
}
1419
/// Harness holding the agent under test, the project it edits, and the
/// model used to judge results.
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}
1425
impl EditAgentTest {
    /// Builds a harness backed by real (production) model providers: a fake
    /// filesystem, a test project rooted at `/root`, and agent/judge models
    /// selected via the `ZED_AGENT_MODEL` / `ZED_JUDGE_MODEL` env vars
    /// (both defaulting to `anthropic/claude-sonnet-4-latest`).
    async fn new(cx: &mut TestAppContext) -> Self {
        // Real network requests block; let the test executor park instead of
        // panicking on blocked threads.
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
            // NOTE(review): `settings::init` is also called at the top of
            // this closure — confirm whether this second call is intentional.
            settings::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store, client.clone(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
        )
        .unwrap();

        // Kick off authentication for every registered provider so the model
        // lookups below can succeed regardless of which provider is selected.
        let authenticate_provider_tasks = cx.update(|cx| {
            LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
                registry
                    .providers()
                    .iter()
                    .map(|p| p.authenticate(cx))
                    .collect::<Vec<_>>()
            })
        });
        // Wait for authentication, then resolve both selected models.
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    futures::future::join_all(authenticate_provider_tasks).await;
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();

        Self {
            agent: EditAgent::new(
                agent_model,
                project.clone(),
                action_log,
                Templates::new(),
                edit_format,
            ),
            project,
            judge_model,
        }
    }

    /// Resolves a [`SelectedModel`] to a concrete model, authenticating its
    /// provider first. Panics when the provider or model is not registered.
    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        cx.update(|cx| {
            let registry = LanguageModelRegistry::read_global(cx);
            let provider = registry
                .provider(&selected_model.provider)
                .expect("Provider not found");
            provider.authenticate(cx)
        })
        .await?;
        Ok(cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .unwrap_or_else(|| panic!("Model {} not found", selected_model.model.0));
            model
        }))
    }

    /// Replays `eval`'s conversation through the edit agent against a
    /// project buffer, then scores the resulting text with the eval's
    /// assertion and the judge model.
    async fn eval(&self, mut eval: EvalInput, cx: &mut TestAppContext) -> Result<EditEvalOutput> {
        // Make sure the last message in the conversation is cached.
        eval.conversation.last_mut().unwrap().cache = true;

        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();

        let tools = crate::built_in_tools().collect::<Vec<_>>();

        // Render the standard system prompt for a single synthetic worktree
        // so the conversation matches what the real agent would send.
        let system_prompt = {
            let worktrees = vec![WorktreeContext {
                root_name: "root".to_string(),
                abs_path: Path::new("/path/to/root").into(),
                rules_file: None,
            }];
            let project_context = ProjectContext::new(worktrees, Vec::default());
            let tool_names = tools
                .iter()
                .map(|tool| tool.name.clone().into())
                .collect::<Vec<_>>();
            let template = crate::SystemPromptTemplate {
                project: &project_context,
                available_tools: tool_names,
                model_name: None,
            };
            let templates = Templates::new();
            template.render(&templates).unwrap()
        };

        // Prepend the rendered system prompt unless the eval's conversation
        // already supplies its own.
        let has_system_prompt = eval
            .conversation
            .first()
            .is_some_and(|msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
                reasoning_details: None,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let conversation = LanguageModelRequest {
            messages,
            tools,
            thinking_allowed: true,
            ..Default::default()
        };

        // Edit mode seeds the buffer with the eval's input content and asks
        // the agent to edit in place; any other mode overwrites from scratch.
        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            retry_on_rate_limit(async || {
                self.agent
                    .edit(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        } else {
            retry_on_rate_limit(async || {
                self.agent
                    .overwrite(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        };

        // Snapshot the edited buffer and package everything for scoring.
        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EditEvalOutput { assertion, sample })
    }
}
1627
/// Invokes `request` repeatedly, retrying on rate-limit, overload, and
/// transient I/O errors with provider-supplied or default delays plus random
/// jitter. Returns the last response once the retry budget (`MAX_RETRIES`)
/// is exhausted, or immediately on success / non-retryable errors.
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
    const MAX_RETRIES: usize = 20;
    let mut attempt = 0;

    loop {
        attempt += 1;
        let response = request().await;

        // Out of retry budget: return whatever we got, success or failure.
        if attempt >= MAX_RETRIES {
            return response;
        }

        // `Some(delay)` means "sleep then retry"; `None` means return now.
        let retry_delay = match &response {
            Ok(_) => None,
            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
                Some(err) => match &err {
                    // Rate limiting / overload: honor the server-provided
                    // retry-after when present, defaulting to 5 seconds.
                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
                    }
                    LanguageModelCompletionError::UpstreamProviderError {
                        status,
                        retry_after,
                        ..
                    } => {
                        // Only retry for specific status codes
                        // (429, 503, and the non-standard 529 "overloaded").
                        let should_retry = matches!(
                            *status,
                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
                        ) || status.as_u16() == 529;

                        if should_retry {
                            // Use server-provided retry_after if available, otherwise use default
                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
                        } else {
                            None
                        }
                    }
                    LanguageModelCompletionError::ApiReadResponseError { .. }
                    | LanguageModelCompletionError::ApiInternalServerError { .. }
                    | LanguageModelCompletionError::HttpSend { .. } => {
                        // Exponential backoff for transient I/O and internal server errors
                        // (1s, 2s, 4s, ... capped at 30s).
                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
                    }
                    _ => None,
                },
                _ => None,
            },
        };

        if let Some(retry_after) = retry_delay {
            // Add up to 100% random jitter to spread out concurrent retries.
            let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
            // This code does not use the gpui::executor
            #[allow(clippy::disallowed_methods)]
            smol::Timer::after(retry_after + jitter).await;
        } else {
            return response;
        }
    }
}
1689
/// Score produced by an assertion (built-in assertions emit 0 or 100; the
/// judge path parses whatever the model returns), with an optional
/// explanatory message such as the judge's raw reply.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}
1695
/// Template variables for the `diff_judge.hbs` prompt sent to the judge
/// model: the diff under review and the free-form assertions to check.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}
1701
impl Template for DiffJudgeTemplate {
    // Handlebars template file used to render the judge prompt.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1705
/// Removes every line that is empty or whitespace-only and joins the rest
/// with `\n`, so comparisons ignore blank-line formatting differences.
fn strip_empty_lines(text: &str) -> String {
    let mut kept = Vec::new();
    for line in text.lines() {
        // Keep a line iff it contains at least one non-whitespace character
        // (equivalent to `!line.trim().is_empty()`).
        if line.chars().any(|c| !c.is_whitespace()) {
            kept.push(line);
        }
    }
    kept.join("\n")
}