1use super::*;
2use crate::{
3 EditFileMode, EditFileToolInput, GrepToolInput, ListDirectoryToolInput, ReadFileToolInput,
4};
5use Role::*;
6use client::{Client, UserStore};
7use eval_utils::{EvalOutput, EvalOutputProcessor, OutcomeKind};
8use fs::FakeFs;
9use futures::{FutureExt, future::LocalBoxFuture};
10use gpui::{AppContext, TestAppContext, Timer};
11use http_client::StatusCode;
12use indoc::{formatdoc, indoc};
13use language_model::{
14 LanguageModelRegistry, LanguageModelToolResult, LanguageModelToolResultContent,
15 LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
16};
17use project::Project;
18use prompt_store::{ProjectContext, WorktreeContext};
19use rand::prelude::*;
20use reqwest_client::ReqwestClient;
21use serde_json::json;
22use std::{
23 fmt::{self, Display},
24 path::Path,
25 str::FromStr,
26 time::Duration,
27};
28use util::path;
29
/// Aggregates results across an eval run, tracking how often the edit
/// agent emitted mismatched tags so `assert` can enforce a budget.
#[derive(Default, Clone, Debug)]
struct EditAgentOutputProcessor {
    // Maximum tolerated ratio of mismatched tags to total tags; exceeding
    // it makes `assert` panic.
    mismatched_tag_threshold: f32,
    // Running total of tags seen across all Passed/Failed eval outputs.
    cumulative_tags: usize,
    // Running total of mismatched tags across all Passed/Failed eval outputs.
    cumulative_mismatched_tags: usize,
    // Outputs retained so they can be printed when the threshold is exceeded.
    eval_outputs: Vec<EvalOutput<EditEvalMetadata>>,
}
37
38fn mismatched_tag_threshold(mismatched_tag_threshold: f32) -> EditAgentOutputProcessor {
39 EditAgentOutputProcessor {
40 mismatched_tag_threshold,
41 cumulative_tags: 0,
42 cumulative_mismatched_tags: 0,
43 eval_outputs: Vec::new(),
44 }
45}
46
/// Per-sample tag statistics attached to each eval output.
#[derive(Clone, Debug)]
struct EditEvalMetadata {
    // Total number of tags the agent emitted for this sample.
    tags: usize,
    // Number of those tags that were mismatched.
    mismatched_tags: usize,
}
52
53impl EvalOutputProcessor for EditAgentOutputProcessor {
54 type Metadata = EditEvalMetadata;
55
56 fn process(&mut self, output: &EvalOutput<Self::Metadata>) {
57 if matches!(output.outcome, OutcomeKind::Passed | OutcomeKind::Failed) {
58 self.cumulative_mismatched_tags += output.metadata.mismatched_tags;
59 self.cumulative_tags += output.metadata.tags;
60 self.eval_outputs.push(output.clone());
61 }
62 }
63
64 fn assert(&mut self) {
65 let mismatched_tag_ratio =
66 self.cumulative_mismatched_tags as f32 / self.cumulative_tags as f32;
67 if mismatched_tag_ratio > self.mismatched_tag_threshold {
68 for eval_output in &self.eval_outputs {
69 println!("{}", eval_output.data);
70 }
71 panic!(
72 "Too many mismatched tags: {:?}",
73 self.cumulative_mismatched_tags
74 );
75 }
76 }
77}
78
/// Scripted eval: the conversation ends with an `edit_file` tool use asking
/// the agent to extract `handle_command_output` from `run_git_blame`; the
/// result must match one of several known-good diffs.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // Model | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet | 0.99 (2025-06-14)
    // claude-sonnet-4 | 0.97 (2025-06-14)
    // gemini-2.5-pro-06-05 | 0.98 (2025-06-16)
    // gemini-2.5-flash | 0.11 (2025-05-22)
    // gpt-4.1 | 1.00 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // Any one of these diffs counts as a pass.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    // 100 iterations, 0.95 required pass rate, 5% mismatched-tag budget.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                            start_byte: None,
                            max_bytes: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
154
/// Scripted eval: the agent must delete exactly the `run_git_blame`
/// function; the edited file must equal the `after.rs` fixture.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Model | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05 | 1.0 (2025-06-16)
    // gemini-2.5-flash |
    // gpt-4.1 |

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";

    // 100 iterations, 0.95 required pass rate, 5% mismatched-tag budget.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                            start_byte: None,
                            max_bytes: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            // Exact-match assertion: output must be byte-identical to the fixture.
            EvalAssertion::assert_eq(output_file_content),
        ))
    });
}
217
/// Scripted eval: translate all doc comments in the fixture to Italian;
/// graded by an LLM judge on the resulting diff (requires 100% pass rate).
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_translate_doc_comments() {
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";

    // 200 iterations, required pass rate 1.0, 5% mismatched-tag budget.
    eval_utils::eval(200, 1., mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                            start_byte: None,
                            max_bytes: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            // LLM-judged assertion rather than an exact diff match.
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ))
    });
}
280
/// Scripted eval: rewrite `compile_parser_to_wasm` to use wasi-sdk instead
/// of emscripten. The conversation simulates three partial `read_file`
/// calls (line-ranged) before the `edit_file` tool use; graded by judge.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.96 (2025-06-14)
    // claude-sonnet-4 | 0.11 (2025-06-14)
    // gemini-2.5-pro-preview-latest | 0.99 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |

    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";

    // NOTE(review): the wasi-sdk asset list in the prompt below repeats the
    // x86_64-linux and arm64-linux entries — confirm whether the duplication
    // is intentional before "fixing" it, since it changes the eval prompt.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                            start_byte: None,
                            max_bytes: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                            start_byte: None,
                            max_bytes: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                            start_byte: None,
                            max_bytes: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ))
    });
}
410
/// Scripted eval: comment out BlinkManager interactions. The conversation
/// feeds grep results built from several disjoint line ranges of the
/// fixture before the `edit_file` tool use; any of four diffs passes.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.59 (2025-07-14)
    // claude-sonnet-4 | 0.81 (2025-07-14)
    // gemini-2.5-pro | 0.95 (2025-07-14)
    // gemini-2.5-flash-preview-04-17 | 0.78 (2025-07-14)
    // gpt-4.1 | 0.00 (2025-07-14) (follows edit_description too literally)

    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    let possible_diffs = vec![
        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
    ];
    // Lower pass-rate bar (0.51) than the other evals — see table above.
    eval_utils::eval(100, 0.51, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        // Simulated grep output: disjoint excerpts of the
                        // fixture joined by a "Match found" separator.
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
493
/// Scripted eval: add a `from_pixels` constructor plus tests. The
/// conversation includes two empty grep results and one with canned
/// matches, so the agent must pick a sensible insertion point; judged
/// on the diff.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    // Model | Date | Pass rate
    // =========================================================
    // claude-4.0-sonnet | 2025-06-14 | 0.99
    // claude-3.7-sonnet | 2025-06-14 | 0.88
    // gemini-2.5-pro-preview-06-05 | 2025-06-16 | 0.98
    // gpt-4.1 |

    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";

    // Note: the mismatched-tag budget here is 0.25, looser than other evals.
    eval_utils::eval(100, 0.95, mismatched_tag_threshold(0.25), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                            start_byte: None,
                            max_bytes: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        // Canned grep output listing existing #[test] sites.
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ))
    });
}
701
/// Scripted eval: create a new Python script from scratch (Create mode).
/// The custom assertion only checks the generated file does not start
/// with whitespace, a backtick, or a newline.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_zode() {
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.0 (2025-06-14)
    // claude-sonnet-4 | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 | 1.0 (2025-05-22)
    // gpt-4.1 | 1.0 (2025-05-22)

    let input_file_path = "root/zode.py";
    // No pre-existing file: this eval creates the file from scratch.
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";

    eval_utils::eval(50, 1., mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                                start_byte: None,
                                max_bytes: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                                start_byte: None,
                                max_bytes: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content.clone(),
            EvalAssertion::new(async move |sample, _, _cx| {
                // Characters the generated file must not begin with (stray
                // indentation, a markdown fence, or a leading blank line).
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ))
    });
}
810
/// Scripted eval: add a new overwrite test to `action_log.rs`. The first
/// read returns a symbol outline (not file text), then three ranged reads
/// of the fixture; judged on the diff leaving existing tests untouched.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_add_overwrite_test() {
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 0.65 (2025-06-14)
    // claude-sonnet-4 | 0.07 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 0.35 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1 |

    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";

    // Lower bar (0.5) than most evals — see pass rates above.
    eval_utils::eval(200, 0.5, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                            start_byte: None,
                            max_bytes: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        // Canned outline-style read_file response.
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                             struct HunkStatus [L1538-1542]
                              range [L1539]
                              diff_status [L1540]
                              old_text [L1541]
                             fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                                start_byte: None,
                                max_bytes: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                                start_byte: None,
                                max_bytes: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                                start_byte: None,
                                max_bytes: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ))
    });
}
1048
/// Scripted eval: create a new, truly empty file (Create mode); passes only
/// if the resulting file content equals the empty string.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Model | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet | 1.00 (2025-06-14)
    // claude-sonnet-4 | 1.00 (2025-06-14)
    // gemini-2.5-pro-preview-03-25 | 1.00 (2025-05-21)
    // gemini-2.5-flash-preview-04-17 | 1.00 (2025-05-21)
    // gpt-4.1 | 1.00 (2025-05-21)
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"

    // No pre-existing file; the expected result is an empty file.
    let input_file_content = None;
    let expected_output_content = String::new();

    eval_utils::eval(100, 0.99, mismatched_tag_threshold(0.05), move || {
        run_eval(EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content.clone(),
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content.clone()),
        ))
    });
}
1125
1126fn message(
1127 role: Role,
1128 contents: impl IntoIterator<Item = MessageContent>,
1129) -> LanguageModelRequestMessage {
1130 LanguageModelRequestMessage {
1131 role,
1132 content: contents.into_iter().collect(),
1133 cache: false,
1134 reasoning_details: None,
1135 }
1136}
1137
1138fn text(text: impl Into<String>) -> MessageContent {
1139 MessageContent::Text(text.into())
1140}
1141
/// Returns the lines of `input` whose zero-based indices fall within
/// `range`, joined with `\n` (no trailing newline). Indices past the end
/// of the input simply yield fewer — possibly zero — lines.
fn lines(input: &str, range: Range<usize>) -> String {
    let mut selected = Vec::new();
    for (index, line) in input.lines().enumerate() {
        if index >= range.end {
            break;
        }
        if index >= range.start {
            selected.push(line);
        }
    }
    selected.join("\n")
}
1150
1151fn tool_use(
1152 id: impl Into<Arc<str>>,
1153 name: impl Into<Arc<str>>,
1154 input: impl Serialize,
1155) -> MessageContent {
1156 MessageContent::ToolUse(LanguageModelToolUse {
1157 id: LanguageModelToolUseId::from(id.into()),
1158 name: name.into(),
1159 raw_input: serde_json::to_string_pretty(&input).unwrap(),
1160 input: serde_json::to_value(input).unwrap(),
1161 is_input_complete: true,
1162 thought_signature: None,
1163 })
1164}
1165
1166fn tool_result(
1167 id: impl Into<Arc<str>>,
1168 name: impl Into<Arc<str>>,
1169 result: impl Into<Arc<str>>,
1170) -> MessageContent {
1171 MessageContent::ToolResult(LanguageModelToolResult {
1172 tool_use_id: LanguageModelToolUseId::from(id.into()),
1173 tool_name: name.into(),
1174 is_error: false,
1175 content: LanguageModelToolResultContent::Text(result.into()),
1176 output: None,
1177 })
1178}
1179
/// Everything needed to run one eval sample: the scripted conversation,
/// the parsed `edit_file` input it ends with, the starting file content
/// (None when the file is created from scratch), and the pass/fail check.
#[derive(Clone)]
struct EvalInput {
    conversation: Vec<LanguageModelRequestMessage>,
    // Parsed from the final assistant message's `edit_file` tool use.
    edit_file_input: EditFileToolInput,
    input_content: Option<String>,
    assertion: EvalAssertion,
}
1187
1188impl EvalInput {
1189 fn from_conversation(
1190 conversation: Vec<LanguageModelRequestMessage>,
1191 input_content: Option<String>,
1192 assertion: EvalAssertion,
1193 ) -> Self {
1194 let msg = conversation.last().expect("Conversation must not be empty");
1195 if msg.role != Role::Assistant {
1196 panic!("Conversation must end with an assistant message");
1197 }
1198 let tool_use = msg
1199 .content
1200 .iter()
1201 .flat_map(|content| match content {
1202 MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1203 Some(tool_use)
1204 }
1205 _ => None,
1206 })
1207 .next()
1208 .expect("Conversation must end with an edit_file tool use")
1209 .clone();
1210
1211 let edit_file_input: EditFileToolInput = serde_json::from_value(tool_use.input).unwrap();
1212
1213 EvalInput {
1214 conversation,
1215 edit_file_input,
1216 input_content,
1217 assertion,
1218 }
1219 }
1220}
1221
/// The materialized result of one edit run, handed to assertions for grading.
#[derive(Clone)]
struct EvalSample {
    // Buffer contents before the agent edited it.
    text_before: String,
    // Buffer contents after the agent edited it.
    text_after: String,
    // Raw agent output, including parser metrics and the raw edit stream.
    edit_output: EditAgentOutput,
    // Unified diff from `text_before` to `text_after`.
    diff: String,
}
1229
/// Object-safe interface for grading an `EvalSample`, optionally consulting
/// a judge model. Returns a boxed local future so implementors can be stored
/// behind `Arc<dyn AssertionFn>`.
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1238
// Blanket impl: any matching async closure is usable as an `AssertionFn`.
// The closure's future is boxed via `boxed_local` so it fits the trait's
// `LocalBoxFuture` return type.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1259
/// A cloneable, type-erased assertion used to grade an eval sample.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1262
impl EvalAssertion {
    /// Wraps an async closure as a type-erased, cloneable assertion.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Scores 100 iff the edited text equals `expected`, ignoring blank lines.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                // Blank lines are stripped on both sides so purely-vertical
                // whitespace differences don't fail the eval.
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Scores 100 iff applying any one of `expected_diffs` to the original
    /// text reproduces the edited text (again ignoring blank lines).
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// Asks the judge model to grade the sample's diff against free-form
    /// `assertions`, expecting a `<score>N</score>` tag in its reply.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                    reasoning_details: None,
                }],
                thinking_allowed: true,
                ..Default::default()
            };
            // The judge shares provider rate limits with the agent under
            // test, so retry its completion requests too.
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            // Accumulate the streamed chunks into one string before parsing.
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output)
                && let Some(score_match) = captures.get(1)
            {
                // Non-numeric or overflowing scores degrade to 0 rather than
                // failing the eval with an error.
                let score = score_match.as_str().parse().unwrap_or(0);
                return Ok(EvalAssertionOutcome {
                    score,
                    message: Some(output),
                });
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Runs this assertion against `input`, passing `judge_model` along for
    /// judge-based assertions.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
1364
1365fn run_eval(eval: EvalInput) -> eval_utils::EvalOutput<EditEvalMetadata> {
1366 let dispatcher = gpui::TestDispatcher::new(StdRng::from_os_rng());
1367 let mut cx = TestAppContext::build(dispatcher, None);
1368 let result = cx.executor().block_test(async {
1369 let test = EditAgentTest::new(&mut cx).await;
1370 test.eval(eval, &mut cx).await
1371 });
1372 cx.quit();
1373 match result {
1374 Ok(output) => eval_utils::EvalOutput {
1375 data: output.to_string(),
1376 outcome: if output.assertion.score < 80 {
1377 eval_utils::OutcomeKind::Failed
1378 } else {
1379 eval_utils::OutcomeKind::Passed
1380 },
1381 metadata: EditEvalMetadata {
1382 tags: output.sample.edit_output.parser_metrics.tags,
1383 mismatched_tags: output.sample.edit_output.parser_metrics.mismatched_tags,
1384 },
1385 },
1386 Err(e) => eval_utils::EvalOutput {
1387 data: format!("{e:?}"),
1388 outcome: eval_utils::OutcomeKind::Error,
1389 metadata: EditEvalMetadata {
1390 tags: 0,
1391 mismatched_tags: 0,
1392 },
1393 },
1394 }
1395}
1396
/// Result of a single eval run: the graded sample plus its assertion outcome.
#[derive(Clone)]
struct EditEvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}
1402
1403impl Display for EditEvalOutput {
1404 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1405 writeln!(f, "Score: {:?}", self.assertion.score)?;
1406 if let Some(message) = self.assertion.message.as_ref() {
1407 writeln!(f, "Message: {}", message)?;
1408 }
1409
1410 writeln!(f, "Diff:\n{}", self.sample.diff)?;
1411
1412 writeln!(
1413 f,
1414 "Parser Metrics:\n{:#?}",
1415 self.sample.edit_output.parser_metrics
1416 )?;
1417 writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1418 Ok(())
1419 }
1420}
1421
/// Test harness holding the agent under test, its project, and the model
/// used to judge diff quality.
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}
1427
1428impl EditAgentTest {
1429 async fn new(cx: &mut TestAppContext) -> Self {
1430 cx.executor().allow_parking();
1431
1432 let fs = FakeFs::new(cx.executor());
1433 cx.update(|cx| {
1434 settings::init(cx);
1435 gpui_tokio::init(cx);
1436 let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1437 cx.set_http_client(http_client);
1438 let client = Client::production(cx);
1439 let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1440 settings::init(cx);
1441 language_model::init(client.clone(), cx);
1442 language_models::init(user_store, client.clone(), cx);
1443 });
1444
1445 fs.insert_tree("/root", json!({})).await;
1446 let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1447 let agent_model = SelectedModel::from_str(
1448 &std::env::var("ZED_AGENT_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
1449 )
1450 .unwrap();
1451 let judge_model = SelectedModel::from_str(
1452 &std::env::var("ZED_JUDGE_MODEL").unwrap_or("anthropic/claude-sonnet-4-latest".into()),
1453 )
1454 .unwrap();
1455
1456 let authenticate_provider_tasks = cx.update(|cx| {
1457 LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
1458 registry
1459 .providers()
1460 .iter()
1461 .map(|p| p.authenticate(cx))
1462 .collect::<Vec<_>>()
1463 })
1464 });
1465 let (agent_model, judge_model) = cx
1466 .update(|cx| {
1467 cx.spawn(async move |cx| {
1468 futures::future::join_all(authenticate_provider_tasks).await;
1469 let agent_model = Self::load_model(&agent_model, cx).await;
1470 let judge_model = Self::load_model(&judge_model, cx).await;
1471 (agent_model.unwrap(), judge_model.unwrap())
1472 })
1473 })
1474 .await;
1475 let action_log = cx.new(|_| ActionLog::new(project.clone()));
1476
1477 let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();
1478
1479 Self {
1480 agent: EditAgent::new(
1481 agent_model,
1482 project.clone(),
1483 action_log,
1484 Templates::new(),
1485 edit_format,
1486 ),
1487 project,
1488 judge_model,
1489 }
1490 }
1491
1492 async fn load_model(
1493 selected_model: &SelectedModel,
1494 cx: &mut AsyncApp,
1495 ) -> Result<Arc<dyn LanguageModel>> {
1496 cx.update(|cx| {
1497 let registry = LanguageModelRegistry::read_global(cx);
1498 let provider = registry
1499 .provider(&selected_model.provider)
1500 .expect("Provider not found");
1501 provider.authenticate(cx)
1502 })?
1503 .await?;
1504 cx.update(|cx| {
1505 let models = LanguageModelRegistry::read_global(cx);
1506 let model = models
1507 .available_models(cx)
1508 .find(|model| {
1509 model.provider_id() == selected_model.provider
1510 && model.id() == selected_model.model
1511 })
1512 .unwrap_or_else(|| panic!("Model {} not found", selected_model.model.0));
1513 model
1514 })
1515 }
1516
1517 async fn eval(&self, mut eval: EvalInput, cx: &mut TestAppContext) -> Result<EditEvalOutput> {
1518 // Make sure the last message in the conversation is cached.
1519 eval.conversation.last_mut().unwrap().cache = true;
1520
1521 let path = self
1522 .project
1523 .read_with(cx, |project, cx| {
1524 project.find_project_path(eval.edit_file_input.path, cx)
1525 })
1526 .unwrap();
1527 let buffer = self
1528 .project
1529 .update(cx, |project, cx| project.open_buffer(path, cx))
1530 .await
1531 .unwrap();
1532
1533 let tools = crate::built_in_tools().collect::<Vec<_>>();
1534
1535 let system_prompt = {
1536 let worktrees = vec![WorktreeContext {
1537 root_name: "root".to_string(),
1538 abs_path: Path::new("/path/to/root").into(),
1539 rules_file: None,
1540 }];
1541 let project_context = ProjectContext::new(worktrees, Vec::default());
1542 let tool_names = tools
1543 .iter()
1544 .map(|tool| tool.name.clone().into())
1545 .collect::<Vec<_>>();
1546 let template = crate::SystemPromptTemplate {
1547 project: &project_context,
1548 available_tools: tool_names,
1549 model_name: None,
1550 };
1551 let templates = Templates::new();
1552 template.render(&templates).unwrap()
1553 };
1554
1555 let has_system_prompt = eval
1556 .conversation
1557 .first()
1558 .is_some_and(|msg| msg.role == Role::System);
1559 let messages = if has_system_prompt {
1560 eval.conversation
1561 } else {
1562 [LanguageModelRequestMessage {
1563 role: Role::System,
1564 content: vec![MessageContent::Text(system_prompt)],
1565 cache: true,
1566 reasoning_details: None,
1567 }]
1568 .into_iter()
1569 .chain(eval.conversation)
1570 .collect::<Vec<_>>()
1571 };
1572
1573 let conversation = LanguageModelRequest {
1574 messages,
1575 tools,
1576 thinking_allowed: true,
1577 ..Default::default()
1578 };
1579
1580 let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1581 if let Some(input_content) = eval.input_content.as_deref() {
1582 buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1583 }
1584 retry_on_rate_limit(async || {
1585 self.agent
1586 .edit(
1587 buffer.clone(),
1588 eval.edit_file_input.display_description.clone(),
1589 &conversation,
1590 &mut cx.to_async(),
1591 )
1592 .0
1593 .await
1594 })
1595 .await?
1596 } else {
1597 retry_on_rate_limit(async || {
1598 self.agent
1599 .overwrite(
1600 buffer.clone(),
1601 eval.edit_file_input.display_description.clone(),
1602 &conversation,
1603 &mut cx.to_async(),
1604 )
1605 .0
1606 .await
1607 })
1608 .await?
1609 };
1610
1611 let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1612 let sample = EvalSample {
1613 edit_output,
1614 diff: language::unified_diff(
1615 eval.input_content.as_deref().unwrap_or_default(),
1616 &buffer_text,
1617 ),
1618 text_before: eval.input_content.unwrap_or_default(),
1619 text_after: buffer_text,
1620 };
1621 let assertion = eval
1622 .assertion
1623 .run(&sample, self.judge_model.clone(), cx)
1624 .await?;
1625
1626 Ok(EditEvalOutput { assertion, sample })
1627 }
1628}
1629
/// Runs `request` repeatedly, retrying on rate limits, server overload, and
/// transient errors; returns the first success or the final failure.
///
/// At most `MAX_RETRIES` attempts are made; the last attempt's result is
/// returned as-is. Rate-limit/overload errors wait the server-suggested
/// `retry_after` (defaulting to 5s); transient I/O and internal server errors
/// use exponential backoff (2^(attempt-1) seconds, capped at 30s). Random
/// jitter of up to 100% of the delay is added to every wait.
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
    const MAX_RETRIES: usize = 20;
    let mut attempt = 0;

    loop {
        attempt += 1;
        let response = request().await;

        // Out of retries: surface whatever we got, success or failure.
        if attempt >= MAX_RETRIES {
            return response;
        }

        // `Some(delay)` means "retry after `delay`"; `None` means return now.
        let retry_delay = match &response {
            Ok(_) => None,
            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
                Some(err) => match &err {
                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
                    }
                    LanguageModelCompletionError::UpstreamProviderError {
                        status,
                        retry_after,
                        ..
                    } => {
                        // Only retry for specific status codes
                        // (429, 503, and 529).
                        let should_retry = matches!(
                            *status,
                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
                        ) || status.as_u16() == 529;

                        if should_retry {
                            // Use server-provided retry_after if available, otherwise use default
                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
                        } else {
                            None
                        }
                    }
                    LanguageModelCompletionError::ApiReadResponseError { .. }
                    | LanguageModelCompletionError::ApiInternalServerError { .. }
                    | LanguageModelCompletionError::HttpSend { .. } => {
                        // Exponential backoff for transient I/O and internal server errors
                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
                    }
                    _ => None,
                },
                _ => None,
            },
        };

        if let Some(retry_after) = retry_delay {
            // Jitter spreads retries out so parallel evals don't hammer the
            // provider in lockstep.
            let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
            Timer::after(retry_after + jitter).await;
        } else {
            return response;
        }
    }
}
1689
/// Outcome of grading one sample: a 0-100 score plus an optional message
/// (typically the judge model's raw reply).
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}
1695
/// Template context for the diff-judging prompt (`diff_judge.hbs`).
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    // Unified diff to be graded.
    diff: String,
    // Free-form assertions the judge checks the diff against.
    assertions: &'static str,
}
1701
impl Template for DiffJudgeTemplate {
    // Handlebars template file rendered when judging a diff.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1705
/// Drops blank (whitespace-only) lines from `text` and rejoins the remainder
/// with `\n`.
fn strip_empty_lines(text: &str) -> String {
    let mut kept = Vec::new();
    for line in text.lines() {
        if line.trim().is_empty() {
            continue;
        }
        kept.push(line);
    }
    kept.join("\n")
}