1use super::*;
2use crate::{
3 ReadFileToolInput,
4 edit_file_tool::{EditFileMode, EditFileToolInput},
5 grep_tool::GrepToolInput,
6 list_directory_tool::ListDirectoryToolInput,
7};
8use Role::*;
9use assistant_tool::ToolRegistry;
10use client::{Client, UserStore};
11use collections::HashMap;
12use fs::FakeFs;
13use futures::{FutureExt, future::LocalBoxFuture};
14use gpui::{AppContext, TestAppContext, Timer};
15use http_client::StatusCode;
16use indoc::{formatdoc, indoc};
17use language_model::{
18 LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
19 LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
20};
21use project::Project;
22use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
23use rand::prelude::*;
24use reqwest_client::ReqwestClient;
25use serde_json::json;
26use std::{
27 cmp::Reverse,
28 fmt::{self, Display},
29 io::Write as _,
30 path::Path,
31 str::FromStr,
32 sync::mpsc,
33 time::Duration,
34};
35use util::path;
36
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // Model                 | Pass rate
    // ----------------------|----------
    // claude-3.7-sonnet     | 0.99 (2025-06-14)
    // claude-sonnet-4       | 0.97 (2025-06-14)
    // gemini-2.5-pro-06-05  | 0.98 (2025-06-16)
    // gemini-2.5-flash      | 0.11 (2025-05-22)
    // gpt-4.1               | 1.00 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // Several output shapes are acceptable; a sample passes if it matches any of them.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    // NOTE(review): numeric args appear to be (sample count, min pass rate,
    // tolerance) — confirm against `eval`'s signature.
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                // User asks for the extraction.
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                // Scripted assistant turn: read the whole file.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // Final assistant turn: the edit_file tool use under evaluation.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
113
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Checks the agent can delete a single function without touching its callers.
    //
    // Model                 | Pass rate
    // ----------------------|----------
    // claude-3.7-sonnet     | 1.0 (2025-06-14)
    // claude-sonnet-4       | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05  | 1.0 (2025-06-16)
    // gemini-2.5-flash      |
    // gpt-4.1               |
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    // NOTE(review): numeric args appear to be (sample count, min pass rate,
    // tolerance) — confirm against `eval`'s signature.
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                // Scripted assistant turn: read the whole file.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // Final assistant turn: the edit_file tool use under evaluation.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            // Exactly one correct output here, so compare against the fixture.
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}
175
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    // Checks the agent can rewrite every doc comment in a file without
    // touching the code. Graded by an LLM judge rather than exact match.
    //
    // Model                          | Pass rate
    // ===============================|==========
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    // NOTE(review): numeric args appear to be (sample count, min pass rate,
    // tolerance) — confirm against `eval`'s signature.
    eval(
        200,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                // Scripted assistant turn: read the whole file.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // Final assistant turn: the edit_file tool use under evaluation.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}
237
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Checks the agent can perform a larger rewrite after reading the target
    // function in three chunked `read_file` calls.
    //
    // Model                          | Pass rate
    // ===============================|==========
    // claude-3.7-sonnet              | 0.96 (2025-06-14)
    // claude-sonnet-4                | 0.11 (2025-06-14)
    // gemini-2.5-pro-preview-latest  | 0.99 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    // NOTE(review): numeric args appear to be (sample count, min pass rate,
    // tolerance) — confirm against `eval`'s signature.
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    // NOTE(review): the x86_64-linux and arm64-linux assets are each
                    // listed twice in the prompt below — presumably unintentional;
                    // confirm whether the eval depends on the duplication before
                    // cleaning it up (changing the prompt changes the eval).
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                // Scripted reads: the function is consumed in three line-range chunks.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                // Final assistant turn: the edit_file tool use under evaluation.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}
362
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Checks the agent can comment out scattered call sites after only seeing
    // grep context (several disjoint line ranges), not the whole file.
    //
    // Model                          | Pass rate
    // ===============================|==========
    // claude-3.7-sonnet              | 0.59 (2025-07-14)
    // claude-sonnet-4                | 0.81 (2025-07-14)
    // gemini-2.5-pro                 | 0.95 (2025-07-14)
    // gemini-2.5-flash-preview-04-17 | 0.78 (2025-07-14)
    // gpt-4.1                        | 0.00 (2025-07-14) (follows edit_description too literally)
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    // Several output shapes are acceptable; a sample passes if it matches any of them.
    let possible_diffs = vec![
        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
    ];
    // NOTE(review): numeric args appear to be (sample count, min pass rate,
    // tolerance) — confirm against `eval`'s signature.
    eval(
        100,
        0.51,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                // Scripted grep for "blink"; the result stitches together several
                // disjoint slices of the fixture to mimic real grep output.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                // Final assistant turn: the edit_file tool use under evaluation.
                // NOTE(review): the id jumps from tool_1 to tool_4 — presumably
                // copied from a longer transcript; confirm it's intentional.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
447
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Checks the agent can add a constructor plus tests after a realistic
    // exploration sequence (read file, then several greps for a test module).
    //
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    // Model                        | Date       | Pass rate
    // =============================|============|==========
    // claude-4.0-sonnet            | 2025-06-14 | 0.99
    // claude-3.7-sonnet            | 2025-06-14 | 0.88
    // gemini-2.5-pro-preview-06-05 | 2025-06-16 | 0.98
    // gpt-4.1                      |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing for now, let's see if we can bring this down.
        0.25,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                // Scripted assistant turn: read the whole file.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // Two greps that find no test module in the target file…
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                // …then a grep for #[test] that surfaces test style from sibling files.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                // Final assistant turn: the edit_file tool use under evaluation.
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}
656
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    // Checks the agent creates a brand-new file (EditFileMode::Create) whose
    // content starts cleanly — no leading whitespace, backtick fence, or newline.
    //
    // Model                          | Pass rate
    // ===============================|==========
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 | 1.0 (2025-05-22)
    // gpt-4.1                        | 1.0 (2025-05-22)
    let input_file_path = "root/zode.py";
    // New file: there is no pre-existing content.
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        50,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                // Scripted assistant turn: read both reference scripts in one batch.
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                // Final assistant turn: create the new file.
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            // Custom assertion: fail (score 0) if the created file begins with a
            // space, backtick, or newline — common symptoms of the model leaking
            // markdown or its own prose into the file.
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}
762
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    // Checks the agent can append a new test to a large file after exploring it
    // via an outline and a few targeted line-range reads.
    //
    // Model                          | Pass rate
    // ===============================|==========
    // claude-3.7-sonnet              | 0.65 (2025-06-14)
    // claude-sonnet-4                | 0.07 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 0.35 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                // Scripted assistant turn: read the file; the tool responds with an
                // outline (symbols + line ranges) instead of the raw content.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                            struct HunkStatus [L1538-1542]
                             range [L1539]
                             diff_status [L1540]
                             old_text [L1541]
                            fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                // Targeted reads of the relevant tests and of `buffer_created` itself.
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                // Final assistant turn: the edit_file tool use under evaluation.
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
993
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Model                          | Pass rate
    // ===============================|==========
    // claude-3.7-sonnet              | 1.00 (2025-06-14)
    // claude-sonnet-4                | 1.00 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.00 (2025-05-21)
    // gemini-2.5-flash-preview-04-17 | 1.00 (2025-05-21)
    // gpt-4.1                        | 1.00 (2025-05-21)
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    // New file: no pre-existing content, and the expected result is empty.
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        100,
        0.99,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                // Scripted assistant turn: inspect the directory first.
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                // Final assistant turn: create the (empty) TODO3 file.
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
1072
1073fn message(
1074 role: Role,
1075 contents: impl IntoIterator<Item = MessageContent>,
1076) -> LanguageModelRequestMessage {
1077 LanguageModelRequestMessage {
1078 role,
1079 content: contents.into_iter().collect(),
1080 cache: false,
1081 }
1082}
1083
1084fn text(text: impl Into<String>) -> MessageContent {
1085 MessageContent::Text(text.into())
1086}
1087
/// Returns the zero-indexed lines of `input` in `range` (half-open), joined
/// with `\n`. Out-of-bounds portions of the range are silently clipped.
fn lines(input: &str, range: Range<usize>) -> String {
    let mut selected = String::new();
    for (index, line) in input.lines().enumerate() {
        // Stop scanning once we're past the requested window.
        if index >= range.end {
            break;
        }
        if index >= range.start {
            if !selected.is_empty() {
                selected.push('\n');
            }
            selected.push_str(line);
        }
    }
    selected
}
1096
1097fn tool_use(
1098 id: impl Into<Arc<str>>,
1099 name: impl Into<Arc<str>>,
1100 input: impl Serialize,
1101) -> MessageContent {
1102 MessageContent::ToolUse(LanguageModelToolUse {
1103 id: LanguageModelToolUseId::from(id.into()),
1104 name: name.into(),
1105 raw_input: serde_json::to_string_pretty(&input).unwrap(),
1106 input: serde_json::to_value(input).unwrap(),
1107 is_input_complete: true,
1108 })
1109}
1110
1111fn tool_result(
1112 id: impl Into<Arc<str>>,
1113 name: impl Into<Arc<str>>,
1114 result: impl Into<Arc<str>>,
1115) -> MessageContent {
1116 MessageContent::ToolResult(LanguageModelToolResult {
1117 tool_use_id: LanguageModelToolUseId::from(id.into()),
1118 tool_name: name.into(),
1119 is_error: false,
1120 content: LanguageModelToolResultContent::Text(result.into()),
1121 output: None,
1122 })
1123}
1124
/// One eval scenario: a recorded conversation ending in an `edit_file` tool
/// use, plus the assertion used to grade each sampled edit.
#[derive(Clone)]
struct EvalInput {
    // Full request transcript replayed to the model under evaluation.
    conversation: Vec<LanguageModelRequestMessage>,
    // Parsed input of the final `edit_file` tool use in the conversation.
    edit_file_input: EditFileToolInput,
    // Initial content of the edited file; `None` when creating a new file.
    input_content: Option<String>,
    // Grading logic applied to each sample.
    assertion: EvalAssertion,
}
1132
1133impl EvalInput {
1134 fn from_conversation(
1135 conversation: Vec<LanguageModelRequestMessage>,
1136 input_content: Option<String>,
1137 assertion: EvalAssertion,
1138 ) -> Self {
1139 let msg = conversation.last().expect("Conversation must not be empty");
1140 if msg.role != Role::Assistant {
1141 panic!("Conversation must end with an assistant message");
1142 }
1143 let tool_use = msg
1144 .content
1145 .iter()
1146 .flat_map(|content| match content {
1147 MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1148 Some(tool_use)
1149 }
1150 _ => None,
1151 })
1152 .next()
1153 .expect("Conversation must end with an edit_file tool use")
1154 .clone();
1155
1156 let edit_file_input: EditFileToolInput = serde_json::from_value(tool_use.input).unwrap();
1157
1158 EvalInput {
1159 conversation,
1160 edit_file_input,
1161 input_content,
1162 assertion,
1163 }
1164 }
1165}
1166
/// The outcome of a single sampled run of the edit agent.
#[derive(Clone)]
struct EvalSample {
    // File content before the agent's edit.
    text_before: String,
    // File content after the agent's edit.
    text_after: String,
    // Raw output produced by the edit agent for this sample.
    edit_output: EditAgentOutput,
    // Textual diff of the edit (presumably text_before → text_after); fed to
    // LLM-judge assertions.
    diff: String,
}
1174
/// Object-safe wrapper around an async assertion function, so that
/// `EvalAssertion` can store heterogeneous closures behind one
/// `Arc<dyn AssertionFn>`.
trait AssertionFn: 'static + Send + Sync {
    // Grades `sample` (optionally consulting `judge_model`) and resolves to a
    // score/message outcome.
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1183
// Blanket implementation: any async closure with the matching signature is an
// `AssertionFn`.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        // Call the closure and box the future so it can be returned through
        // the object-safe trait method.
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1204
/// Cloneable handle to the grading function for an eval.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1207
1208impl EvalAssertion {
    /// Wraps an async assertion closure in an `EvalAssertion`.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }
1222
    /// Assertion that the edited file equals `expected`, with both sides
    /// normalized through `strip_empty_lines` first. Scores 100 on match,
    /// 0 otherwise.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }
1236
1237 fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
1238 let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
1239 Self::new(async move |sample, _judge, _cx| {
1240 let matches = expected_diffs.iter().any(|possible_diff| {
1241 let expected =
1242 language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
1243 strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
1244 });
1245
1246 Ok(EvalAssertionOutcome {
1247 score: if matches { 100 } else { 0 },
1248 message: None,
1249 })
1250 })
1251 }
1252
1253 fn judge_diff(assertions: &'static str) -> Self {
1254 Self::new(async move |sample, judge, cx| {
1255 let prompt = DiffJudgeTemplate {
1256 diff: sample.diff.clone(),
1257 assertions,
1258 }
1259 .render(&Templates::new())
1260 .unwrap();
1261
1262 let request = LanguageModelRequest {
1263 messages: vec![LanguageModelRequestMessage {
1264 role: Role::User,
1265 content: vec![prompt.into()],
1266 cache: false,
1267 }],
1268 thinking_allowed: true,
1269 ..Default::default()
1270 };
1271 let mut response = retry_on_rate_limit(async || {
1272 Ok(judge
1273 .stream_completion_text(request.clone(), &cx.to_async())
1274 .await?)
1275 })
1276 .await?;
1277 let mut output = String::new();
1278 while let Some(chunk) = response.stream.next().await {
1279 let chunk = chunk?;
1280 output.push_str(&chunk);
1281 }
1282
1283 // Parse the score from the response
1284 let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
1285 if let Some(captures) = re.captures(&output)
1286 && let Some(score_match) = captures.get(1)
1287 {
1288 let score = score_match.as_str().parse().unwrap_or(0);
1289 return Ok(EvalAssertionOutcome {
1290 score,
1291 message: Some(output),
1292 });
1293 }
1294
1295 anyhow::bail!("No score found in response. Raw output: {output}");
1296 })
1297 }
1298
1299 async fn run(
1300 &self,
1301 input: &EvalSample,
1302 judge_model: Arc<dyn LanguageModel>,
1303 cx: &mut TestAppContext,
1304 ) -> Result<EvalAssertionOutcome> {
1305 self.0.assert(input, judge_model, cx).await
1306 }
1307}
1308
/// Runs `eval` `iterations` times (concurrently, capped at 32 in flight) and
/// panics if the pass ratio falls below `expected_pass_ratio`, or if the edit
/// parser's mismatched-tag ratio exceeds `mismatched_tag_threshold`.
/// Failures (score < 80) and errors are summarized before panicking.
fn eval(
    iterations: usize,
    expected_pass_ratio: f32,
    mismatched_tag_threshold: f32,
    mut eval: EvalInput,
) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval so that
    // all the next ones are cached.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    // Fan out the remaining iterations on the background executor; the
    // semaphore caps concurrent runs at 32.
    let executor = gpui::background_executor();
    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        let semaphore = semaphore.clone();
        executor
            .spawn(async move {
                let _guard = semaphore.acquire().await;
                run_eval(eval, tx)
            })
            .detach();
    }
    // Drop the original sender so `rx.recv()` disconnects once every spawned
    // worker has finished and dropped its clone.
    drop(tx);

    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                // Scores below 80 count as failures, grouped by resulting text.
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text_after.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                // Errored runs are grouped by their debug representation.
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        // Print most frequent errors first.
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        // Print most common failure outputs first, one representative each.
        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    // NOTE(review): if no tags were parsed at all this ratio is NaN, and the
    // `>` comparison below is then false, so the check silently passes.
    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > mismatched_tag_threshold {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}
1400
/// Executes a single eval on a fresh, OS-seeded test dispatcher and sends
/// the outcome over `tx`. Blocks the calling thread until the eval finishes;
/// panics if the receiver side has already been dropped.
fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
    let dispatcher = gpui::TestDispatcher::new(StdRng::from_os_rng());
    let mut cx = TestAppContext::build(dispatcher, None);
    let output = cx.executor().block_test(async {
        let test = EditAgentTest::new(&mut cx).await;
        test.eval(eval, &mut cx).await
    });
    tx.send(output).unwrap();
}
1410
/// Result of a single eval run: the sample produced plus its graded outcome.
#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}
1416
1417impl Display for EvalOutput {
1418 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1419 writeln!(f, "Score: {:?}", self.assertion.score)?;
1420 if let Some(message) = self.assertion.message.as_ref() {
1421 writeln!(f, "Message: {}", message)?;
1422 }
1423
1424 writeln!(f, "Diff:\n{}", self.sample.diff)?;
1425
1426 writeln!(
1427 f,
1428 "Parser Metrics:\n{:#?}",
1429 self.sample.edit_output.parser_metrics
1430 )?;
1431 writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1432 Ok(())
1433 }
1434}
1435
/// Overwrites the current terminal line with a progress summary of the form
/// `Evaluated X/Y (Z.ZZ% passed)`, where the percentage is taken over the
/// runs evaluated so far (0% before any run completes).
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed = evaluated_count - failed_count;
    let percent = match evaluated_count {
        0 => 0.0,
        total => passed as f64 / total as f64 * 100.0,
    };
    // `\r\x1b[K` returns to column 0 and clears the line before rewriting it.
    print!("\r\x1b[KEvaluated {evaluated_count}/{iterations} ({percent:.2}% passed)");
    std::io::stdout().flush().unwrap();
}
1451
/// Harness owning the agent under test, the project it edits, and the model
/// used to judge the results.
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}
1457
1458impl EditAgentTest {
1459 async fn new(cx: &mut TestAppContext) -> Self {
1460 cx.executor().allow_parking();
1461
1462 let fs = FakeFs::new(cx.executor());
1463 cx.update(|cx| {
1464 settings::init(cx);
1465 gpui_tokio::init(cx);
1466 let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
1467 cx.set_http_client(http_client);
1468
1469 client::init_settings(cx);
1470 let client = Client::production(cx);
1471 let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
1472
1473 settings::init(cx);
1474 Project::init_settings(cx);
1475 language::init(cx);
1476 language_model::init(client.clone(), cx);
1477 language_models::init(user_store, client.clone(), cx);
1478 crate::init(client.http_client(), cx);
1479 });
1480
1481 fs.insert_tree("/root", json!({})).await;
1482 let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
1483 let agent_model = SelectedModel::from_str(
1484 &std::env::var("ZED_AGENT_MODEL")
1485 .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1486 )
1487 .unwrap();
1488 let judge_model = SelectedModel::from_str(
1489 &std::env::var("ZED_JUDGE_MODEL")
1490 .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
1491 )
1492 .unwrap();
1493 let (agent_model, judge_model) = cx
1494 .update(|cx| {
1495 cx.spawn(async move |cx| {
1496 let agent_model = Self::load_model(&agent_model, cx).await;
1497 let judge_model = Self::load_model(&judge_model, cx).await;
1498 (agent_model.unwrap(), judge_model.unwrap())
1499 })
1500 })
1501 .await;
1502 let action_log = cx.new(|_| ActionLog::new(project.clone()));
1503
1504 let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();
1505
1506 Self {
1507 agent: EditAgent::new(
1508 agent_model,
1509 project.clone(),
1510 action_log,
1511 Templates::new(),
1512 edit_format,
1513 ),
1514 project,
1515 judge_model,
1516 }
1517 }
1518
1519 async fn load_model(
1520 selected_model: &SelectedModel,
1521 cx: &mut AsyncApp,
1522 ) -> Result<Arc<dyn LanguageModel>> {
1523 cx.update(|cx| {
1524 let registry = LanguageModelRegistry::read_global(cx);
1525 let provider = registry
1526 .provider(&selected_model.provider)
1527 .expect("Provider not found");
1528 provider.authenticate(cx)
1529 })?
1530 .await?;
1531 cx.update(|cx| {
1532 let models = LanguageModelRegistry::read_global(cx);
1533 let model = models
1534 .available_models(cx)
1535 .find(|model| {
1536 model.provider_id() == selected_model.provider
1537 && model.id() == selected_model.model
1538 })
1539 .expect("Model not found");
1540 model
1541 })
1542 }
1543
1544 async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
1545 let path = self
1546 .project
1547 .read_with(cx, |project, cx| {
1548 project.find_project_path(eval.edit_file_input.path, cx)
1549 })
1550 .unwrap();
1551 let buffer = self
1552 .project
1553 .update(cx, |project, cx| project.open_buffer(path, cx))
1554 .await
1555 .unwrap();
1556 let tools = cx.update(|cx| {
1557 ToolRegistry::default_global(cx)
1558 .tools()
1559 .into_iter()
1560 .filter_map(|tool| {
1561 let input_schema = tool
1562 .input_schema(self.agent.model.tool_input_format())
1563 .ok()?;
1564 Some(LanguageModelRequestTool {
1565 name: tool.name(),
1566 description: tool.description(),
1567 input_schema,
1568 })
1569 })
1570 .collect::<Vec<_>>()
1571 });
1572 let tool_names = tools
1573 .iter()
1574 .map(|tool| tool.name.clone())
1575 .collect::<Vec<_>>();
1576 let worktrees = vec![WorktreeContext {
1577 root_name: "root".to_string(),
1578 abs_path: Path::new("/path/to/root").into(),
1579 rules_file: None,
1580 }];
1581 let prompt_builder = PromptBuilder::new(None)?;
1582 let project_context = ProjectContext::new(worktrees, Vec::default());
1583 let system_prompt = prompt_builder.generate_assistant_system_prompt(
1584 &project_context,
1585 &ModelContext {
1586 available_tools: tool_names,
1587 },
1588 )?;
1589
1590 let has_system_prompt = eval
1591 .conversation
1592 .first()
1593 .is_some_and(|msg| msg.role == Role::System);
1594 let messages = if has_system_prompt {
1595 eval.conversation
1596 } else {
1597 [LanguageModelRequestMessage {
1598 role: Role::System,
1599 content: vec![MessageContent::Text(system_prompt)],
1600 cache: true,
1601 }]
1602 .into_iter()
1603 .chain(eval.conversation)
1604 .collect::<Vec<_>>()
1605 };
1606
1607 let conversation = LanguageModelRequest {
1608 messages,
1609 tools,
1610 thinking_allowed: true,
1611 ..Default::default()
1612 };
1613
1614 let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
1615 if let Some(input_content) = eval.input_content.as_deref() {
1616 buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
1617 }
1618 retry_on_rate_limit(async || {
1619 self.agent
1620 .edit(
1621 buffer.clone(),
1622 eval.edit_file_input.display_description.clone(),
1623 &conversation,
1624 &mut cx.to_async(),
1625 )
1626 .0
1627 .await
1628 })
1629 .await?
1630 } else {
1631 retry_on_rate_limit(async || {
1632 self.agent
1633 .overwrite(
1634 buffer.clone(),
1635 eval.edit_file_input.display_description.clone(),
1636 &conversation,
1637 &mut cx.to_async(),
1638 )
1639 .0
1640 .await
1641 })
1642 .await?
1643 };
1644
1645 let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
1646 let sample = EvalSample {
1647 edit_output,
1648 diff: language::unified_diff(
1649 eval.input_content.as_deref().unwrap_or_default(),
1650 &buffer_text,
1651 ),
1652 text_before: eval.input_content.unwrap_or_default(),
1653 text_after: buffer_text,
1654 };
1655 let assertion = eval
1656 .assertion
1657 .run(&sample, self.judge_model.clone(), cx)
1658 .await?;
1659
1660 Ok(EvalOutput { assertion, sample })
1661 }
1662}
1663
/// Runs `request`, retrying up to 20 times on rate limits, server overload,
/// retryable upstream HTTP statuses (429/503/529), and transient I/O or
/// internal-server errors. Successes and non-retryable errors return
/// immediately; the final attempt's result is returned as-is.
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
    const MAX_RETRIES: usize = 20;
    let mut attempt = 0;

    loop {
        attempt += 1;
        let response = request().await;

        // Out of retries: return whatever we got, success or error.
        if attempt >= MAX_RETRIES {
            return response;
        }

        // `Some(delay)` means "retry after `delay`"; `None` means stop.
        let retry_delay = match &response {
            Ok(_) => None,
            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
                Some(err) => match &err {
                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
                        // Honor the server-provided delay, defaulting to 5s.
                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
                    }
                    LanguageModelCompletionError::UpstreamProviderError {
                        status,
                        retry_after,
                        ..
                    } => {
                        // Only retry for specific status codes
                        let should_retry = matches!(
                            *status,
                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
                        ) || status.as_u16() == 529;

                        if should_retry {
                            // Use server-provided retry_after if available, otherwise use default
                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
                        } else {
                            None
                        }
                    }
                    LanguageModelCompletionError::ApiReadResponseError { .. }
                    | LanguageModelCompletionError::ApiInternalServerError { .. }
                    | LanguageModelCompletionError::HttpSend { .. } => {
                        // Exponential backoff for transient I/O and internal server errors
                        // (1s, 2s, 4s, ... capped at 30s).
                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
                    }
                    _ => None,
                },
                _ => None,
            },
        };

        if let Some(retry_after) = retry_delay {
            // Add up to 100% random jitter so parallel evals don't retry in
            // lockstep against the same provider.
            let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
            Timer::after(retry_after + jitter).await;
        } else {
            return response;
        }
    }
}
1723
/// Outcome of one assertion: a 0-100 score plus an optional judge transcript.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    // 0-100; scores below 80 are treated as failures by `eval`.
    score: usize,
    message: Option<String>,
}
1729
/// Template inputs for the diff-judging prompt rendered from `diff_judge.hbs`.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    // Unified diff the judge model is asked to grade.
    diff: String,
    // Natural-language assertions the diff is graded against.
    assertions: &'static str,
}

impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1739
/// Returns `text` with every blank (empty or whitespace-only) line removed,
/// joining the surviving lines with `\n`. Non-blank lines are kept verbatim,
/// so comparisons using this are insensitive only to blank-line differences.
fn strip_empty_lines(text: &str) -> String {
    let mut kept = Vec::new();
    for line in text.lines() {
        if !line.trim().is_empty() {
            kept.push(line);
        }
    }
    kept.join("\n")
}