1use super::*;
2use crate::{
3 ReadFileToolInput,
4 edit_file_tool::{EditFileMode, EditFileToolInput},
5 grep_tool::GrepToolInput,
6 list_directory_tool::ListDirectoryToolInput,
7};
8use Role::*;
9use assistant_tool::ToolRegistry;
10use client::{Client, UserStore};
11use collections::HashMap;
12use fs::FakeFs;
13use futures::{FutureExt, future::LocalBoxFuture};
14use gpui::{AppContext, TestAppContext, Timer};
15use http_client::StatusCode;
16use indoc::{formatdoc, indoc};
17use language_model::{
18 LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
19 LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
20};
21use project::Project;
22use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
23use rand::prelude::*;
24use reqwest_client::ReqwestClient;
25use serde_json::json;
26use std::{
27 cmp::Reverse,
28 fmt::{self, Display},
29 io::Write as _,
30 path::Path,
31 str::FromStr,
32 sync::mpsc,
33 time::Duration,
34};
35use util::path;
36
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // The conversation below is scripted up to (and including) the final
    // `edit_file` tool call; the harness replays it and grades the resulting
    // edit. Several placements of the extracted method are acceptable, so the
    // assertion passes if the edit matches *any* of the recorded diffs.
    //
    // Model                | Pass rate
    // ---------------------|----------
    // claude-3.7-sonnet    | 0.99 (2025-06-14)
    // claude-sonnet-4      | 0.97 (2025-06-14)
    // gemini-2.5-pro-06-05 | 0.98 (2025-06-16)
    // gemini-2.5-flash     | 0.11 (2025-05-22)
    // gpt-4.1              | 1.00 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // Any one of these diffs, applied to `before.rs`, counts as a pass.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    // NOTE(review): arguments appear to be (samples, minimum pass rate,
    // mismatched-tag tolerance) — confirm against `eval`'s definition.
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
113
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Checks that the agent can delete exactly one function without touching
    // its call sites. The expected result is pinned to a single `after.rs`
    // fixture (compared ignoring empty lines).
    //
    // Model                | Pass rate
    // ---------------------|----------
    // claude-3.7-sonnet    | 1.0 (2025-06-14)
    // claude-sonnet-4      | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05 | 1.0 (2025-06-16)
    // gemini-2.5-flash     |
    // gpt-4.1              |
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    // NOTE(review): arguments appear to be (samples, minimum pass rate,
    // mismatched-tag tolerance) — confirm against `eval`'s definition.
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}
175
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    // A whole-file, many-hunk edit: every doc comment must be translated.
    // There is no single correct output, so an LLM judge grades the diff
    // instead of comparing against a fixture.
    //
    // Model                          | Pass rate
    // ===============================|==========
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    // NOTE(review): arguments appear to be (samples, minimum pass rate,
    // mismatched-tag tolerance) — confirm against `eval`'s definition.
    eval(
        200,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}
237
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Simulates an agent that read the relevant region of a large file in
    // three 50-80 line windows before editing, rather than reading the whole
    // file. Graded by an LLM judge on two criteria (see the assertion below).
    //
    // Model                          | Pass rate
    // ===============================|==========
    // claude-3.7-sonnet              | 0.96 (2025-06-14)
    // claude-sonnet-4                | 0.11 (2025-06-14)
    // gemini-2.5-pro-preview-latest  | 0.99 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    // NOTE(review): arguments appear to be (samples, minimum pass rate,
    // mismatched-tag tolerance) — confirm against `eval`'s definition.
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                // NOTE(review): the asset list below repeats the two linux
                // tarballs — presumably intentional fixture noise, but confirm.
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}
362
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Simulates a grep-driven workflow: the model only ever saw scattered
    // windows of a very large file (joined as fake grep matches below), then
    // was asked to comment out the BlinkManager interactions.
    //
    // Model                          | Pass rate
    // ===============================|==========
    // claude-3.7-sonnet              | 0.59 (2025-07-14)
    // claude-sonnet-4                | 0.81 (2025-07-14)
    // gemini-2.5-pro                 | 0.95 (2025-07-14)
    // gemini-2.5-flash-preview-04-17 | 0.78 (2025-07-14)
    // gpt-4.1                        | 0.00 (2025-07-14) (follows edit_description too literally)
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    // Any one of these diffs counts as a pass.
    let possible_diffs = vec![
        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
    ];
    // NOTE(review): arguments appear to be (samples, minimum pass rate,
    // mismatched-tag tolerance) — confirm against `eval`'s definition.
    eval(
        100,
        0.51,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                // Stitch disjoint slices of the fixture together so the
                // transcript looks like real (partial) grep output.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
447
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    // Model                        | Date       | Pass rate
    // =============================|============|==========
    // claude-4.0-sonnet            | 2025-06-14 | 0.99
    // claude-3.7-sonnet            | 2025-06-14 | 0.88
    // gemini-2.5-pro-preview-06-05 | 2025-06-16 | 0.98
    // gpt-4.1                      |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    // NOTE(review): arguments appear to be (samples, minimum pass rate,
    // mismatched-tag tolerance) — confirm against `eval`'s definition.
    eval(
        100,
        0.95,
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing for now, let's see if we can bring this down.
        0.25,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                // The agent searches for an existing test module (none exists
                // in the target file), then for test examples elsewhere.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                // Canned grep output showing test-module conventions in
                // sibling files of the fixture project.
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}
656
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    // File-creation eval: the agent must author a brand-new script
    // (`input_content` is `None`). The custom assertion only checks that the
    // created file does not begin with stray whitespace, a code fence, or a
    // blank line — i.e. the agent didn't leak chatter into the file.
    //
    // Model                          | Pass rate
    // ===============================|==========
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 | 1.0 (2025-05-22)
    // gpt-4.1                        | 1.0 (2025-05-22)
    let input_file_path = "root/zode.py";
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    // NOTE(review): arguments appear to be (samples, minimum pass rate,
    // mismatched-tag tolerance) — confirm against `eval`'s definition.
    eval(
        50,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                // A single assistant message may carry several tool uses;
                // the matching results arrive together in the next message.
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                // Characters that indicate the agent wrote chatter or a code
                // fence before the actual file content.
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                // (`pop` is a no-op when `message` is still empty.)
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}
762
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    // Simulates an outline-driven workflow: the first `read_file` result is a
    // symbol outline (not raw text), and the agent then reads three targeted
    // line ranges before editing. Graded by an LLM judge.
    //
    // Model                          | Pass rate
    // ===============================|==========
    // claude-3.7-sonnet              | 0.65 (2025-06-14)
    // claude-sonnet-4                | 0.07 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 0.35 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    // NOTE(review): arguments appear to be (samples, minimum pass rate,
    // mismatched-tag tolerance) — confirm against `eval`'s definition.
    eval(
        200,
        0.5, // TODO: make this eval better
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                // Canned outline of the fixture file, as the read_file tool
                // returns it for large files.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                            struct HunkStatus [L1538-1542]
                             range [L1539]
                             diff_status [L1540]
                             old_text [L1541]
                            fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
993
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Model                          | Pass rate
    // ===============================|==========
    // claude-3.7-sonnet              | 1.00 (2025-06-14)
    // claude-sonnet-4                | 1.00 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.00 (2025-05-21)
    // gemini-2.5-flash-preview-04-17 | 1.00 (2025-05-21)
    // gpt-4.1                        | 1.00 (2025-05-21)
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    let input_file_content = None;
    // The file is being created, so a perfect result is the empty string.
    let expected_output_content = String::new();
    // NOTE(review): arguments appear to be (samples, minimum pass rate,
    // mismatched-tag tolerance) — confirm against `eval`'s definition.
    eval(
        100,
        0.99,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
1072
1073fn message(
1074 role: Role,
1075 contents: impl IntoIterator<Item = MessageContent>,
1076) -> LanguageModelRequestMessage {
1077 LanguageModelRequestMessage {
1078 role,
1079 content: contents.into_iter().collect(),
1080 cache: false,
1081 }
1082}
1083
1084fn text(text: impl Into<String>) -> MessageContent {
1085 MessageContent::Text(text.into())
1086}
1087
/// Returns the half-open `range` of 0-indexed lines from `input`,
/// joined with `\n` (no trailing newline).
fn lines(input: &str, range: Range<usize>) -> String {
    let mut selected = Vec::new();
    for (ix, line) in input.lines().enumerate() {
        if ix >= range.end {
            break;
        }
        if ix >= range.start {
            selected.push(line);
        }
    }
    selected.join("\n")
}
1096
1097fn tool_use(
1098 id: impl Into<Arc<str>>,
1099 name: impl Into<Arc<str>>,
1100 input: impl Serialize,
1101) -> MessageContent {
1102 MessageContent::ToolUse(LanguageModelToolUse {
1103 id: LanguageModelToolUseId::from(id.into()),
1104 name: name.into(),
1105 raw_input: serde_json::to_string_pretty(&input).unwrap(),
1106 input: serde_json::to_value(input).unwrap(),
1107 is_input_complete: true,
1108 })
1109}
1110
1111fn tool_result(
1112 id: impl Into<Arc<str>>,
1113 name: impl Into<Arc<str>>,
1114 result: impl Into<Arc<str>>,
1115) -> MessageContent {
1116 MessageContent::ToolResult(LanguageModelToolResult {
1117 tool_use_id: LanguageModelToolUseId::from(id.into()),
1118 tool_name: name.into(),
1119 is_error: false,
1120 content: LanguageModelToolResultContent::Text(result.into()),
1121 output: None,
1122 })
1123}
1124
/// A single eval scenario: a scripted conversation that ends in an
/// `edit_file` tool use, plus the assertion used to grade each sample.
#[derive(Clone)]
struct EvalInput {
    /// Full request history replayed to the model, ending with the
    /// assistant's `edit_file` tool use.
    conversation: Vec<LanguageModelRequestMessage>,
    /// Parsed input of that final `edit_file` tool use.
    edit_file_input: EditFileToolInput,
    /// Pre-edit file contents; `None` when the scenario creates a new file.
    input_content: Option<String>,
    /// Grading function applied to each sampled edit.
    assertion: EvalAssertion,
}
1132
1133impl EvalInput {
1134 fn from_conversation(
1135 conversation: Vec<LanguageModelRequestMessage>,
1136 input_content: Option<String>,
1137 assertion: EvalAssertion,
1138 ) -> Self {
1139 let msg = conversation.last().expect("Conversation must not be empty");
1140 if msg.role != Role::Assistant {
1141 panic!("Conversation must end with an assistant message");
1142 }
1143 let tool_use = msg
1144 .content
1145 .iter()
1146 .flat_map(|content| match content {
1147 MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1148 Some(tool_use)
1149 }
1150 _ => None,
1151 })
1152 .next()
1153 .expect("Conversation must end with an edit_file tool use")
1154 .clone();
1155
1156 let edit_file_input: EditFileToolInput =
1157 serde_json::from_value(tool_use.input.clone()).unwrap();
1158
1159 EvalInput {
1160 conversation,
1161 edit_file_input,
1162 input_content,
1163 assertion,
1164 }
1165 }
1166}
1167
/// The result of running one sampled edit during an eval.
#[derive(Clone)]
struct EvalSample {
    /// File text before the agent's edit (used as the base when applying
    /// expected diffs in `assert_diff_any`).
    text_before: String,
    /// File text after the agent's edit — what assertions grade.
    text_after: String,
    // Raw output from the edit agent for this sample; see `EditAgentOutput`
    // (defined elsewhere) for its contents.
    edit_output: EditAgentOutput,
    /// Unified diff between `text_before` and `text_after`, shown to the
    /// LLM judge in `judge_diff`.
    diff: String,
}
1175
/// Object-safe wrapper for async assertion functions, so that
/// `EvalAssertion` can store them behind `Arc<dyn AssertionFn>`.
trait AssertionFn: 'static + Send + Sync {
    /// Grades `sample`, optionally consulting `judge_model` (an LLM judge).
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1184
// Blanket impl: any async closure/function with the matching signature is
// usable as an assertion without a dedicated wrapper type.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        // Box the future locally so the trait stays object-safe.
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1205
/// A cloneable, type-erased assertion that scores an [`EvalSample`]
/// (0-100 via `EvalAssertionOutcome`).
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1208
1209impl EvalAssertion {
    /// Wraps an async assertion function into an [`EvalAssertion`].
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }
1223
1224 fn assert_eq(expected: impl Into<String>) -> Self {
1225 let expected = expected.into();
1226 Self::new(async move |sample, _judge, _cx| {
1227 Ok(EvalAssertionOutcome {
1228 score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
1229 100
1230 } else {
1231 0
1232 },
1233 message: None,
1234 })
1235 })
1236 }
1237
1238 fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
1239 let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
1240 Self::new(async move |sample, _judge, _cx| {
1241 let matches = expected_diffs.iter().any(|possible_diff| {
1242 let expected =
1243 language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
1244 strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
1245 });
1246
1247 Ok(EvalAssertionOutcome {
1248 score: if matches { 100 } else { 0 },
1249 message: None,
1250 })
1251 })
1252 }
1253
1254 fn judge_diff(assertions: &'static str) -> Self {
1255 Self::new(async move |sample, judge, cx| {
1256 let prompt = DiffJudgeTemplate {
1257 diff: sample.diff.clone(),
1258 assertions,
1259 }
1260 .render(&Templates::new())
1261 .unwrap();
1262
1263 let request = LanguageModelRequest {
1264 messages: vec![LanguageModelRequestMessage {
1265 role: Role::User,
1266 content: vec![prompt.into()],
1267 cache: false,
1268 }],
1269 thinking_allowed: true,
1270 ..Default::default()
1271 };
1272 let mut response = retry_on_rate_limit(async || {
1273 Ok(judge
1274 .stream_completion_text(request.clone(), &cx.to_async())
1275 .await?)
1276 })
1277 .await?;
1278 let mut output = String::new();
1279 while let Some(chunk) = response.stream.next().await {
1280 let chunk = chunk?;
1281 output.push_str(&chunk);
1282 }
1283
1284 // Parse the score from the response
1285 let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
1286 if let Some(captures) = re.captures(&output)
1287 && let Some(score_match) = captures.get(1)
1288 {
1289 let score = score_match.as_str().parse().unwrap_or(0);
1290 return Ok(EvalAssertionOutcome {
1291 score,
1292 message: Some(output),
1293 });
1294 }
1295
1296 anyhow::bail!("No score found in response. Raw output: {output}");
1297 })
1298 }
1299
1300 async fn run(
1301 &self,
1302 input: &EvalSample,
1303 judge_model: Arc<dyn LanguageModel>,
1304 cx: &mut TestAppContext,
1305 ) -> Result<EvalAssertionOutcome> {
1306 self.0.assert(input, judge_model, cx).await
1307 }
1308}
1309
/// Runs `eval` `iterations` times (concurrently, capped by a semaphore) and
/// panics if the observed pass ratio falls below `expected_pass_ratio`, or if
/// the edit parser's mismatched-tag ratio exceeds `mismatched_tag_threshold`.
/// An eval run counts as failed when it errors or scores below 80.
fn eval(
    iterations: usize,
    expected_pass_ratio: f32,
    mismatched_tag_threshold: f32,
    mut eval: EvalInput,
) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval so that
    // all the next ones are cached.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    let executor = gpui::background_executor();
    // Limit concurrent eval runs to 32 to avoid overwhelming the provider.
    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
    // Starts at 1: the cache-priming run above already counted as one iteration.
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        let semaphore = semaphore.clone();
        executor
            .spawn(async move {
                let _guard = semaphore.acquire().await;
                run_eval(eval, tx)
            })
            .detach();
    }
    // Drop our sender so `rx` disconnects once all spawned runs finish.
    drop(tx);

    // Failures are grouped by resulting buffer text; errors by debug string.
    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                // Scores below 80 are treated as failures.
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text_after.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        // Print errors most-frequent first...
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        // ...then one representative output per failure group, largest first.
        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    // NOTE: if no tags were parsed this is NaN, and `NaN > threshold` is
    // false, so the check below is effectively skipped in that case.
    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > mismatched_tag_threshold {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}
1401
1402fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
1403 let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
1404 let mut cx = TestAppContext::build(dispatcher, None);
1405 let output = cx.executor().block_test(async {
1406 let test = EditAgentTest::new(&mut cx).await;
1407 test.eval(eval, &mut cx).await
1408 });
1409 tx.send(output).unwrap();
1410}
1411
/// The result of one eval run: the sample that was produced plus the
/// assertion's score/message for it.
#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}
1417
1418impl Display for EvalOutput {
1419 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1420 writeln!(f, "Score: {:?}", self.assertion.score)?;
1421 if let Some(message) = self.assertion.message.as_ref() {
1422 writeln!(f, "Message: {}", message)?;
1423 }
1424
1425 writeln!(f, "Diff:\n{}", self.sample.diff)?;
1426
1427 writeln!(
1428 f,
1429 "Parser Metrics:\n{:#?}",
1430 self.sample.edit_output.parser_metrics
1431 )?;
1432 writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1433 Ok(())
1434 }
1435}
1436
/// Rewrites the current terminal line with progress so far. The pass ratio is
/// reported as 0% before anything has been evaluated (avoiding a 0/0 divide).
/// Note: debug builds will panic on underflow if `failed_count` ever exceeds
/// `evaluated_count`; callers keep the two in sync.
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_ratio = match evaluated_count {
        0 => 0.0,
        n => (n - failed_count) as f64 / n as f64,
    };
    // "\r\x1b[K" returns to column 0 and clears the line before rewriting it.
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}
1452
/// Test harness bundling the edit agent under test, the project it edits,
/// and the model used to judge results.
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}
1458
impl EditAgentTest {
    /// Builds a harness: real HTTP client, fake filesystem with an empty
    /// `/root` worktree, and agent/judge models selected via the
    /// `ZED_AGENT_MODEL` / `ZED_JUDGE_MODEL` env vars (both default to
    /// `anthropic/claude-3-7-sonnet-latest`).
    async fn new(cx: &mut TestAppContext) -> Self {
        // Evals hit real model providers, so the executor must be allowed
        // to park while waiting on network I/O.
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor().clone());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);

            client::init_settings(cx);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

            // NOTE(review): `settings::init` is also called at the top of
            // this closure — this second call looks redundant; confirm.
            settings::init(cx);
            Project::init_settings(cx);
            language::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store.clone(), client.clone(), cx);
            crate::init(client.http_client(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        // Resolve and authenticate both models before constructing the agent.
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();

        Self {
            agent: EditAgent::new(
                agent_model,
                project.clone(),
                action_log,
                Templates::new(),
                edit_format,
            ),
            project,
            judge_model,
        }
    }

    /// Looks up `selected_model` (by provider id and model id) in the global
    /// registry and authenticates its provider. Panics if the model is not
    /// registered.
    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        let (provider, model) = cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .expect("Model not found");
            let provider = models.provider(&model.provider_id()).unwrap();
            (provider, model)
        })?;
        cx.update(|cx| provider.authenticate(cx))?.await?;
        Ok(model)
    }

    /// Runs a single eval: opens the target buffer, assembles the request
    /// (prepending a system prompt only if the conversation doesn't already
    /// start with one), performs the edit or overwrite, then scores the
    /// resulting buffer text with the eval's assertion.
    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();
        // Advertise every registered tool whose input schema is valid for
        // the agent model's tool-input format.
        let tools = cx.update(|cx| {
            ToolRegistry::default_global(cx)
                .tools()
                .into_iter()
                .filter_map(|tool| {
                    let input_schema = tool
                        .input_schema(self.agent.model.tool_input_format())
                        .ok()?;
                    Some(LanguageModelRequestTool {
                        name: tool.name(),
                        description: tool.description(),
                        input_schema,
                    })
                })
                .collect::<Vec<_>>()
        });
        let tool_names = tools
            .iter()
            .map(|tool| tool.name.clone())
            .collect::<Vec<_>>();
        let worktrees = vec![WorktreeContext {
            root_name: "root".to_string(),
            abs_path: Path::new("/path/to/root").into(),
            rules_file: None,
        }];
        let prompt_builder = PromptBuilder::new(None)?;
        let project_context = ProjectContext::new(worktrees, Vec::default());
        let system_prompt = prompt_builder.generate_assistant_system_prompt(
            &project_context,
            &ModelContext {
                available_tools: tool_names,
            },
        )?;

        let has_system_prompt = eval
            .conversation
            .first()
            .is_some_and(|msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let conversation = LanguageModelRequest {
            messages,
            tools,
            thinking_allowed: true,
            ..Default::default()
        };

        // `Edit` mode patches the existing buffer (optionally seeded with
        // `input_content`); any other mode overwrites the buffer wholesale.
        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            retry_on_rate_limit(async || {
                self.agent
                    .edit(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        } else {
            retry_on_rate_limit(async || {
                self.agent
                    .overwrite(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        };

        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
}
1659
/// Calls `request` until it succeeds or fails with a non-retryable error,
/// bounded by `MAX_RETRIES` attempts. Rate-limit/overload errors wait for the
/// server-provided `retry_after` (default 5s) plus random jitter; transient
/// I/O and internal-server errors use capped exponential backoff.
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
    const MAX_RETRIES: usize = 20;
    let mut attempt = 0;

    loop {
        attempt += 1;
        let response = request().await;

        // Out of retries: return whatever we got, success or error.
        if attempt >= MAX_RETRIES {
            return response;
        }

        // `Some(delay)` means retry after `delay`; `None` means return now.
        let retry_delay = match &response {
            Ok(_) => None,
            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
                Some(err) => match &err {
                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
                    }
                    LanguageModelCompletionError::UpstreamProviderError {
                        status,
                        retry_after,
                        ..
                    } => {
                        // Only retry for specific status codes
                        // (529 is a non-standard "overloaded" status used by
                        // some providers — presumably Anthropic; confirm).
                        let should_retry = matches!(
                            *status,
                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
                        ) || status.as_u16() == 529;

                        if should_retry {
                            // Use server-provided retry_after if available, otherwise use default
                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
                        } else {
                            None
                        }
                    }
                    LanguageModelCompletionError::ApiReadResponseError { .. }
                    | LanguageModelCompletionError::ApiInternalServerError { .. }
                    | LanguageModelCompletionError::HttpSend { .. } => {
                        // Exponential backoff for transient I/O and internal server errors
                        // (2^(attempt-1) seconds, capped at 30).
                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
                    }
                    _ => None,
                },
                _ => None,
            },
        };

        if let Some(retry_after) = retry_delay {
            // Add uniform jitter in [0, retry_after) to avoid thundering herds.
            let jitter = retry_after.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
            Timer::after(retry_after + jitter).await;
        } else {
            return response;
        }
    }
}
1719
/// Result of scoring one sample: a 0–100 score (scores below 80 count as
/// failures in `eval`) and an optional explanatory message from the judge.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}
1725
/// Template context for the judge prompt: the diff under review and the
/// assertions it should be graded against.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}
1731
// Renders via the `diff_judge.hbs` Handlebars template.
impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1735
/// Returns `text` with all blank/whitespace-only lines removed, joining the
/// remaining lines with `\n` (so trailing newlines are not preserved).
fn strip_empty_lines(text: &str) -> String {
    let mut kept = Vec::new();
    for line in text.lines() {
        if !line.trim().is_empty() {
            kept.push(line);
        }
    }
    kept.join("\n")
}