1use super::*;
2use crate::{
3 ReadFileToolInput,
4 edit_file_tool::{EditFileMode, EditFileToolInput},
5 grep_tool::GrepToolInput,
6 list_directory_tool::ListDirectoryToolInput,
7};
8use Role::*;
9use assistant_tool::ToolRegistry;
10use client::{Client, UserStore};
11use collections::HashMap;
12use fs::FakeFs;
13use futures::{FutureExt, future::LocalBoxFuture};
14use gpui::{AppContext, TestAppContext};
15use indoc::{formatdoc, indoc};
16use language_model::{
17 LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
18 LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
19};
20use project::Project;
21use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
22use rand::prelude::*;
23use reqwest_client::ReqwestClient;
24use serde_json::json;
25use std::{
26 cmp::Reverse,
27 fmt::{self, Display},
28 io::Write as _,
29 str::FromStr,
30 sync::mpsc,
31};
32use util::path;
33
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well agent generates multiple edit hunks.
    //
    // The conversation replays a fixed `read_file` -> `edit_file` exchange;
    // the assertion passes when the agent's edit matches ANY of several
    // known-good diffs for this extract-method refactor.
    //
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 0.98
    // gemini-2.5-pro              | 0.86
    // gemini-2.5-flash            | 0.11
    // gpt-4.1                     | 1.00

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    // Multiple acceptable placements/formattings of the extracted method.
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100,
        0.7, // Taking the lower bar for Gemini
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}
108
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Tests that the agent can delete exactly one function without touching
    // its call sites. The result must match the `after.rs` fixture exactly
    // (empty lines ignored by `assert_eq`).
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}
162
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    // Tests a whole-file edit that touches many scattered doc comments; since
    // there is no single expected output, an LLM judge scores the diff.
    //
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              |
    // gemini-2.5-pro-preview-03-25   | 1.0
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200,
        1.,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}
224
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Tests a larger rewrite (swapping toolchains in a download/compile
    // routine), scored by an LLM judge against natural-language assertions
    // rather than an exact diff.
    //
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.98
    // gemini-2.5-pro-preview-03-25   | 0.99
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                // NOTE(review): the x86_64-linux and arm64-linux assets appear
                // twice in the list below — presumably a copy-paste slip, but
                // changing the prompt would change the eval, so it is left
                // as-is; confirm whether the duplication is intentional.
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                // The assistant reads the relevant region in three chunks.
                // NOTE(review): `ReadFileToolInput` line numbers look 1-based
                // while `lines` skips 0-based, so each tool result is shifted
                // by one line relative to the request — presumably immaterial
                // to the eval, but worth confirming.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}
349
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Tests commenting-out edits spread across a very large file, where the
    // model only ever saw disjoint grep excerpts rather than the whole file.
    //
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              |
    // gemini-2.5-pro-preview-03-25   | 1.0
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                // Simulated grep output: several disjoint excerpts of the
                // fixture glued together.
                // NOTE(review): `join` only inserts the separator *between*
                // chunks, so the first excerpt has no "Match found:" header —
                // confirm that's the intended transcript shape.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                // NOTE(review): tool ids jump from tool_1 to tool_4 —
                // presumably this transcript was trimmed from a longer one;
                // ids don't appear to need to be sequential, but confirm.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
        ),
    );
}
432
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Tests adding new code (a constructor plus tests) rather than modifying
    // existing code; the transcript includes the agent's exploratory greps
    // for existing test modules, all of which come up empty for this file.
    //
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              |
    // gemini-2.5-pro-preview-03-25   | 0.94
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                // Canned grep output showing test patterns from *other* files,
                // so the agent has style examples but no existing test module
                // in the target file.
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_weight() {
                                // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_stretch() {
                                // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                            }

                            #[test]
                            fn test_css_to_core_text_font_stretch() {
                                // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}
632
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    // Tests creating a brand-new file from a prompt. The assertion only
    // inspects how the created file *starts*: a leading space, backtick, or
    // newline indicates the model leaked prose or a code fence into the file.
    //
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0
    // gemini-2.5-pro-preview-03-25   | 1.0
    // gemini-2.5-flash-preview-04-17 | 1.0
    // gpt-4.1                        | 1.0
    let input_file_path = "root/zode.py";
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        50,
        1.,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                // Characters a real script never starts with; each signals
                // leaked chatter or a markdown fence at the top of the file.
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                // Empty message means no invalid start was found: full score.
                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}
738
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    // Tests appending a new test to a large file the model only saw via an
    // outline plus a few read excerpts — the hardest setup here, hence the
    // low pass rates and relaxed threshold below.
    //
    // Results for 2025-05-22
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.16
    // gemini-2.5-pro-preview-03-25   | 0.35
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                // Canned outline-mode read_file response: symbols with line
                // ranges instead of file contents.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                                tracked_buffers [L15]
                                edited_since_project_diagnostics_check [L17]
                                project [L19]
                            impl ActionLog [L22-498]
                                pub fn new [L24-30]
                                pub fn project [L32-34]
                                pub fn checked_project_diagnostics [L37-39]
                                pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                                fn track_buffer_internal [L46-101]
                                fn handle_buffer_event [L103-116]
                                fn handle_buffer_edited [L118-123]
                                fn handle_buffer_file_changed [L125-158]
                                async fn maintain_diff [L160-264]
                                pub fn buffer_read [L267-269]
                                pub fn buffer_created [L272-276]
                                pub fn buffer_edited [L279-287]
                                pub fn will_delete_buffer [L289-304]
                                pub fn keep_edits_in_range [L306-364]
                                pub fn reject_edits_in_ranges [L366-459]
                                pub fn keep_all_edits [L461-473]
                                pub fn changed_buffers [L476-482]
                                pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                                User [L618]
                                Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                                Created [L624]
                                Modified [L625]
                                Deleted [L626]
                            struct TrackedBuffer [L629-641]
                                buffer [L630]
                                base_text [L631]
                                unreviewed_changes [L632]
                                status [L633]
                                version [L634]
                                diff [L635]
                                snapshot [L636]
                                diff_update [L637]
                                _open_lsp_handle [L638]
                                _maintain_diff [L639]
                                _subscription [L640]
                            impl TrackedBuffer [L643-657]
                                fn has_changes [L644-650]
                                fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                                pub diff [L660]
                            mod tests [L664-1574]
                                fn init_logger [L678-682]
                                fn init_test [L684-691]
                                async fn test_keep_edits [L694-769]
                                async fn test_deletions [L772-854]
                                async fn test_overlapping_user_edits [L857-951]
                                async fn test_creating_files [L954-1010]
                                async fn test_deleting_files [L1013-1120]
                                async fn test_reject_edits [L1123-1255]
                                async fn test_reject_multiple_edits [L1258-1331]
                                async fn test_reject_deleted_file [L1334-1388]
                                async fn test_reject_created_file [L1391-1443]
                                async fn test_random_diffs [L1446-1535]
                                    fn quiesce [L1510-1534]
                                struct HunkStatus [L1538-1542]
                                    range [L1539]
                                    diff_status [L1540]
                                    old_text [L1541]
                                fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                // The agent drills into three relevant regions before editing.
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}
969
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Results for 2025-05-21:
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.00
    // gemini-2.5-pro-preview-03-25   | 1.00
    // gemini-2.5-flash-preview-04-17 | 1.00
    // gpt-4.1                        | 1.00
    //
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    let input_file_content = None;
    // The created file must end up completely empty.
    let expected_output_content = String::new();
    eval(
        100,
        0.99,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}
1048
1049fn message(
1050 role: Role,
1051 contents: impl IntoIterator<Item = MessageContent>,
1052) -> LanguageModelRequestMessage {
1053 LanguageModelRequestMessage {
1054 role,
1055 content: contents.into_iter().collect(),
1056 cache: false,
1057 }
1058}
1059
1060fn text(text: impl Into<String>) -> MessageContent {
1061 MessageContent::Text(text.into())
1062}
1063
/// Returns the zero-indexed, half-open `range` of lines from `input`, joined
/// with newlines. Portions of the range beyond the end of `input` are ignored.
fn lines(input: &str, range: Range<usize>) -> String {
    let mut selected = String::new();
    for (index, line) in input.lines().enumerate() {
        if index >= range.end {
            break;
        }
        if index >= range.start {
            if !selected.is_empty() {
                selected.push('\n');
            }
            selected.push_str(line);
        }
    }
    selected
}
1072
1073fn tool_use(
1074 id: impl Into<Arc<str>>,
1075 name: impl Into<Arc<str>>,
1076 input: impl Serialize,
1077) -> MessageContent {
1078 MessageContent::ToolUse(LanguageModelToolUse {
1079 id: LanguageModelToolUseId::from(id.into()),
1080 name: name.into(),
1081 raw_input: serde_json::to_string_pretty(&input).unwrap(),
1082 input: serde_json::to_value(input).unwrap(),
1083 is_input_complete: true,
1084 })
1085}
1086
1087fn tool_result(
1088 id: impl Into<Arc<str>>,
1089 name: impl Into<Arc<str>>,
1090 result: impl Into<Arc<str>>,
1091) -> MessageContent {
1092 MessageContent::ToolResult(LanguageModelToolResult {
1093 tool_use_id: LanguageModelToolUseId::from(id.into()),
1094 tool_name: name.into(),
1095 is_error: false,
1096 content: LanguageModelToolResultContent::Text(result.into()),
1097 output: None,
1098 })
1099}
1100
/// A fully-specified eval case: a canned conversation whose final assistant
/// message requests an `edit_file` call, plus the starting file contents and
/// the assertion used to score the agent's edit.
#[derive(Clone)]
struct EvalInput {
    // Canned request/response transcript replayed to the model under test.
    conversation: Vec<LanguageModelRequestMessage>,
    // Input of the final `edit_file` tool use, extracted from `conversation`.
    edit_file_input: EditFileToolInput,
    // Initial contents of the edited file; `None` when creating a new file.
    input_content: Option<String>,
    // Scoring predicate applied to the resulting edit.
    assertion: EvalAssertion,
}
1108
1109impl EvalInput {
1110 fn from_conversation(
1111 conversation: Vec<LanguageModelRequestMessage>,
1112 input_content: Option<String>,
1113 assertion: EvalAssertion,
1114 ) -> Self {
1115 let msg = conversation.last().expect("Conversation must not be empty");
1116 if msg.role != Role::Assistant {
1117 panic!("Conversation must end with an assistant message");
1118 }
1119 let tool_use = msg
1120 .content
1121 .iter()
1122 .flat_map(|content| match content {
1123 MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
1124 Some(tool_use)
1125 }
1126 _ => None,
1127 })
1128 .next()
1129 .expect("Conversation must end with an edit_file tool use")
1130 .clone();
1131
1132 let edit_file_input: EditFileToolInput =
1133 serde_json::from_value(tool_use.input.clone()).unwrap();
1134
1135 EvalInput {
1136 conversation,
1137 edit_file_input,
1138 input_content,
1139 assertion,
1140 }
1141 }
1142}
1143
/// The result of running a single eval iteration.
#[derive(Clone)]
struct EvalSample {
    // File contents before the agent's edit.
    text_before: String,
    // File contents after the agent's edit.
    text_after: String,
    // Raw edit-agent output, including parser metrics.
    edit_output: EditAgentOutput,
    // Diff between `text_before` and `text_after`, shown to judges/reports.
    diff: String,
}
1151
/// Object-safe form of an async assertion over an [`EvalSample`].
///
/// The returned future borrows `self`, the sample, and the test context, so
/// it is boxed locally (`LocalBoxFuture`) rather than required to be `Send`.
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
1160
/// Blanket implementation so any matching async closure can be used directly
/// as an assertion, without a hand-written wrapper type.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        // Invoke the closure and box the future to erase its concrete type.
        (self)(sample, judge_model, cx).boxed_local()
    }
}
1181
/// Cloneable handle to a shared scoring function, so one assertion can be
/// reused across every iteration of an eval.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
1184
1185impl EvalAssertion {
1186 fn new<F>(f: F) -> Self
1187 where
1188 F: 'static
1189 + Send
1190 + Sync
1191 + AsyncFn(
1192 &EvalSample,
1193 Arc<dyn LanguageModel>,
1194 &mut TestAppContext,
1195 ) -> Result<EvalAssertionOutcome>,
1196 {
1197 EvalAssertion(Arc::new(f))
1198 }
1199
1200 fn assert_eq(expected: impl Into<String>) -> Self {
1201 let expected = expected.into();
1202 Self::new(async move |sample, _judge, _cx| {
1203 Ok(EvalAssertionOutcome {
1204 score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
1205 100
1206 } else {
1207 0
1208 },
1209 message: None,
1210 })
1211 })
1212 }
1213
1214 fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
1215 let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
1216 Self::new(async move |sample, _judge, _cx| {
1217 let matches = expected_diffs.iter().any(|possible_diff| {
1218 let expected =
1219 language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
1220 strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
1221 });
1222
1223 Ok(EvalAssertionOutcome {
1224 score: if matches { 100 } else { 0 },
1225 message: None,
1226 })
1227 })
1228 }
1229
1230 fn judge_diff(assertions: &'static str) -> Self {
1231 Self::new(async move |sample, judge, cx| {
1232 let prompt = DiffJudgeTemplate {
1233 diff: sample.diff.clone(),
1234 assertions,
1235 }
1236 .render(&Templates::new())
1237 .unwrap();
1238
1239 let request = LanguageModelRequest {
1240 messages: vec![LanguageModelRequestMessage {
1241 role: Role::User,
1242 content: vec![prompt.into()],
1243 cache: false,
1244 }],
1245 ..Default::default()
1246 };
1247 let mut response = judge
1248 .stream_completion_text(request, &cx.to_async())
1249 .await?;
1250 let mut output = String::new();
1251 while let Some(chunk) = response.stream.next().await {
1252 let chunk = chunk?;
1253 output.push_str(&chunk);
1254 }
1255
1256 // Parse the score from the response
1257 let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
1258 if let Some(captures) = re.captures(&output) {
1259 if let Some(score_match) = captures.get(1) {
1260 let score = score_match.as_str().parse().unwrap_or(0);
1261 return Ok(EvalAssertionOutcome {
1262 score,
1263 message: Some(output),
1264 });
1265 }
1266 }
1267
1268 anyhow::bail!("No score found in response. Raw output: {output}");
1269 })
1270 }
1271
    /// Evaluates `input` with this assertion. `judge_model` is only consulted
    /// by assertions that delegate grading to a model (e.g. `judge_diff`).
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
1280}
1281
/// Runs `eval` for `iterations` runs (one inline first, the rest concurrently
/// on the background executor) and panics if the observed pass ratio falls
/// below `expected_pass_ratio`, or if more than 5% of edit tags were
/// mismatched by the parser across all runs.
fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval so that
    // all the next ones are cached.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    // Remaining iterations run concurrently on the background executor.
    let executor = gpui::background_executor();
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        executor.spawn(async move { run_eval(eval, tx) }).detach();
    }
    // Drop our sender so `rx.recv()` unblocks once every spawned run's
    // sender clone has been consumed.
    drop(tx);

    // Collect results as they arrive, grouping failures by resulting buffer
    // text and errors by message so duplicates are reported with a count.
    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                // Assertion scores below 80 count as failures.
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text_after.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        // Report the most frequent errors and failures first.
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            // Runs that produced identical buffer text failed identically;
            // print one representative example per group.
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    // Even when the pass ratio is acceptable, fail loudly if the edit parser
    // saw too many mismatched tags overall.
    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > 0.05 {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}
1361
1362fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
1363 let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
1364 let mut cx = TestAppContext::build(dispatcher, None);
1365 let output = cx.executor().block_test(async {
1366 let test = EditAgentTest::new(&mut cx).await;
1367 test.eval(eval, &mut cx).await
1368 });
1369 tx.send(output).unwrap();
1370}
1371
/// The result of one eval iteration: the sampled edit plus the assertion's
/// verdict on it.
#[derive(Clone)]
struct EvalOutput {
    /// Inputs and outputs captured from one agent edit run.
    sample: EvalSample,
    /// The assertion's score and optional explanatory message.
    assertion: EvalAssertionOutcome,
}
1377
1378impl Display for EvalOutput {
1379 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1380 writeln!(f, "Score: {:?}", self.assertion.score)?;
1381 if let Some(message) = self.assertion.message.as_ref() {
1382 writeln!(f, "Message: {}", message)?;
1383 }
1384
1385 writeln!(f, "Diff:\n{}", self.sample.diff)?;
1386
1387 writeln!(
1388 f,
1389 "Parser Metrics:\n{:#?}",
1390 self.sample.edit_output.parser_metrics
1391 )?;
1392 writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
1393 Ok(())
1394 }
1395}
1396
/// Redraws a single-line progress indicator on stdout:
/// "Evaluated X/Y (Z% passed)". Uses `\r` + clear-to-end-of-line so each
/// update overwrites the previous one.
fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    // Guard against dividing by zero before the first eval completes.
    let passed_ratio = match evaluated_count {
        0 => 0.0,
        n => (n - failed_count) as f64 / n as f64,
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    // `print!` doesn't flush; force the partial line out immediately.
    std::io::stdout().flush().unwrap();
}
1412
/// Harness bundling everything needed to run one eval against live models.
struct EditAgentTest {
    /// The edit agent under test, configured with the selected agent model.
    agent: EditAgent,
    /// Test project rooted at "/root" on a fake filesystem.
    project: Entity<Project>,
    /// Model used to grade outputs in judge-based assertions.
    judge_model: Arc<dyn LanguageModel>,
}
1418
impl EditAgentTest {
    /// Builds the harness: initializes gpui and global state, connects the
    /// production client, resolves the agent and judge models named by the
    /// `ZED_AGENT_MODEL` / `ZED_JUDGE_MODEL` env vars (both defaulting to
    /// `anthropic/claude-3-7-sonnet-latest`), and creates a test project
    /// rooted at "/root" on a fake filesystem.
    async fn new(cx: &mut TestAppContext) -> Self {
        // Evals perform real HTTP requests, so let the test executor park.
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor().clone());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);

            client::init_settings(cx);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

            // NOTE(review): `settings::init` was already called at the top of
            // this closure — presumably idempotent; confirm this second call
            // is intentional and doesn't clobber settings registered above.
            settings::init(cx);
            Project::init_settings(cx);
            language::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
            crate::init(client.http_client(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        // Model selections come from the environment so evals can target
        // different providers without code changes.
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        // Resolve and authenticate both models before constructing the agent.
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        Self {
            agent: EditAgent::new(agent_model, project.clone(), action_log, Templates::new()),
            project,
            judge_model,
        }
    }

    /// Looks up `selected_model` in the global language-model registry and
    /// authenticates its provider. Panics if the model isn't registered.
    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        let (provider, model) = cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .expect("Model not found");
            let provider = models.provider(&model.provider_id()).unwrap();
            (provider, model)
        })?;
        cx.update(|cx| provider.authenticate(cx))?.await?;
        Ok(model)
    }

    /// Runs one eval: opens the target buffer, builds the tool list and
    /// system prompt, performs the edit (or overwrite), and scores the
    /// result with the eval's assertion.
    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();
        // Advertise every registered tool whose input schema is compatible
        // with the agent model's tool input format.
        let tools = cx.update(|cx| {
            ToolRegistry::default_global(cx)
                .tools()
                .into_iter()
                .filter_map(|tool| {
                    let input_schema = tool
                        .input_schema(self.agent.model.tool_input_format())
                        .ok()?;
                    Some(LanguageModelRequestTool {
                        name: tool.name(),
                        description: tool.description(),
                        input_schema,
                    })
                })
                .collect::<Vec<_>>()
        });
        let tool_names = tools
            .iter()
            .map(|tool| tool.name.clone())
            .collect::<Vec<_>>();
        let worktrees = vec![WorktreeContext {
            root_name: "root".to_string(),
            rules_file: None,
        }];
        let prompt_builder = PromptBuilder::new(None)?;
        let project_context = ProjectContext::new(worktrees, Vec::default());
        let system_prompt = prompt_builder.generate_assistant_system_prompt(
            &project_context,
            &ModelContext {
                available_tools: tool_names,
            },
        )?;

        // Prepend the generated system prompt unless the eval's conversation
        // already begins with its own system message.
        let has_system_prompt = eval
            .conversation
            .first()
            .map_or(false, |msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let conversation = LanguageModelRequest {
            messages,
            tools,
            ..Default::default()
        };

        // Edit mode mutates the existing buffer (seeded from `input_content`
        // when provided); any other mode overwrites the file wholesale.
        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            let (edit_output, _) = self.agent.edit(
                buffer.clone(),
                eval.edit_file_input.display_description,
                &conversation,
                &mut cx.to_async(),
            );
            edit_output.await?
        } else {
            let (edit_output, _) = self.agent.overwrite(
                buffer.clone(),
                eval.edit_file_input.display_description,
                &conversation,
                &mut cx.to_async(),
            );
            edit_output.await?
        };

        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
}
1599
/// Outcome of a single assertion: a numeric score plus an optional message
/// (e.g. the judge model's raw response).
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    /// Score for the sample; runs scoring below 80 are counted as failures.
    score: usize,
    /// Optional explanation accompanying the score.
    message: Option<String>,
}
1605
/// Template context for the diff-judging prompt: the diff produced by the
/// agent plus the natural-language assertions the judge should check.
#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    /// Unified diff of the agent's edit.
    diff: String,
    /// The rubric text the judge grades the diff against.
    assertions: &'static str,
}
1611
impl Template for DiffJudgeTemplate {
    // Rendered via the `diff_judge.hbs` Handlebars template.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
1615
/// Returns `text` with every blank (empty or whitespace-only) line removed,
/// joining the surviving lines with `\n`. Kept lines retain their original
/// leading/trailing whitespace.
fn strip_empty_lines(text: &str) -> String {
    let mut kept = Vec::new();
    for line in text.lines() {
        // A line survives if it contains at least one non-whitespace char.
        if line.chars().any(|c| !c.is_whitespace()) {
            kept.push(line);
        }
    }
    kept.join("\n")
}