1use crate::tools::streaming_edit_file_tool::*;
2use crate::{
3 AgentTool, ContextServerRegistry, EditFileTool, GrepTool, GrepToolInput, ListDirectoryTool,
4 ListDirectoryToolInput, ReadFileTool, ReadFileToolInput, StreamingEditFileTool, Template,
5 Templates, Thread, ToolCallEventStream, ToolInput,
6};
7use Role::*;
8use anyhow::{Context as _, Result};
9use client::{Client, RefreshLlmTokenListener, UserStore};
10use fs::FakeFs;
11use futures::{FutureExt, StreamExt, future::LocalBoxFuture};
12use gpui::{AppContext as _, AsyncApp, Entity, TestAppContext, UpdateGlobal as _};
13use http_client::StatusCode;
14use language::language_settings::FormatOnSave;
15use language_model::{
16 LanguageModel, LanguageModelCompletionError, LanguageModelCompletionEvent,
17 LanguageModelRegistry, LanguageModelRequest, LanguageModelRequestMessage,
18 LanguageModelRequestTool, LanguageModelToolResult, LanguageModelToolResultContent,
19 LanguageModelToolSchemaFormat, LanguageModelToolUse, LanguageModelToolUseId, MessageContent,
20 Role, SelectedModel,
21};
22use project::Project;
23use prompt_store::{ProjectContext, WorktreeContext};
24use rand::prelude::*;
25use reqwest_client::ReqwestClient;
26use serde::Serialize;
27use serde_json::json;
28use settings::SettingsStore;
29use std::{
30 fmt::{self, Display},
31 path::{Path, PathBuf},
32 str::FromStr,
33 sync::Arc,
34 time::Duration,
35};
36use util::path;
37
/// Template context rendered into the prompt that asks the judge model to
/// score a diff (see `EvalAssertion::judge_diff`).
#[derive(Serialize)]
struct DiffJudgeTemplate {
    /// The diff text shown to the judge; filled from `EvalSample::diff`.
    diff: String,
    /// The criteria the judge is asked to check the diff against.
    assertions: &'static str,
}

/// Associates the context with the `diff_judge.hbs` template.
impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
47
48#[derive(Clone)]
49struct EvalInput {
50 conversation: Vec<LanguageModelRequestMessage>,
51 input_file_path: PathBuf,
52 input_content: Option<String>,
53 assertion: EvalAssertion,
54}
55
56impl EvalInput {
57 fn new(
58 conversation: Vec<LanguageModelRequestMessage>,
59 input_file_path: impl Into<PathBuf>,
60 input_content: Option<String>,
61 assertion: EvalAssertion,
62 ) -> Self {
63 EvalInput {
64 conversation,
65 input_file_path: input_file_path.into(),
66 input_content,
67 assertion,
68 }
69 }
70}
71
/// The observable result of one tool run, handed to an `EvalAssertion` for
/// scoring.
#[derive(Clone)]
struct EvalSample {
    /// File contents before the edit (empty when the eval had no input content).
    text_before: String,
    /// File contents reported by the tool after the edit.
    text_after: String,
    /// The tool input the model produced, as parsed from its tool call.
    tool_input: StreamingEditFileToolInput,
    /// Unified diff between `text_before` and `text_after`.
    diff: String,
}
79
80trait AssertionFn: 'static + Send + Sync {
81 fn assert<'a>(
82 &'a self,
83 sample: &'a EvalSample,
84 judge_model: Arc<dyn LanguageModel>,
85 cx: &'a mut TestAppContext,
86 ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
87}
88
89impl<F> AssertionFn for F
90where
91 F: 'static
92 + Send
93 + Sync
94 + AsyncFn(
95 &EvalSample,
96 Arc<dyn LanguageModel>,
97 &mut TestAppContext,
98 ) -> Result<EvalAssertionOutcome>,
99{
100 fn assert<'a>(
101 &'a self,
102 sample: &'a EvalSample,
103 judge_model: Arc<dyn LanguageModel>,
104 cx: &'a mut TestAppContext,
105 ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
106 (self)(sample, judge_model, cx).boxed_local()
107 }
108}
109
110#[derive(Clone)]
111struct EvalAssertion(Arc<dyn AssertionFn>);
112
113impl EvalAssertion {
114 fn new<F>(f: F) -> Self
115 where
116 F: 'static
117 + Send
118 + Sync
119 + AsyncFn(
120 &EvalSample,
121 Arc<dyn LanguageModel>,
122 &mut TestAppContext,
123 ) -> Result<EvalAssertionOutcome>,
124 {
125 EvalAssertion(Arc::new(f))
126 }
127
128 fn assert_eq(expected: impl Into<String>) -> Self {
129 let expected = expected.into();
130 Self::new(async move |sample, _judge, _cx| {
131 Ok(EvalAssertionOutcome {
132 score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
133 100
134 } else {
135 0
136 },
137 message: None,
138 })
139 })
140 }
141
142 fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
143 let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
144 Self::new(async move |sample, _judge, _cx| {
145 let matches = expected_diffs.iter().any(|possible_diff| {
146 language::apply_diff_patch(&sample.text_before, possible_diff)
147 .map(|expected| {
148 strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
149 })
150 .unwrap_or(false)
151 });
152
153 Ok(EvalAssertionOutcome {
154 score: if matches { 100 } else { 0 },
155 message: None,
156 })
157 })
158 }
159
160 fn judge_diff(assertions: &'static str) -> Self {
161 Self::new(async move |sample, judge, cx| {
162 let prompt = DiffJudgeTemplate {
163 diff: sample.diff.clone(),
164 assertions,
165 }
166 .render(&Templates::new())
167 .context("Failed to render diff judge template")?;
168
169 let request = LanguageModelRequest {
170 messages: vec![LanguageModelRequestMessage {
171 role: Role::User,
172 content: vec![prompt.into()],
173 cache: false,
174 reasoning_details: None,
175 }],
176 thinking_allowed: true,
177 thinking_effort: judge
178 .default_effort_level()
179 .map(|effort_level| effort_level.value.to_string()),
180 ..Default::default()
181 };
182 let mut response = retry_on_rate_limit(async || {
183 Ok(judge
184 .stream_completion_text(request.clone(), &cx.to_async())
185 .await?)
186 })
187 .await?;
188 let mut output = String::new();
189 while let Some(chunk) = response.stream.next().await {
190 let chunk = chunk?;
191 output.push_str(&chunk);
192 }
193
194 let re = regex::Regex::new(r"<score>(\d+)</score>")
195 .context("Failed to compile score regex")?;
196 if let Some(captures) = re.captures(&output)
197 && let Some(score_match) = captures.get(1)
198 {
199 let score = score_match.as_str().parse().unwrap_or(0);
200 return Ok(EvalAssertionOutcome {
201 score,
202 message: Some(output),
203 });
204 }
205
206 anyhow::bail!("No score found in response. Raw output: {output}");
207 })
208 }
209
210 async fn run(
211 &self,
212 input: &EvalSample,
213 judge_model: Arc<dyn LanguageModel>,
214 cx: &mut TestAppContext,
215 ) -> Result<EvalAssertionOutcome> {
216 self.0.assert(input, judge_model, cx).await
217 }
218}
219
220#[derive(Clone)]
221struct StreamingEditEvalOutput {
222 sample: EvalSample,
223 assertion: EvalAssertionOutcome,
224}
225
226impl Display for StreamingEditEvalOutput {
227 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
228 writeln!(f, "Score: {:?}", self.assertion.score)?;
229 if let Some(message) = self.assertion.message.as_ref() {
230 writeln!(f, "Message: {}", message)?;
231 }
232 writeln!(f, "Diff:\n{}", self.sample.diff)?;
233 writeln!(f, "Tool Input:\n{:#?}", self.sample.tool_input)?;
234 Ok(())
235 }
236}
237
/// Result of running an `EvalAssertion` against a sample.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    /// Numeric score; `run_eval` treats anything below 80 as a failure.
    score: usize,
    /// Optional explanation, e.g. the judge model's raw reply.
    message: Option<String>,
}
243
/// Shared fixture for running streaming edit-file evals.
struct StreamingEditToolTest {
    /// In-memory file system backing the test project.
    fs: Arc<FakeFs>,
    /// Test project rooted at `/root`.
    project: Entity<Project>,
    /// Model asked to produce the edit tool call.
    model: Arc<dyn LanguageModel>,
    /// Model passed to assertions for judging results.
    judge_model: Arc<dyn LanguageModel>,
    /// Default effort level of `model`, forwarded on every agent request.
    model_thinking_effort: Option<String>,
}
251
252impl StreamingEditToolTest {
253 async fn new(cx: &mut TestAppContext) -> Self {
254 cx.executor().allow_parking();
255
256 let fs = FakeFs::new(cx.executor());
257 cx.update(|cx| {
258 let settings_store = SettingsStore::test(cx);
259 cx.set_global(settings_store);
260 SettingsStore::update_global(cx, |store: &mut SettingsStore, cx| {
261 store.update_user_settings(cx, |settings| {
262 settings
263 .project
264 .all_languages
265 .defaults
266 .ensure_final_newline_on_save = Some(false);
267 settings.project.all_languages.defaults.format_on_save =
268 Some(FormatOnSave::Off);
269 });
270 });
271
272 gpui_tokio::init(cx);
273 let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
274 cx.set_http_client(http_client);
275 let client = Client::production(cx);
276 let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
277 language_model::init(cx);
278 RefreshLlmTokenListener::register(client.clone(), user_store.clone(), cx);
279 language_models::init(user_store, client, cx);
280 });
281
282 fs.insert_tree("/root", json!({})).await;
283 let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
284 let agent_model = SelectedModel::from_str(
285 &std::env::var("ZED_AGENT_MODEL")
286 .unwrap_or("anthropic/claude-sonnet-4-6-latest".into()),
287 )
288 .unwrap();
289 let judge_model = SelectedModel::from_str(
290 &std::env::var("ZED_JUDGE_MODEL")
291 .unwrap_or("anthropic/claude-sonnet-4-6-latest".into()),
292 )
293 .unwrap();
294
295 let authenticate_provider_tasks = cx.update(|cx| {
296 LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
297 registry
298 .providers()
299 .iter()
300 .map(|p| p.authenticate(cx))
301 .collect::<Vec<_>>()
302 })
303 });
304 let (model, judge_model) = cx
305 .update(|cx| {
306 cx.spawn(async move |cx| {
307 futures::future::join_all(authenticate_provider_tasks).await;
308 let model = Self::load_model(&agent_model, cx).await;
309 let judge_model = Self::load_model(&judge_model, cx).await;
310 (model.unwrap(), judge_model.unwrap())
311 })
312 })
313 .await;
314
315 let model_thinking_effort = model
316 .default_effort_level()
317 .map(|effort_level| effort_level.value.to_string());
318
319 Self {
320 fs,
321 project,
322 model,
323 judge_model,
324 model_thinking_effort,
325 }
326 }
327
328 async fn load_model(
329 selected_model: &SelectedModel,
330 cx: &mut AsyncApp,
331 ) -> Result<Arc<dyn LanguageModel>> {
332 cx.update(|cx| {
333 let registry = LanguageModelRegistry::read_global(cx);
334 let provider = registry
335 .provider(&selected_model.provider)
336 .expect("Provider not found");
337 provider.authenticate(cx)
338 })
339 .await?;
340 Ok(cx.update(|cx| {
341 let models = LanguageModelRegistry::read_global(cx);
342 models
343 .available_models(cx)
344 .find(|model| {
345 model.provider_id() == selected_model.provider
346 && model.id() == selected_model.model
347 })
348 .unwrap_or_else(|| panic!("Model {} not found", selected_model.model.0))
349 }))
350 }
351
352 /// Build the tool definitions for the model, replacing `edit_file` with the
353 /// streaming edit file tool schema. In production the streaming tool is
354 /// exposed under the name `"edit_file"` (see `Thread::enabled_tools`), so
355 /// the model has never seen the name `"streaming_edit_file"`.
356 fn build_tools() -> Vec<LanguageModelRequestTool> {
357 let mut tools: Vec<LanguageModelRequestTool> = crate::built_in_tools()
358 .filter(|tool| tool.name != EditFileTool::NAME)
359 .collect();
360 tools.push(LanguageModelRequestTool {
361 name: EditFileTool::NAME.to_string(),
362 description: StreamingEditFileTool::description().to_string(),
363 input_schema: StreamingEditFileTool::input_schema(
364 LanguageModelToolSchemaFormat::JsonSchema,
365 )
366 .to_value(),
367 use_input_streaming: StreamingEditFileTool::supports_input_streaming(),
368 });
369 tools
370 }
371
372 async fn eval(
373 &self,
374 mut eval: EvalInput,
375 cx: &mut TestAppContext,
376 ) -> Result<StreamingEditEvalOutput> {
377 eval.conversation
378 .last_mut()
379 .context("Conversation must not be empty")?
380 .cache = true;
381
382 // Populate the FakeFs so `resolve_path` / `entry_for_path` can find
383 // the file in the worktree.
384 if let Some(input_content) = eval.input_content.as_deref() {
385 let abs_path = Path::new("/root").join(
386 eval.input_file_path
387 .strip_prefix("root")
388 .unwrap_or(&eval.input_file_path),
389 );
390 self.fs.insert_file(&abs_path, input_content.into()).await;
391
392 // Wait for the worktree to pick up the new file.
393 cx.run_until_parked();
394 }
395
396 let tools = Self::build_tools();
397
398 let system_prompt = {
399 let worktrees = vec![WorktreeContext {
400 root_name: "root".to_string(),
401 abs_path: Path::new("/path/to/root").into(),
402 rules_file: None,
403 }];
404 let project_context = ProjectContext::new(worktrees, Vec::default());
405 let tool_names = tools
406 .iter()
407 .map(|tool| tool.name.clone().into())
408 .collect::<Vec<_>>();
409 let template = crate::SystemPromptTemplate {
410 project: &project_context,
411 available_tools: tool_names,
412 model_name: None,
413 };
414 let templates = Templates::new();
415 template.render(&templates)?
416 };
417
418 let has_system_prompt = eval
419 .conversation
420 .first()
421 .is_some_and(|msg| msg.role == Role::System);
422 let messages = if has_system_prompt {
423 eval.conversation
424 } else {
425 [LanguageModelRequestMessage {
426 role: Role::System,
427 content: vec![MessageContent::Text(system_prompt)],
428 cache: true,
429 reasoning_details: None,
430 }]
431 .into_iter()
432 .chain(eval.conversation)
433 .collect::<Vec<_>>()
434 };
435
436 let request = LanguageModelRequest {
437 messages,
438 tools,
439 thinking_allowed: true,
440 thinking_effort: self.model_thinking_effort.clone(),
441 ..Default::default()
442 };
443
444 // The model will call the tool as "edit_file" (the production-visible
445 // name), but the schema is from StreamingEditFileTool.
446 let tool_input =
447 retry_on_rate_limit(async || self.extract_tool_use(request.clone(), cx).await).await?;
448
449 let language_registry = self
450 .project
451 .read_with(cx, |project, _cx| project.languages().clone());
452
453 let context_server_registry = cx
454 .new(|cx| ContextServerRegistry::new(self.project.read(cx).context_server_store(), cx));
455 let thread = cx.new(|cx| {
456 Thread::new(
457 self.project.clone(),
458 cx.new(|_cx| ProjectContext::default()),
459 context_server_registry,
460 Templates::new(),
461 Some(self.model.clone()),
462 cx,
463 )
464 });
465 let action_log = thread.read_with(cx, |thread, _| thread.action_log().clone());
466
467 let tool = Arc::new(StreamingEditFileTool::new(
468 self.project.clone(),
469 thread.downgrade(),
470 action_log,
471 language_registry,
472 ));
473
474 let result = cx
475 .update(|cx| {
476 tool.clone().run(
477 ToolInput::resolved(tool_input.clone()),
478 ToolCallEventStream::test().0,
479 cx,
480 )
481 })
482 .await;
483
484 let output = match result {
485 Ok(output) => output,
486 Err(output) => {
487 anyhow::bail!("Tool returned error: {}", output);
488 }
489 };
490
491 let StreamingEditFileToolOutput::Success { new_text, .. } = &output else {
492 anyhow::bail!("Tool returned error output: {}", output);
493 };
494
495 let sample = EvalSample {
496 tool_input,
497 diff: language::unified_diff(
498 eval.input_content.as_deref().unwrap_or_default(),
499 new_text,
500 ),
501 text_before: eval.input_content.unwrap_or_default(),
502 text_after: new_text.clone(),
503 };
504
505 let assertion = eval
506 .assertion
507 .run(&sample, self.judge_model.clone(), cx)
508 .await?;
509
510 Ok(StreamingEditEvalOutput { assertion, sample })
511 }
512
513 /// Stream the model completion and extract the first complete tool use
514 /// whose name matches `EditFileTool::NAME` (the production-visible name
515 /// for the streaming edit tool), parsed as `StreamingEditFileToolInput`.
516 async fn extract_tool_use(
517 &self,
518 request: LanguageModelRequest,
519 cx: &mut TestAppContext,
520 ) -> Result<StreamingEditFileToolInput> {
521 let model = self.model.clone();
522 let events = cx
523 .update(|cx| {
524 let async_cx = cx.to_async();
525 cx.foreground_executor()
526 .spawn(async move { model.stream_completion(request, &async_cx).await })
527 })
528 .await
529 .map_err(|err| anyhow::anyhow!("completion error: {}", err))?;
530
531 let mut streamed_text = String::new();
532 let mut stop_reason = None;
533 let mut parse_errors = Vec::new();
534
535 let mut events = events.fuse();
536 while let Some(event) = events.next().await {
537 match event {
538 Ok(LanguageModelCompletionEvent::ToolUse(tool_use))
539 if tool_use.is_input_complete
540 && tool_use.name.as_ref() == EditFileTool::NAME =>
541 {
542 let input: StreamingEditFileToolInput = serde_json::from_value(tool_use.input)
543 .context("Failed to parse tool input as StreamingEditFileToolInput")?;
544 return Ok(input);
545 }
546 Ok(LanguageModelCompletionEvent::Text(text)) => {
547 if streamed_text.len() < 2_000 {
548 streamed_text.push_str(&text);
549 }
550 }
551 Ok(LanguageModelCompletionEvent::Stop(reason)) => {
552 stop_reason = Some(reason);
553 }
554 Ok(LanguageModelCompletionEvent::ToolUseJsonParseError {
555 tool_name,
556 raw_input,
557 json_parse_error,
558 ..
559 }) if tool_name.as_ref() == EditFileTool::NAME => {
560 parse_errors.push(format!("{json_parse_error}\nRaw input:\n{raw_input:?}"));
561 }
562 Err(err) => {
563 return Err(anyhow::anyhow!("completion error: {}", err));
564 }
565 _ => {}
566 }
567 }
568
569 let streamed_text = streamed_text.trim();
570 let streamed_text_suffix = if streamed_text.is_empty() {
571 String::new()
572 } else {
573 format!("\nStreamed text:\n{streamed_text}")
574 };
575 let stop_reason_suffix = stop_reason
576 .map(|reason| format!("\nStop reason: {reason:?}"))
577 .unwrap_or_default();
578 let parse_errors_suffix = if parse_errors.is_empty() {
579 String::new()
580 } else {
581 format!("\nTool parse errors:\n{}", parse_errors.join("\n"))
582 };
583
584 anyhow::bail!(
585 "Stream ended without an edit_file tool use{stop_reason_suffix}{parse_errors_suffix}{streamed_text_suffix}"
586 )
587 }
588}
589
590fn run_eval(eval: EvalInput) -> eval_utils::EvalOutput<()> {
591 let dispatcher = gpui::TestDispatcher::new(rand::random());
592 let mut cx = TestAppContext::build(dispatcher, None);
593 let foreground_executor = cx.foreground_executor().clone();
594 let result = foreground_executor.block_test(async {
595 let test = StreamingEditToolTest::new(&mut cx).await;
596 let result = test.eval(eval, &mut cx).await;
597 drop(test);
598 cx.run_until_parked();
599 result
600 });
601 cx.quit();
602 match result {
603 Ok(output) => eval_utils::EvalOutput {
604 data: output.to_string(),
605 outcome: if output.assertion.score < 80 {
606 eval_utils::OutcomeKind::Failed
607 } else {
608 eval_utils::OutcomeKind::Passed
609 },
610 metadata: (),
611 },
612 Err(err) => eval_utils::EvalOutput {
613 data: format!("{err:?}"),
614 outcome: eval_utils::OutcomeKind::Error,
615 metadata: (),
616 },
617 }
618}
619
620fn message(
621 role: Role,
622 contents: impl IntoIterator<Item = MessageContent>,
623) -> LanguageModelRequestMessage {
624 LanguageModelRequestMessage {
625 role,
626 content: contents.into_iter().collect(),
627 cache: false,
628 reasoning_details: None,
629 }
630}
631
632fn text(text: impl Into<String>) -> MessageContent {
633 MessageContent::Text(text.into())
634}
635
/// Returns the lines of `input` whose zero-based index falls in `range`
/// (end exclusive), joined with `\n`.
///
/// Indices past the end of the input are ignored, and an empty or inverted
/// range yields an empty string. Line terminators follow `str::lines`, so a
/// trailing `\r` is not part of a line.
fn lines(input: &str, range: std::ops::Range<usize>) -> String {
    let mut selected = String::new();
    for (index, line) in input.lines().enumerate() {
        if index >= range.end {
            break;
        }
        if index < range.start {
            continue;
        }
        // Separate every selected line after the first one.
        if index > range.start {
            selected.push('\n');
        }
        selected.push_str(line);
    }
    selected
}
644
645fn tool_use(
646 id: impl Into<Arc<str>>,
647 name: impl Into<Arc<str>>,
648 input: impl Serialize,
649) -> MessageContent {
650 MessageContent::ToolUse(LanguageModelToolUse {
651 id: LanguageModelToolUseId::from(id.into()),
652 name: name.into(),
653 raw_input: serde_json::to_string_pretty(&input).unwrap(),
654 input: serde_json::to_value(input).unwrap(),
655 is_input_complete: true,
656 thought_signature: None,
657 })
658}
659
660fn tool_result(
661 id: impl Into<Arc<str>>,
662 name: impl Into<Arc<str>>,
663 result: impl Into<Arc<str>>,
664) -> MessageContent {
665 MessageContent::ToolResult(LanguageModelToolResult {
666 tool_use_id: LanguageModelToolUseId::from(id.into()),
667 tool_name: name.into(),
668 is_error: false,
669 content: LanguageModelToolResultContent::Text(result.into()),
670 output: None,
671 })
672}
673
/// Drops every line that is empty or whitespace-only and joins the remaining
/// lines with `\n`. Kept lines are not trimmed.
fn strip_empty_lines(text: &str) -> String {
    let mut kept = String::new();
    for line in text.lines() {
        if line.trim().is_empty() {
            continue;
        }
        // Kept lines are never empty, so `kept` is empty only before the first one.
        if !kept.is_empty() {
            kept.push('\n');
        }
        kept.push_str(line);
    }
    kept
}
680
681async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
682 const MAX_RETRIES: usize = 20;
683 let mut attempt = 0;
684
685 loop {
686 attempt += 1;
687 let response = request().await;
688
689 if attempt >= MAX_RETRIES {
690 return response;
691 }
692
693 let retry_delay = match &response {
694 Ok(_) => None,
695 Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
696 Some(err) => match &err {
697 LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
698 | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
699 Some(retry_after.unwrap_or(Duration::from_secs(5)))
700 }
701 LanguageModelCompletionError::UpstreamProviderError {
702 status,
703 retry_after,
704 ..
705 } => {
706 let should_retry = matches!(
707 *status,
708 StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
709 ) || status.as_u16() == 529;
710
711 if should_retry {
712 Some(retry_after.unwrap_or(Duration::from_secs(5)))
713 } else {
714 None
715 }
716 }
717 LanguageModelCompletionError::ApiReadResponseError { .. }
718 | LanguageModelCompletionError::ApiInternalServerError { .. }
719 | LanguageModelCompletionError::HttpSend { .. } => {
720 Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
721 }
722 _ => None,
723 },
724 _ => None,
725 },
726 };
727
728 if let Some(retry_after) = retry_delay {
729 let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
730 eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
731 #[allow(clippy::disallowed_methods)]
732 smol::Timer::after(retry_after + jitter).await;
733 } else {
734 return response;
735 }
736 }
737}
738
/// Eval: after reading the file, the model must delete the `run_git_blame`
/// function. Requires a 95% pass rate over 100 iterations.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_delete_function() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("fixtures/delete_run_git_blame/after.rs");

    // A result that additionally removes the two `GIT_BLAME_*` constant
    // definitions is accepted as well.
    let output_without_constants = output_file_content
        .replace(
            "const GIT_BLAME_NO_COMMIT_ERROR: &str = \"fatal: no such ref: HEAD\";\n",
            "",
        )
        .replace(
            "const GIT_BLAME_NO_PATH: &str = \"fatal: no such path\";\n",
            "",
        );
    let possible_diffs = vec![
        language::unified_diff(input_file_content, output_file_content),
        language::unified_diff(input_file_content, &output_without_constants),
    ];

    eval_utils::eval(100, 0.95, eval_utils::NoProcessor, move || {
        let user_request = text(indoc::formatdoc! {"
            Read the `{input_file_path}` file and delete `run_git_blame`. Just that
            one function, not its usages.
        "});
        let read_call = tool_use(
            "tool_1",
            ReadFileTool::NAME,
            ReadFileToolInput {
                path: input_file_path.into(),
                start_line: None,
                end_line: None,
            },
        );
        let read_output = tool_result("tool_1", ReadFileTool::NAME, input_file_content);

        let conversation = vec![
            message(User, [user_request]),
            message(Assistant, [read_call]),
            message(User, [read_output]),
        ];

        run_eval(EvalInput::new(
            conversation,
            input_file_path,
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
798
/// Eval: the model must extract a `handle_command_output` method out of
/// `run_git_blame`. Any of the nine fixture diffs counts as a correct result;
/// requires a 95% pass rate over 100 iterations.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_extract_handle_command_output() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("fixtures/extract_handle_command_output/before.rs");
    let possible_diffs = vec![
        include_str!("fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-07.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-08.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-09.diff"),
    ];

    eval_utils::eval(100, 0.95, eval_utils::NoProcessor, move || {
        let user_request = text(indoc::formatdoc! {"
            Read the `{input_file_path}` file and extract a method in
            the final stanza of `run_git_blame` to deal with command failures,
            call it `handle_command_output` and take the std::process::Output as the only parameter.
            Do not document the method and do not add any comments.

            Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
        "});
        let read_call = tool_use(
            "tool_1",
            ReadFileTool::NAME,
            ReadFileToolInput {
                path: input_file_path.into(),
                start_line: None,
                end_line: None,
            },
        );
        let read_output = tool_result("tool_1", ReadFileTool::NAME, input_file_content);

        let conversation = vec![
            message(User, [user_request]),
            message(Assistant, [read_call]),
            message(User, [read_output]),
        ];

        run_eval(EvalInput::new(
            conversation,
            input_file_path,
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
857
/// Eval: the model must translate the file's doc comments to Italian by
/// editing (not overwriting) it. Scored by the judge model; every one of the
/// 200 iterations must pass.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_translate_doc_comments() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("fixtures/translate_doc_comments/before.rs");

    eval_utils::eval(200, 1., eval_utils::NoProcessor, move || {
        let user_request = text(indoc::formatdoc! {"
            Read the `{input_file_path}` file and edit it (without overwriting it),
            translating all the doc comments to italian.
        "});
        let read_call = tool_use(
            "tool_1",
            ReadFileTool::NAME,
            ReadFileToolInput {
                path: input_file_path.into(),
                start_line: None,
                end_line: None,
            },
        );
        let read_output = tool_result("tool_1", ReadFileTool::NAME, input_file_content);

        let conversation = vec![
            message(User, [user_request]),
            message(Assistant, [read_call]),
            message(User, [read_output]),
        ];

        run_eval(EvalInput::new(
            conversation,
            input_file_path,
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ))
    });
}
901
/// Eval: after three partial reads of the file, the model must rewrite
/// `compile_parser_to_wasm` to use `wasi-sdk`. Scored by the judge model;
/// requires a 95% pass rate over 100 iterations.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");

    eval_utils::eval(100, 0.95, eval_utils::NoProcessor, move || {
        // NOTE(review): the two linux assets appear twice in the list below.
        // The prompt is kept verbatim so eval results stay comparable.
        let user_request = text(indoc::formatdoc! {"
            Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
            Use `ureq` to download the SDK for the current platform and architecture.
            Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
            Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
            that's inside of the archive.
            Don't re-download the SDK if that executable already exists.

            Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

            Here are the available wasi-sdk assets:
            - wasi-sdk-25.0-x86_64-macos.tar.gz
            - wasi-sdk-25.0-arm64-macos.tar.gz
            - wasi-sdk-25.0-x86_64-linux.tar.gz
            - wasi-sdk-25.0-arm64-linux.tar.gz
            - wasi-sdk-25.0-x86_64-linux.tar.gz
            - wasi-sdk-25.0-arm64-linux.tar.gz
            - wasi-sdk-25.0-x86_64-windows.tar.gz
        "});

        // Three consecutive partial reads of the file.
        let first_read = tool_use(
            "tool_1",
            ReadFileTool::NAME,
            ReadFileToolInput {
                path: input_file_path.into(),
                start_line: Some(971),
                end_line: Some(1050),
            },
        );
        let first_output = tool_result(
            "tool_1",
            ReadFileTool::NAME,
            lines(input_file_content, 971..1050),
        );
        let second_read = tool_use(
            "tool_2",
            ReadFileTool::NAME,
            ReadFileToolInput {
                path: input_file_path.into(),
                start_line: Some(1050),
                end_line: Some(1100),
            },
        );
        let second_output = tool_result(
            "tool_2",
            ReadFileTool::NAME,
            lines(input_file_content, 1050..1100),
        );
        let third_read = tool_use(
            "tool_3",
            ReadFileTool::NAME,
            ReadFileToolInput {
                path: input_file_path.into(),
                start_line: Some(1100),
                end_line: Some(1150),
            },
        );
        let third_output = tool_result(
            "tool_3",
            ReadFileTool::NAME,
            lines(input_file_content, 1100..1150),
        );

        let conversation = vec![
            message(User, [user_request]),
            message(Assistant, [first_read]),
            message(User, [first_output]),
            message(Assistant, [second_read]),
            message(User, [second_output]),
            message(Assistant, [third_read]),
            message(User, [third_output]),
        ];

        let judge_criteria = indoc::indoc! {"
            - The compile_parser_to_wasm method has been changed to use wasi-sdk
            - ureq is used to download the SDK for current platform and architecture
        "};

        run_eval(EvalInput::new(
            conversation,
            input_file_path,
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(judge_criteria),
        ))
    });
}
1004
/// Eval: given a simulated grep over the editor source, the model must
/// comment out the code that interacts with the `BlinkManager`. Any of the
/// four fixture diffs counts as correct; requires a 51% pass rate over 100
/// iterations.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_disable_cursor_blinking() {
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("fixtures/disable_cursor_blinking/before.rs");
    let possible_diffs = vec![
        include_str!("fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("fixtures/disable_cursor_blinking/possible-04.diff"),
    ];

    eval_utils::eval(100, 0.51, eval_utils::NoProcessor, move || {
        let grep_call = tool_use(
            "tool_1",
            GrepTool::NAME,
            GrepToolInput {
                regex: "blink".into(),
                include_pattern: None,
                offset: 0,
                case_sensitive: false,
            },
        );

        // The grep result is simulated with fixed excerpts of the input file.
        let grep_excerpts = [
            lines(input_file_content, 100..400),
            lines(input_file_content, 800..1300),
            lines(input_file_content, 1600..2000),
            lines(input_file_content, 5000..5500),
            lines(input_file_content, 8000..9000),
            lines(input_file_content, 18455..18470),
            lines(input_file_content, 20000..20500),
            lines(input_file_content, 21000..21300),
        ];
        let grep_output = tool_result(
            "tool_1",
            GrepTool::NAME,
            grep_excerpts.join("Match found:\n\n"),
        );

        let edit_request = text(indoc::indoc! {"
            Comment out the lines that interact with the BlinkManager.
            Keep the outer `update` blocks, but comments everything that's inside (including if statements).
            Don't add additional comments.
        "});

        let conversation = vec![
            message(User, [text("Let's research how to cursor blinking works.")]),
            message(Assistant, [grep_call]),
            message(User, [grep_output]),
            message(User, [edit_request]),
        ];

        run_eval(EvalInput::new(
            conversation,
            input_file_path,
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
1067
/// Eval: ask for a new `from_pixels` constructor in `Canvas`, with tests in the same file.
///
/// The scripted conversation replays an assistant that reads the input file and then
/// issues three grep calls before the edit is attempted: two searches for `mod\s+tests`
/// that return "No matches found", and one search for `#[test]` that returns matches
/// from other files. The resulting diff is passed to `EvalAssertion::judge_diff` with
/// two assertions (a new constructor exists, and new tests for it exist).
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_from_pixels_constructor() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("fixtures/from_pixels_constructor/before.rs");

    // NOTE(review): `100` and `0.95` are presumably the iteration count and the required
    // pass ratio — confirm against `eval_utils::eval`.
    eval_utils::eval(100, 0.95, eval_utils::NoProcessor, move || {
        run_eval(EvalInput::new(
            vec![
                // The user's request.
                message(
                    User,
                    [text(indoc::indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                // tool_1: read the input file with no line range.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
                // tool_2: grep for a tests module, restricted to the canvas file.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_2", GrepTool::NAME, "No matches found")],
                ),
                // tool_3: same regex, widened to every Rust file under `font-kit/src`.
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_3", GrepTool::NAME, "No matches found")],
                ),
                // tool_4: grep for `#[test]` attributes instead; this one returns matches.
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        GrepTool::NAME,
                        indoc::indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
            ],
            input_file_path,
            Some(input_file_content.into()),
            // The diff is judged against these two natural-language assertions.
            EvalAssertion::judge_diff(indoc::indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ))
    });
}
1255
/// Eval: create `root/zode.py` from the prompt in `fixtures/zode/prompt.md`.
///
/// The scripted conversation has the assistant read `root/eval/react.py` and
/// `root/eval/react_test.py` in a single turn, with the fixture files returned as the
/// tool results. No initial content is supplied for the target file.
///
/// The assertion inspects only the beginning of `sample.text_after`: a sample that
/// starts with a space, a backtick, or a newline scores 0 and carries a message naming
/// the offending character; any other sample scores 100.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_zode() {
    let input_file_path = "root/zode.py";
    let input_content = None;

    // NOTE(review): `50` and `1.` are presumably the iteration count and the required
    // pass ratio — confirm against `eval_utils::eval`.
    eval_utils::eval(50, 1., eval_utils::NoProcessor, move || {
        // Assistant turn: request both reference files at once, without line ranges.
        let read_requests = message(
            Assistant,
            [
                tool_use(
                    "tool_1",
                    ReadFileTool::NAME,
                    ReadFileToolInput {
                        path: "root/eval/react.py".into(),
                        start_line: None,
                        end_line: None,
                    },
                ),
                tool_use(
                    "tool_2",
                    ReadFileTool::NAME,
                    ReadFileToolInput {
                        path: "root/eval/react_test.py".into(),
                        start_line: None,
                        end_line: None,
                    },
                ),
            ],
        );

        // User turn: the fixture contents answer the two reads.
        let read_results = message(
            User,
            [
                tool_result(
                    "tool_1",
                    ReadFileTool::NAME,
                    include_str!("fixtures/zode/react.py"),
                ),
                tool_result(
                    "tool_2",
                    ReadFileTool::NAME,
                    include_str!("fixtures/zode/react_test.py"),
                ),
            ],
        );

        let conversation = vec![
            message(User, [text(include_str!("fixtures/zode/prompt.md"))]),
            read_requests,
            read_results,
        ];

        let no_invalid_start = EvalAssertion::new(async move |sample, _, _cx| {
            // Only the first matching character is reported, in this fixed order.
            let offending_start = [' ', '`', '\n']
                .into_iter()
                .find(|&start| sample.text_after.starts_with(start));

            Ok(match offending_start {
                Some(start) => EvalAssertionOutcome {
                    score: 0,
                    message: Some(format!("The sample starts with a {:?}", start)),
                },
                None => EvalAssertionOutcome {
                    score: 100,
                    message: None,
                },
            })
        });

        run_eval(EvalInput::new(
            conversation,
            input_file_path,
            input_content.clone(),
            no_invalid_start,
        ))
    });
}
1333
/// Eval: add a test to `action_log.rs` that covers overwriting an existing file.
///
/// The scripted conversation replays an assistant that first reads the whole file and
/// receives a symbol outline with line ranges, then reads three line ranges of the
/// file (953-1010, 1012-1120 and 271-276) before the edit is attempted. The resulting
/// diff is passed to `EvalAssertion::judge_diff` with a single assertion: a new test
/// for overwritten files was created without changing any previous test.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_add_overwrite_test() {
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("fixtures/add_overwrite_test/before.rs");

    // NOTE(review): `200` and `0.5` are presumably the iteration count and the required
    // pass ratio — confirm against `eval_utils::eval`.
    eval_utils::eval(200, 0.5, eval_utils::NoProcessor, move || {
        run_eval(EvalInput::new(
            vec![
                // The user's request.
                message(
                    User,
                    [text(indoc::indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                // tool_1: read the input file with no line range.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                // The scripted result is a symbol outline with line ranges, not the file text.
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        indoc::indoc! {"
                            pub struct ActionLog [L13-20]
                             tracked_buffers [L15]
                             edited_since_project_diagnostics_check [L17]
                             project [L19]
                            impl ActionLog [L22-498]
                             pub fn new [L24-30]
                             pub fn project [L32-34]
                             pub fn checked_project_diagnostics [L37-39]
                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                             fn track_buffer_internal [L46-101]
                             fn handle_buffer_event [L103-116]
                             fn handle_buffer_edited [L118-123]
                             fn handle_buffer_file_changed [L125-158]
                             async fn maintain_diff [L160-264]
                             pub fn buffer_read [L267-269]
                             pub fn buffer_created [L272-276]
                             pub fn buffer_edited [L279-287]
                             pub fn will_delete_buffer [L289-304]
                             pub fn keep_edits_in_range [L306-364]
                             pub fn reject_edits_in_ranges [L366-459]
                             pub fn keep_all_edits [L461-473]
                             pub fn changed_buffers [L476-482]
                             pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                             User [L618]
                             Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                             Created [L624]
                             Modified [L625]
                             Deleted [L626]
                            struct TrackedBuffer [L629-641]
                             buffer [L630]
                             base_text [L631]
                             unreviewed_changes [L632]
                             status [L633]
                             version [L634]
                             diff [L635]
                             snapshot [L636]
                             diff_update [L637]
                             _open_lsp_handle [L638]
                             _maintain_diff [L639]
                             _subscription [L640]
                            impl TrackedBuffer [L643-657]
                             fn has_changes [L644-650]
                             fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                             pub diff [L660]
                            mod tests [L664-1574]
                             fn init_logger [L678-682]
                             fn init_test [L684-691]
                             async fn test_keep_edits [L694-769]
                             async fn test_deletions [L772-854]
                             async fn test_overlapping_user_edits [L857-951]
                             async fn test_creating_files [L954-1010]
                             async fn test_deleting_files [L1013-1120]
                             async fn test_reject_edits [L1123-1255]
                             async fn test_reject_multiple_edits [L1258-1331]
                             async fn test_reject_deleted_file [L1334-1388]
                             async fn test_reject_created_file [L1391-1443]
                             async fn test_random_diffs [L1446-1535]
                              fn quiesce [L1510-1534]
                             struct HunkStatus [L1538-1542]
                              range [L1539]
                              diff_status [L1540]
                              old_text [L1541]
                             fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                // tool_2: read lines 953-1010 of the input file.
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                // NOTE(review): `lines` presumably slices the fixture to the given line
                // range — confirm its indexing and inclusivity against the helper.
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        ReadFileTool::NAME,
                        lines(input_file_content, 953..1010),
                    )],
                ),
                // tool_3: read lines 1012-1120 of the input file.
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        ReadFileTool::NAME,
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                // tool_4: read lines 271-276, which the outline lists as `buffer_created`.
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        ReadFileTool::NAME,
                        lines(input_file_content, 271..276),
                    )],
                ),
            ],
            input_file_path,
            Some(input_file_content.into()),
            // The diff is judged against this single natural-language assertion.
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ))
    });
}
1528
/// Eval: create a second, empty todo file at `root/TODO3`.
///
/// The scripted conversation has the assistant list the `root` directory, with the
/// listing returned as the tool result. No initial content is supplied for the target
/// file, and the outcome is checked with `EvalAssertion::assert_eq` against an empty
/// string.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_create_empty_file() {
    let input_file_path = "root/TODO3";
    let input_file_content = None;
    let expected_output_content = String::new();

    // NOTE(review): `100` and `0.99` are presumably the iteration count and the required
    // pass ratio — confirm against `eval_utils::eval`.
    eval_utils::eval(100, 0.99, eval_utils::NoProcessor, move || {
        // One id ties the directory-listing call to its result.
        let list_call_id = "toolu_01GAF8TtsgpjKxCr8fgQLDgR";

        let user_request = message(User, [text("Create a second empty todo file ")]);

        let assistant_lists_root = message(
            Assistant,
            [
                text(indoc::formatdoc! {"
                    I'll help you create a second empty todo file.
                    First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                "}),
                tool_use(
                    list_call_id,
                    ListDirectoryTool::NAME,
                    ListDirectoryToolInput {
                        path: String::from("root"),
                    },
                ),
            ],
        );

        let listing_result = message(
            User,
            [tool_result(
                list_call_id,
                ListDirectoryTool::NAME,
                "root/TODO\nroot/TODO2\nroot/new.txt\n",
            )],
        );

        run_eval(EvalInput::new(
            vec![user_request, assistant_lists_root, listing_result],
            input_file_path,
            input_file_content.clone(),
            EvalAssertion::assert_eq(expected_output_content.clone()),
        ))
    });
}