1use crate::tools::streaming_edit_file_tool::*;
2use crate::{
3 AgentTool, ContextServerRegistry, EditFileTool, GrepTool, GrepToolInput, ListDirectoryTool,
4 ListDirectoryToolInput, ReadFileTool, ReadFileToolInput, StreamingEditFileTool, Template,
5 Templates, Thread, ToolCallEventStream, ToolInput,
6};
7use Role::*;
8use anyhow::{Context as _, Result};
9use client::{Client, UserStore};
10use fs::FakeFs;
11use futures::{FutureExt, StreamExt, future::LocalBoxFuture};
12use gpui::{AppContext as _, AsyncApp, Entity, TestAppContext, UpdateGlobal as _};
13use http_client::StatusCode;
14use language::language_settings::FormatOnSave;
15use language_model::{
16 LanguageModel, LanguageModelCompletionError, LanguageModelCompletionEvent,
17 LanguageModelRegistry, LanguageModelRequest, LanguageModelRequestMessage,
18 LanguageModelRequestTool, LanguageModelToolResult, LanguageModelToolResultContent,
19 LanguageModelToolSchemaFormat, LanguageModelToolUse, LanguageModelToolUseId, MessageContent,
20 Role, SelectedModel,
21};
22use project::Project;
23use prompt_store::{ProjectContext, WorktreeContext};
24use rand::prelude::*;
25use reqwest_client::ReqwestClient;
26use serde::Serialize;
27use serde_json::json;
28use settings::SettingsStore;
29use std::{
30 fmt::{self, Display},
31 path::{Path, PathBuf},
32 str::FromStr,
33 sync::Arc,
34 time::Duration,
35};
36use util::path;
37
/// Template variables for the LLM "diff judge" prompt (`diff_judge.hbs`).
#[derive(Serialize)]
struct DiffJudgeTemplate {
    // Unified diff produced by the edit under evaluation.
    diff: String,
    // Natural-language assertions the judge scores the diff against.
    assertions: &'static str,
}
43
impl Template for DiffJudgeTemplate {
    // Handlebars template file rendered via `Templates` in `judge_diff`.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
47
/// One eval case: a canned conversation replayed to the model, the file it
/// should edit, and the assertion used to grade the resulting edit.
#[derive(Clone)]
struct EvalInput {
    // Messages sent to the model; a system prompt is prepended in `eval` if
    // the first message isn't already a system message.
    conversation: Vec<LanguageModelRequestMessage>,
    // Worktree-relative path of the file under edit (e.g. "root/blame.rs").
    input_file_path: PathBuf,
    // Initial file contents; `None` means the file does not exist beforehand.
    input_content: Option<String>,
    assertion: EvalAssertion,
}
55
56impl EvalInput {
57 fn new(
58 conversation: Vec<LanguageModelRequestMessage>,
59 input_file_path: impl Into<PathBuf>,
60 input_content: Option<String>,
61 assertion: EvalAssertion,
62 ) -> Self {
63 EvalInput {
64 conversation,
65 input_file_path: input_file_path.into(),
66 input_content,
67 assertion,
68 }
69 }
70}
71
/// Materialized outcome of a single eval run, handed to assertions.
#[derive(Clone)]
struct EvalSample {
    // File contents before the edit (empty if the file did not exist).
    text_before: String,
    // File contents after the tool applied the edit.
    text_after: String,
    // Parsed input the model supplied to the streaming edit tool.
    tool_input: StreamingEditFileToolInput,
    // Unified diff from `text_before` to `text_after`.
    diff: String,
}
79
/// Object-safe assertion interface, implemented for async closures below so
/// assertions can be stored behind `Arc<dyn AssertionFn>` in `EvalAssertion`.
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
88
// Blanket impl: any async closure with the matching signature is an
// `AssertionFn`. The future is boxed with `boxed_local` (it need not be
// `Send`).
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}
109
/// Cloneable handle to an assertion closure; built via the constructors below.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
112
impl EvalAssertion {
    /// Wraps an async closure as an `EvalAssertion`.
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    /// Scores 100 if the edited text equals `expected` after stripping blank
    /// lines; 0 otherwise. Never consults the judge model.
    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    /// Scores 100 if applying ANY of `expected_diffs` to the original text
    /// reproduces the edited text (blank lines ignored); 0 otherwise.
    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                language::apply_diff_patch(&sample.text_before, possible_diff)
                    .map(|expected| {
                        strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
                    })
                    // A diff that fails to apply simply doesn't match.
                    .unwrap_or(false)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    /// Asks the judge model to grade the sample's diff against natural
    /// language `assertions`, expecting a `<score>N</score>` tag in the reply.
    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .context("Failed to render diff judge template")?;

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                    reasoning_details: None,
                }],
                thinking_allowed: true,
                thinking_effort: judge
                    .default_effort_level()
                    .map(|effort_level| effort_level.value.to_string()),
                ..Default::default()
            };
            // Retry transient rate-limit/overload errors from the judge call.
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            // Accumulate the full streamed response before parsing the score.
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            let re = regex::Regex::new(r"<score>(\d+)</score>")
                .context("Failed to compile score regex")?;
            if let Some(captures) = re.captures(&output)
                && let Some(score_match) = captures.get(1)
            {
                // Unparseable (e.g. overflowing) digits fall back to 0.
                let score = score_match.as_str().parse().unwrap_or(0);
                return Ok(EvalAssertionOutcome {
                    score,
                    message: Some(output),
                });
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    /// Runs the underlying assertion closure against `input`.
    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}
219
/// Result of one eval run: the sample plus its graded outcome.
#[derive(Clone)]
struct StreamingEditEvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}
225
226impl Display for StreamingEditEvalOutput {
227 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
228 writeln!(f, "Score: {:?}", self.assertion.score)?;
229 if let Some(message) = self.assertion.message.as_ref() {
230 writeln!(f, "Message: {}", message)?;
231 }
232 writeln!(f, "Diff:\n{}", self.sample.diff)?;
233 writeln!(f, "Tool Input:\n{:#?}", self.sample.tool_input)?;
234 Ok(())
235 }
236}
237
/// Assertion grade: a 0-100 score plus optional judge commentary.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}
243
/// Shared harness state for streaming-edit-tool evals.
struct StreamingEditToolTest {
    fs: Arc<FakeFs>,
    project: Entity<Project>,
    // Model under test (selected via ZED_AGENT_MODEL).
    model: Arc<dyn LanguageModel>,
    // Model that grades diffs (selected via ZED_JUDGE_MODEL).
    judge_model: Arc<dyn LanguageModel>,
    // Default thinking-effort string for `model`, if it defines one.
    model_thinking_effort: Option<String>,
}
251
impl StreamingEditToolTest {
    /// Boots a headless test app: fake fs, test settings, production client,
    /// and the agent + judge models resolved from env vars.
    async fn new(cx: &mut TestAppContext) -> Self {
        // Real network calls happen below, so the test executor must be
        // allowed to park.
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor());
        cx.update(|cx| {
            let settings_store = SettingsStore::test(cx);
            cx.set_global(settings_store);
            // Disable save-time mutations so the saved text is exactly what
            // the tool wrote.
            SettingsStore::update_global(cx, |store: &mut SettingsStore, cx| {
                store.update_user_settings(cx, |settings| {
                    settings
                        .project
                        .all_languages
                        .defaults
                        .ensure_final_newline_on_save = Some(false);
                    settings.project.all_languages.defaults.format_on_save =
                        Some(FormatOnSave::Off);
                });
            });

            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
            language_model::init(user_store.clone(), client.clone(), cx);
            language_models::init(user_store, client, cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        // Agent and judge model selections, overridable via env vars.
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-sonnet-4-6-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-sonnet-4-6-latest".into()),
        )
        .unwrap();

        // Kick off authentication for every provider before loading models.
        let authenticate_provider_tasks = cx.update(|cx| {
            LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
                registry
                    .providers()
                    .iter()
                    .map(|p| p.authenticate(cx))
                    .collect::<Vec<_>>()
            })
        });
        let (model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    futures::future::join_all(authenticate_provider_tasks).await;
                    let model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (model.unwrap(), judge_model.unwrap())
                })
            })
            .await;

        let model_thinking_effort = model
            .default_effort_level()
            .map(|effort_level| effort_level.value.to_string());

        Self {
            fs,
            project,
            model,
            judge_model,
            model_thinking_effort,
        }
    }

    /// Resolves `selected_model` from the global registry, authenticating its
    /// provider first. Panics if the provider or model is not found.
    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        cx.update(|cx| {
            let registry = LanguageModelRegistry::read_global(cx);
            let provider = registry
                .provider(&selected_model.provider)
                .expect("Provider not found");
            provider.authenticate(cx)
        })
        .await?;
        Ok(cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .unwrap_or_else(|| panic!("Model {} not found", selected_model.model.0))
        }))
    }

    /// Build the tool definitions for the model, replacing `edit_file` with the
    /// streaming edit file tool schema. In production the streaming tool is
    /// exposed under the name `"edit_file"` (see `Thread::enabled_tools`), so
    /// the model has never seen the name `"streaming_edit_file"`.
    fn build_tools() -> Vec<LanguageModelRequestTool> {
        let mut tools: Vec<LanguageModelRequestTool> = crate::built_in_tools()
            .filter(|tool| tool.name != EditFileTool::NAME)
            .collect();
        tools.push(LanguageModelRequestTool {
            name: EditFileTool::NAME.to_string(),
            description: StreamingEditFileTool::description().to_string(),
            input_schema: StreamingEditFileTool::input_schema(
                LanguageModelToolSchemaFormat::JsonSchema,
            )
            .to_value(),
            use_input_streaming: StreamingEditFileTool::supports_input_streaming(),
        });
        tools
    }

    /// Runs one eval: replays the conversation, captures the model's edit
    /// tool call, executes it through `StreamingEditFileTool`, and grades the
    /// resulting file with the eval's assertion.
    async fn eval(
        &self,
        mut eval: EvalInput,
        cx: &mut TestAppContext,
    ) -> Result<StreamingEditEvalOutput> {
        // Mark the final message as a cache anchor.
        eval.conversation
            .last_mut()
            .context("Conversation must not be empty")?
            .cache = true;

        // Populate the FakeFs so `resolve_path` / `entry_for_path` can find
        // the file in the worktree.
        if let Some(input_content) = eval.input_content.as_deref() {
            let abs_path = Path::new("/root").join(
                eval.input_file_path
                    .strip_prefix("root")
                    .unwrap_or(&eval.input_file_path),
            );
            self.fs.insert_file(&abs_path, input_content.into()).await;

            // Wait for the worktree to pick up the new file.
            cx.run_until_parked();
        }

        let tools = Self::build_tools();

        // Render the production system prompt against a minimal project
        // context so the model behaves as it would in the real agent.
        let system_prompt = {
            let worktrees = vec![WorktreeContext {
                root_name: "root".to_string(),
                abs_path: Path::new("/path/to/root").into(),
                rules_file: None,
            }];
            let project_context = ProjectContext::new(worktrees, Vec::default());
            let tool_names = tools
                .iter()
                .map(|tool| tool.name.clone().into())
                .collect::<Vec<_>>();
            let template = crate::SystemPromptTemplate {
                project: &project_context,
                available_tools: tool_names,
                model_name: None,
            };
            let templates = Templates::new();
            template.render(&templates)?
        };

        // Prepend the system prompt unless the conversation already has one.
        let has_system_prompt = eval
            .conversation
            .first()
            .is_some_and(|msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
                reasoning_details: None,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let request = LanguageModelRequest {
            messages,
            tools,
            thinking_allowed: true,
            thinking_effort: self.model_thinking_effort.clone(),
            ..Default::default()
        };

        // The model will call the tool as "edit_file" (the production-visible
        // name), but the schema is from StreamingEditFileTool.
        let tool_input =
            retry_on_rate_limit(async || self.extract_tool_use(request.clone(), cx).await).await?;

        let language_registry = self
            .project
            .read_with(cx, |project, _cx| project.languages().clone());

        let context_server_registry = cx
            .new(|cx| ContextServerRegistry::new(self.project.read(cx).context_server_store(), cx));
        let thread = cx.new(|cx| {
            Thread::new(
                self.project.clone(),
                cx.new(|_cx| ProjectContext::default()),
                context_server_registry,
                Templates::new(),
                Some(self.model.clone()),
                cx,
            )
        });
        let action_log = thread.read_with(cx, |thread, _| thread.action_log().clone());

        let tool = Arc::new(StreamingEditFileTool::new(
            self.project.clone(),
            thread.downgrade(),
            action_log,
            language_registry,
        ));

        // Execute the captured tool call against the fake project.
        let result = cx
            .update(|cx| {
                tool.clone().run(
                    ToolInput::resolved(tool_input.clone()),
                    ToolCallEventStream::test().0,
                    cx,
                )
            })
            .await;

        let output = match result {
            Ok(output) => output,
            Err(output) => {
                anyhow::bail!("Tool returned error: {}", output);
            }
        };

        let StreamingEditFileToolOutput::Success { new_text, .. } = &output else {
            anyhow::bail!("Tool returned error output: {}", output);
        };

        let sample = EvalSample {
            tool_input,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                new_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: new_text.clone(),
        };

        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(StreamingEditEvalOutput { assertion, sample })
    }

    /// Stream the model completion and extract the first complete tool use
    /// whose name matches `EditFileTool::NAME` (the production-visible name
    /// for the streaming edit tool), parsed as `StreamingEditFileToolInput`.
    async fn extract_tool_use(
        &self,
        request: LanguageModelRequest,
        cx: &mut TestAppContext,
    ) -> Result<StreamingEditFileToolInput> {
        let model = self.model.clone();
        let events = cx
            .update(|cx| {
                let async_cx = cx.to_async();
                cx.foreground_executor()
                    .spawn(async move { model.stream_completion(request, &async_cx).await })
            })
            .await
            .map_err(|err| anyhow::anyhow!("completion error: {}", err))?;

        // Diagnostics collected in case the stream ends without a tool use.
        let mut streamed_text = String::new();
        let mut stop_reason = None;
        let mut parse_errors = Vec::new();

        let mut events = events.fuse();
        while let Some(event) = events.next().await {
            match event {
                Ok(LanguageModelCompletionEvent::ToolUse(tool_use))
                    if tool_use.is_input_complete
                        && tool_use.name.as_ref() == EditFileTool::NAME =>
                {
                    let input: StreamingEditFileToolInput = serde_json::from_value(tool_use.input)
                        .context("Failed to parse tool input as StreamingEditFileToolInput")?;
                    return Ok(input);
                }
                Ok(LanguageModelCompletionEvent::Text(text)) => {
                    // Keep only a bounded prefix of streamed text for the
                    // error message below.
                    if streamed_text.len() < 2_000 {
                        streamed_text.push_str(&text);
                    }
                }
                Ok(LanguageModelCompletionEvent::Stop(reason)) => {
                    stop_reason = Some(reason);
                }
                Ok(LanguageModelCompletionEvent::ToolUseJsonParseError {
                    tool_name,
                    raw_input,
                    json_parse_error,
                    ..
                }) if tool_name.as_ref() == EditFileTool::NAME => {
                    parse_errors.push(format!("{json_parse_error}\nRaw input:\n{raw_input:?}"));
                }
                Err(err) => {
                    return Err(anyhow::anyhow!("completion error: {}", err));
                }
                _ => {}
            }
        }

        // No tool use arrived: report whatever we saw to aid debugging.
        let streamed_text = streamed_text.trim();
        let streamed_text_suffix = if streamed_text.is_empty() {
            String::new()
        } else {
            format!("\nStreamed text:\n{streamed_text}")
        };
        let stop_reason_suffix = stop_reason
            .map(|reason| format!("\nStop reason: {reason:?}"))
            .unwrap_or_default();
        let parse_errors_suffix = if parse_errors.is_empty() {
            String::new()
        } else {
            format!("\nTool parse errors:\n{}", parse_errors.join("\n"))
        };

        anyhow::bail!(
            "Stream ended without an edit_file tool use{stop_reason_suffix}{parse_errors_suffix}{streamed_text_suffix}"
        )
    }
}
588
589fn run_eval(eval: EvalInput) -> eval_utils::EvalOutput<()> {
590 let dispatcher = gpui::TestDispatcher::new(rand::random());
591 let mut cx = TestAppContext::build(dispatcher, None);
592 let foreground_executor = cx.foreground_executor().clone();
593 let result = foreground_executor.block_test(async {
594 let test = StreamingEditToolTest::new(&mut cx).await;
595 let result = test.eval(eval, &mut cx).await;
596 drop(test);
597 cx.run_until_parked();
598 result
599 });
600 cx.quit();
601 match result {
602 Ok(output) => eval_utils::EvalOutput {
603 data: output.to_string(),
604 outcome: if output.assertion.score < 80 {
605 eval_utils::OutcomeKind::Failed
606 } else {
607 eval_utils::OutcomeKind::Passed
608 },
609 metadata: (),
610 },
611 Err(err) => eval_utils::EvalOutput {
612 data: format!("{err:?}"),
613 outcome: eval_utils::OutcomeKind::Error,
614 metadata: (),
615 },
616 }
617}
618
/// Builds a non-cached request message with the given role and contents.
fn message(
    role: Role,
    contents: impl IntoIterator<Item = MessageContent>,
) -> LanguageModelRequestMessage {
    LanguageModelRequestMessage {
        role,
        content: contents.into_iter().collect(),
        cache: false,
        reasoning_details: None,
    }
}
630
/// Shorthand for a plain-text message content item.
fn text(text: impl Into<String>) -> MessageContent {
    MessageContent::Text(text.into())
}
634
/// Returns the 0-indexed lines of `input` whose index falls in `range`,
/// joined with newlines (without a trailing newline).
fn lines(input: &str, range: std::ops::Range<usize>) -> String {
    input
        .lines()
        .enumerate()
        .filter(|(index, _)| range.contains(index))
        .map(|(_, line)| line)
        .collect::<Vec<_>>()
        .join("\n")
}
643
/// Builds a completed tool-use content item; `raw_input` is the
/// pretty-printed JSON of `input`.
fn tool_use(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    input: impl Serialize,
) -> MessageContent {
    MessageContent::ToolUse(LanguageModelToolUse {
        id: LanguageModelToolUseId::from(id.into()),
        name: name.into(),
        raw_input: serde_json::to_string_pretty(&input).unwrap(),
        input: serde_json::to_value(input).unwrap(),
        is_input_complete: true,
        thought_signature: None,
    })
}
658
/// Builds a successful (non-error) text tool result for the given call id.
fn tool_result(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    result: impl Into<Arc<str>>,
) -> MessageContent {
    MessageContent::ToolResult(LanguageModelToolResult {
        tool_use_id: LanguageModelToolUseId::from(id.into()),
        tool_name: name.into(),
        is_error: false,
        content: LanguageModelToolResultContent::Text(result.into()),
        output: None,
    })
}
672
/// Removes lines that are empty or whitespace-only, joining the remainder
/// with newlines (no trailing newline).
fn strip_empty_lines(text: &str) -> String {
    let mut kept = Vec::new();
    for line in text.lines() {
        if !line.trim().is_empty() {
            kept.push(line);
        }
    }
    kept.join("\n")
}
679
/// Calls `request` repeatedly (up to `MAX_RETRIES` attempts), sleeping and
/// retrying on rate-limit, overload, and transient transport errors; any
/// other outcome (success or non-retryable error) is returned immediately.
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
    const MAX_RETRIES: usize = 20;
    let mut attempt = 0;

    loop {
        attempt += 1;
        let response = request().await;

        // Out of retries: return whatever we got, success or failure.
        if attempt >= MAX_RETRIES {
            return response;
        }

        let retry_delay = match &response {
            Ok(_) => None,
            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
                Some(err) => match &err {
                    // Honor the provider's retry hint, defaulting to 5s.
                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
                    }
                    LanguageModelCompletionError::UpstreamProviderError {
                        status,
                        retry_after,
                        ..
                    } => {
                        // 429, 503, and the non-standard 529 are retryable.
                        let should_retry = matches!(
                            *status,
                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
                        ) || status.as_u16() == 529;

                        if should_retry {
                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
                        } else {
                            None
                        }
                    }
                    // Transient transport/server errors: exponential backoff
                    // (2^(attempt-1) seconds) capped at 30s.
                    LanguageModelCompletionError::ApiReadResponseError { .. }
                    | LanguageModelCompletionError::ApiInternalServerError { .. }
                    | LanguageModelCompletionError::HttpSend { .. } => {
                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
                    }
                    _ => None,
                },
                _ => None,
            },
        };

        if let Some(retry_after) = retry_delay {
            // Add up to 100% jitter so concurrent retries don't synchronize.
            let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
            #[allow(clippy::disallowed_methods)]
            smol::Timer::after(retry_after + jitter).await;
        } else {
            return response;
        }
    }
}
737
/// Eval: ask the model to delete a single function (`run_git_blame`) from a
/// Rust file; pass when the resulting diff matches one of the accepted diffs.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_delete_function() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("fixtures/delete_run_git_blame/after.rs");
    // Accept either the exact expected result, or a variant where two related
    // constants are removed along with the function.
    let possible_diffs = vec![
        language::unified_diff(input_file_content, output_file_content),
        language::unified_diff(
            input_file_content,
            &output_file_content
                .replace(
                    "const GIT_BLAME_NO_COMMIT_ERROR: &str = \"fatal: no such ref: HEAD\";\n",
                    "",
                )
                .replace(
                    "const GIT_BLAME_NO_PATH: &str = \"fatal: no such path\";\n",
                    "",
                ),
        ),
    ];

    eval_utils::eval(100, 0.95, eval_utils::NoProcessor, move || {
        run_eval(EvalInput::new(
            vec![
                message(
                    User,
                    [text(indoc::formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                // Simulated read_file round-trip so the model has the file
                // contents in context before editing.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
            ],
            input_file_path,
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
797
/// Eval: ask the model to extract a `handle_command_output` method from
/// `run_git_blame`; pass when the diff matches one of seven accepted variants.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_extract_handle_command_output() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("fixtures/extract_handle_command_output/before.rs");
    let possible_diffs = vec![
        include_str!("fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-07.diff"),
    ];

    eval_utils::eval(100, 0.95, eval_utils::NoProcessor, move || {
        run_eval(EvalInput::new(
            vec![
                message(
                    User,
                    [text(indoc::formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                // Simulated read_file round-trip before the edit.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
            ],
            input_file_path,
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
854
/// Eval: ask the model to translate all doc comments in a file to Italian;
/// graded by the judge model rather than an exact diff.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_translate_doc_comments() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("fixtures/translate_doc_comments/before.rs");

    eval_utils::eval(200, 1., eval_utils::NoProcessor, move || {
        run_eval(EvalInput::new(
            vec![
                message(
                    User,
                    [text(indoc::formatdoc! {"
                        Read the `{input_file_path}` file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                // Simulated read_file round-trip before the edit.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
            ],
            input_file_path,
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ))
    });
}
898
/// Eval: a multi-read conversation (three windowed `read_file` calls) followed
/// by a substantial refactor request; graded by the judge model.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");

    eval_utils::eval(100, 0.95, eval_utils::NoProcessor, move || {
        run_eval(EvalInput::new(
            vec![
                message(
                    User,
                    [text(indoc::formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                // Three simulated partial reads; each tool result returns the
                // corresponding 0-indexed slice of the fixture.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        ReadFileTool::NAME,
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        ReadFileTool::NAME,
                        lines(input_file_content, 1100..1150),
                    )],
                ),
            ],
            input_file_path,
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc::indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ))
    });
}
1001
/// Eval: after a simulated grep over a very large file, ask the model to
/// comment out BlinkManager interactions; pass when the diff matches one of
/// four accepted variants. Note the lower pass-rate threshold (0.51).
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_disable_cursor_blinking() {
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("fixtures/disable_cursor_blinking/before.rs");
    let possible_diffs = vec![
        include_str!("fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("fixtures/disable_cursor_blinking/possible-04.diff"),
    ];

    eval_utils::eval(100, 0.51, eval_utils::NoProcessor, move || {
        run_eval(EvalInput::new(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                // Simulated grep for "blink" whose result stitches together
                // several 0-indexed slices of the fixture.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        GrepTool::NAME,
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc::indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
            ],
            input_file_path,
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
1064
// Eval: the model is asked to add a `from_pixels` constructor to `Canvas`
// and tests for it in the same file. The conversation below is a recorded
// fixture: the agent reads the file, then issues three grep probes looking
// for an existing test module before it must produce the edit. The
// resulting diff is judged against the two assertions at the bottom.
// NOTE(review): the `100` / `0.95` arguments to `eval_utils::eval` look
// like the run count and required pass ratio — confirm against `eval_utils`.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_from_pixels_constructor() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("fixtures/from_pixels_constructor/before.rs");

    eval_utils::eval(100, 0.95, eval_utils::NoProcessor, move || {
        run_eval(EvalInput::new(
            vec![
                // The user's request.
                message(
                    User,
                    [text(indoc::indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                // The agent reads the entire input file first.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
                // Grep probe 1: `mod tests` in canvas.rs only — no matches.
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_2", GrepTool::NAME, "No matches found")],
                ),
                // Grep probe 2: widen the glob to the whole crate — still nothing.
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_3", GrepTool::NAME, "No matches found")],
                ),
                // Grep probe 3: search for `#[test]` attributes instead; this one
                // hits test modules in other crate files (the canned result below).
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        GrepTool::NAME,
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        GrepTool::NAME,
                        indoc::indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
            ],
            input_file_path,
            Some(input_file_content.into()),
            // The produced diff is judged against these two assertions.
            EvalAssertion::judge_diff(indoc::indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ))
    });
}
1252
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_zode() {
    // Zode eval: the agent is shown the react.py / react_test.py fixtures and
    // must create `root/zode.py` from scratch. The assertion only checks that
    // the generated text does not begin with an invalid character.
    let target_path = "root/zode.py";

    eval_utils::eval(50, 1., eval_utils::NoProcessor, move || {
        run_eval(EvalInput::new(
            vec![
                message(User, [text(include_str!("fixtures/zode/prompt.md"))]),
                // The agent reads both reference files in a single turn.
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            ReadFileTool::NAME,
                            include_str!("fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            ReadFileTool::NAME,
                            include_str!("fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
            ],
            target_path,
            // No pre-existing file content: the file is created from scratch.
            None,
            EvalAssertion::new(async move |sample, _, _cx| {
                // The output must not open with indentation, a code fence, or a
                // blank line. Only the first offending character is reported.
                let offending = [' ', '`', '\n']
                    .into_iter()
                    .find(|&c| sample.text_after.starts_with(c));
                match offending {
                    None => Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    }),
                    Some(c) => Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(format!("The sample starts with a {:?}", c)),
                    }),
                }
            }),
        ))
    });
}
1330
// Eval: the model must add a new test to `action_log.rs` that covers
// overwriting an existing file via `buffer_created`, without changing the
// existing tests. The recorded conversation shows the agent receiving a
// symbol outline of the file, then reading three relevant regions
// (`test_creating_files`, `test_deleting_files`, and `buffer_created`).
// NOTE(review): the `200` / `0.5` arguments to `eval_utils::eval` look
// like the run count and required pass ratio — confirm against `eval_utils`.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_add_overwrite_test() {
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("fixtures/add_overwrite_test/before.rs");

    eval_utils::eval(200, 0.5, eval_utils::NoProcessor, move || {
        run_eval(EvalInput::new(
            vec![
                // The user's request.
                message(
                    User,
                    [text(indoc::indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                // The agent reads the file; the canned result is a symbol outline
                // rather than the raw text.
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        indoc::indoc! {"
                            pub struct ActionLog [L13-20]
                                tracked_buffers [L15]
                                edited_since_project_diagnostics_check [L17]
                                project [L19]
                            impl ActionLog [L22-498]
                                pub fn new [L24-30]
                                pub fn project [L32-34]
                                pub fn checked_project_diagnostics [L37-39]
                                pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                                fn track_buffer_internal [L46-101]
                                fn handle_buffer_event [L103-116]
                                fn handle_buffer_edited [L118-123]
                                fn handle_buffer_file_changed [L125-158]
                                async fn maintain_diff [L160-264]
                                pub fn buffer_read [L267-269]
                                pub fn buffer_created [L272-276]
                                pub fn buffer_edited [L279-287]
                                pub fn will_delete_buffer [L289-304]
                                pub fn keep_edits_in_range [L306-364]
                                pub fn reject_edits_in_ranges [L366-459]
                                pub fn keep_all_edits [L461-473]
                                pub fn changed_buffers [L476-482]
                                pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                                User [L618]
                                Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                                Created [L624]
                                Modified [L625]
                                Deleted [L626]
                            struct TrackedBuffer [L629-641]
                                buffer [L630]
                                base_text [L631]
                                unreviewed_changes [L632]
                                status [L633]
                                version [L634]
                                diff [L635]
                                snapshot [L636]
                                diff_update [L637]
                                _open_lsp_handle [L638]
                                _maintain_diff [L639]
                                _subscription [L640]
                            impl TrackedBuffer [L643-657]
                                fn has_changes [L644-650]
                                fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                                pub diff [L660]
                            mod tests [L664-1574]
                                fn init_logger [L678-682]
                                fn init_test [L684-691]
                                async fn test_keep_edits [L694-769]
                                async fn test_deletions [L772-854]
                                async fn test_overlapping_user_edits [L857-951]
                                async fn test_creating_files [L954-1010]
                                async fn test_deleting_files [L1013-1120]
                                async fn test_reject_edits [L1123-1255]
                                async fn test_reject_multiple_edits [L1258-1331]
                                async fn test_reject_deleted_file [L1334-1388]
                                async fn test_reject_created_file [L1391-1443]
                                async fn test_random_diffs [L1446-1535]
                                    fn quiesce [L1510-1534]
                                struct HunkStatus [L1538-1542]
                                    range [L1539]
                                    diff_status [L1540]
                                    old_text [L1541]
                                fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                // The agent reads the `test_creating_files` region (L954-1010
                // per the outline above).
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        ReadFileTool::NAME,
                        // `lines` presumably slices the fixture to the requested
                        // line range — verify against its definition.
                        lines(input_file_content, 953..1010),
                    )],
                ),
                // The agent reads `test_deleting_files` (L1013-1120) next.
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        ReadFileTool::NAME,
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                // Finally the agent inspects `buffer_created` itself (L272-276).
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        ReadFileTool::NAME,
                        lines(input_file_content, 271..276),
                    )],
                ),
            ],
            input_file_path,
            Some(input_file_content.into()),
            // The produced diff is judged against this single assertion.
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ))
    });
}
1525
// Eval: the user asks for "a second empty todo file". The recorded
// conversation shows the agent listing the `root` directory (which already
// holds TODO and TODO2), so the expected edit is creating `root/TODO3`
// with empty content. Scored by exact equality against the empty string.
// NOTE(review): the `100` / `0.99` arguments to `eval_utils::eval` look
// like the run count and required pass ratio — confirm against `eval_utils`.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_create_empty_file() {
    let input_file_path = "root/TODO3";
    // The target file does not exist yet, so there is no input content.
    let input_file_content = None;
    let expected_output_content = String::new();

    eval_utils::eval(100, 0.99, eval_utils::NoProcessor, move || {
        run_eval(EvalInput::new(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        // `indoc!` rather than `formatdoc!`: the text contains no
                        // interpolations, so no runtime `format!` pass is needed
                        // (this also matches the other fixtures in this file).
                        text(indoc::indoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            ListDirectoryTool::NAME,
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        ListDirectoryTool::NAME,
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
            ],
            input_file_path,
            input_file_content.clone(),
            EvalAssertion::assert_eq(expected_output_content.clone()),
        ))
    });
}