streaming_edit_file.rs

   1use crate::tools::streaming_edit_file_tool::*;
   2use crate::{
   3    AgentTool, ContextServerRegistry, EditFileTool, GrepTool, GrepToolInput, ListDirectoryTool,
   4    ListDirectoryToolInput, ReadFileTool, ReadFileToolInput, StreamingEditFileTool, Template,
   5    Templates, Thread, ToolCallEventStream, ToolInput,
   6};
   7use Role::*;
   8use anyhow::{Context as _, Result};
   9use client::{Client, RefreshLlmTokenListener, UserStore};
  10use fs::FakeFs;
  11use futures::{FutureExt, StreamExt, future::LocalBoxFuture};
  12use gpui::{AppContext as _, AsyncApp, Entity, TestAppContext, UpdateGlobal as _};
  13use http_client::StatusCode;
  14use language::language_settings::FormatOnSave;
  15use language_model::{
  16    LanguageModel, LanguageModelCompletionError, LanguageModelCompletionEvent,
  17    LanguageModelRegistry, LanguageModelRequest, LanguageModelRequestMessage,
  18    LanguageModelRequestTool, LanguageModelToolResult, LanguageModelToolResultContent,
  19    LanguageModelToolSchemaFormat, LanguageModelToolUse, LanguageModelToolUseId, MessageContent,
  20    Role, SelectedModel,
  21};
  22use project::Project;
  23use prompt_store::{ProjectContext, WorktreeContext};
  24use rand::prelude::*;
  25use reqwest_client::ReqwestClient;
  26use serde::Serialize;
  27use serde_json::json;
  28use settings::SettingsStore;
  29use std::{
  30    fmt::{self, Display},
  31    path::{Path, PathBuf},
  32    str::FromStr,
  33    sync::Arc,
  34    time::Duration,
  35};
  36use util::path;
  37
/// Values rendered into the `diff_judge.hbs` prompt template.
#[derive(Serialize)]
struct DiffJudgeTemplate {
    /// The diff the judge model is asked to evaluate.
    diff: String,
    /// The assertions handed to the judge model alongside the diff.
    assertions: &'static str,
}
  43
// Ties `DiffJudgeTemplate` to the template file it is rendered through.
impl Template for DiffJudgeTemplate {
    /// Name of the template used to render the judge prompt.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
  47
/// Description of a single eval case: the conversation to replay, the file
/// under edit, and the assertion used to score the result.
#[derive(Clone)]
struct EvalInput {
    /// Messages sent to the model. `StreamingEditToolTest::eval` prepends a
    /// system prompt when the first message is not a system message.
    conversation: Vec<LanguageModelRequestMessage>,
    /// Path of the file being edited (the evals below use `root/...` paths).
    input_file_path: PathBuf,
    /// Initial content of the file; when `Some`, it is written into the fake
    /// file system before the tool runs.
    input_content: Option<String>,
    /// Scoring function applied to the resulting `EvalSample`.
    assertion: EvalAssertion,
}
  55
  56impl EvalInput {
  57    fn new(
  58        conversation: Vec<LanguageModelRequestMessage>,
  59        input_file_path: impl Into<PathBuf>,
  60        input_content: Option<String>,
  61        assertion: EvalAssertion,
  62    ) -> Self {
  63        EvalInput {
  64            conversation,
  65            input_file_path: input_file_path.into(),
  66            input_content,
  67            assertion,
  68        }
  69    }
  70}
  71
/// The result of one tool run, handed to an `EvalAssertion` for scoring.
#[derive(Clone)]
struct EvalSample {
    /// File content before the edit (empty when the eval had no input content).
    text_before: String,
    /// File content reported by the tool after the edit.
    text_after: String,
    /// The tool input the model produced.
    tool_input: StreamingEditFileToolInput,
    /// Unified diff between the content before and after the edit.
    diff: String,
}
  79
/// Object-safe wrapper around an async scoring function, so assertions can be
/// stored as `Arc<dyn AssertionFn>`.
trait AssertionFn: 'static + Send + Sync {
    /// Scores `sample`, optionally consulting `judge_model`, and returns the
    /// outcome as a boxed local future borrowing `self`, `sample` and `cx`.
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}
  88
  89impl<F> AssertionFn for F
  90where
  91    F: 'static
  92        + Send
  93        + Sync
  94        + AsyncFn(
  95            &EvalSample,
  96            Arc<dyn LanguageModel>,
  97            &mut TestAppContext,
  98        ) -> Result<EvalAssertionOutcome>,
  99{
 100    fn assert<'a>(
 101        &'a self,
 102        sample: &'a EvalSample,
 103        judge_model: Arc<dyn LanguageModel>,
 104        cx: &'a mut TestAppContext,
 105    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
 106        (self)(sample, judge_model, cx).boxed_local()
 107    }
 108}
 109
/// A cheaply clonable, type-erased assertion used to score an `EvalSample`.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
 112
 113impl EvalAssertion {
 114    fn new<F>(f: F) -> Self
 115    where
 116        F: 'static
 117            + Send
 118            + Sync
 119            + AsyncFn(
 120                &EvalSample,
 121                Arc<dyn LanguageModel>,
 122                &mut TestAppContext,
 123            ) -> Result<EvalAssertionOutcome>,
 124    {
 125        EvalAssertion(Arc::new(f))
 126    }
 127
 128    fn assert_eq(expected: impl Into<String>) -> Self {
 129        let expected = expected.into();
 130        Self::new(async move |sample, _judge, _cx| {
 131            Ok(EvalAssertionOutcome {
 132                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
 133                    100
 134                } else {
 135                    0
 136                },
 137                message: None,
 138            })
 139        })
 140    }
 141
 142    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
 143        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
 144        Self::new(async move |sample, _judge, _cx| {
 145            let matches = expected_diffs.iter().any(|possible_diff| {
 146                language::apply_diff_patch(&sample.text_before, possible_diff)
 147                    .map(|expected| {
 148                        strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
 149                    })
 150                    .unwrap_or(false)
 151            });
 152
 153            Ok(EvalAssertionOutcome {
 154                score: if matches { 100 } else { 0 },
 155                message: None,
 156            })
 157        })
 158    }
 159
 160    fn judge_diff(assertions: &'static str) -> Self {
 161        Self::new(async move |sample, judge, cx| {
 162            let prompt = DiffJudgeTemplate {
 163                diff: sample.diff.clone(),
 164                assertions,
 165            }
 166            .render(&Templates::new())
 167            .context("Failed to render diff judge template")?;
 168
 169            let request = LanguageModelRequest {
 170                messages: vec![LanguageModelRequestMessage {
 171                    role: Role::User,
 172                    content: vec![prompt.into()],
 173                    cache: false,
 174                    reasoning_details: None,
 175                }],
 176                thinking_allowed: true,
 177                thinking_effort: judge
 178                    .default_effort_level()
 179                    .map(|effort_level| effort_level.value.to_string()),
 180                ..Default::default()
 181            };
 182            let mut response = retry_on_rate_limit(async || {
 183                Ok(judge
 184                    .stream_completion_text(request.clone(), &cx.to_async())
 185                    .await?)
 186            })
 187            .await?;
 188            let mut output = String::new();
 189            while let Some(chunk) = response.stream.next().await {
 190                let chunk = chunk?;
 191                output.push_str(&chunk);
 192            }
 193
 194            let re = regex::Regex::new(r"<score>(\d+)</score>")
 195                .context("Failed to compile score regex")?;
 196            if let Some(captures) = re.captures(&output)
 197                && let Some(score_match) = captures.get(1)
 198            {
 199                let score = score_match.as_str().parse().unwrap_or(0);
 200                return Ok(EvalAssertionOutcome {
 201                    score,
 202                    message: Some(output),
 203                });
 204            }
 205
 206            anyhow::bail!("No score found in response. Raw output: {output}");
 207        })
 208    }
 209
 210    async fn run(
 211        &self,
 212        input: &EvalSample,
 213        judge_model: Arc<dyn LanguageModel>,
 214        cx: &mut TestAppContext,
 215    ) -> Result<EvalAssertionOutcome> {
 216        self.0.assert(input, judge_model, cx).await
 217    }
 218}
 219
/// Result of one eval run: the produced sample and the score it received.
#[derive(Clone)]
struct StreamingEditEvalOutput {
    /// What the tool produced for this run.
    sample: EvalSample,
    /// Outcome of running the eval's assertion on `sample`.
    assertion: EvalAssertionOutcome,
}
 225
 226impl Display for StreamingEditEvalOutput {
 227    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 228        writeln!(f, "Score: {:?}", self.assertion.score)?;
 229        if let Some(message) = self.assertion.message.as_ref() {
 230            writeln!(f, "Message: {}", message)?;
 231        }
 232        writeln!(f, "Diff:\n{}", self.sample.diff)?;
 233        writeln!(f, "Tool Input:\n{:#?}", self.sample.tool_input)?;
 234        Ok(())
 235    }
 236}
 237
/// Score assigned to a sample by an assertion.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    /// Numeric score; the built-in assertions emit 0 or 100, and `run_eval`
    /// treats anything below 80 as a failure.
    score: usize,
    /// Optional explanation (the judge model's raw reply for `judge_diff`).
    message: Option<String>,
}
 243
/// Shared state for running streaming edit-file evals against real models.
struct StreamingEditToolTest {
    /// Fake file system backing the test project.
    fs: Arc<FakeFs>,
    /// Test project the tool operates on.
    project: Entity<Project>,
    /// Model under evaluation (produces the tool call).
    model: Arc<dyn LanguageModel>,
    /// Model passed to assertions for judging results.
    judge_model: Arc<dyn LanguageModel>,
    /// Default thinking effort of `model`, if it reports one.
    model_thinking_effort: Option<String>,
}
 251
 252impl StreamingEditToolTest {
    /// Builds the eval harness: installs test settings, sets up a
    /// `ReqwestClient`-backed HTTP client and a production `Client`,
    /// initializes the language model providers, creates a test project at
    /// `/root`, and resolves the agent and judge models.
    ///
    /// The models are selected through the `ZED_AGENT_MODEL` and
    /// `ZED_JUDGE_MODEL` environment variables; each falls back to
    /// `anthropic/claude-sonnet-4-6-latest` when unset.
    ///
    /// # Panics
    ///
    /// Panics if the HTTP client cannot be created, if a model identifier
    /// cannot be parsed, or if either model fails to load.
    async fn new(cx: &mut TestAppContext) -> Self {
        // NOTE(review): presumably required because the evals wait on real
        // network I/O — confirm.
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor());
        cx.update(|cx| {
            let settings_store = SettingsStore::test(cx);
            cx.set_global(settings_store);
            // Disable final-newline insertion and format-on-save in the
            // default language settings.
            SettingsStore::update_global(cx, |store: &mut SettingsStore, cx| {
                store.update_user_settings(cx, |settings| {
                    settings
                        .project
                        .all_languages
                        .defaults
                        .ensure_final_newline_on_save = Some(false);
                    settings.project.all_languages.defaults.format_on_save =
                        Some(FormatOnSave::Off);
                });
            });

            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
            language_model::init(cx);
            RefreshLlmTokenListener::register(client.clone(), user_store.clone(), cx);
            language_models::init(user_store, client, cx);
        });

        // Start from an empty worktree; individual evals insert their files.
        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-sonnet-4-6-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-sonnet-4-6-latest".into()),
        )
        .unwrap();

        // Kick off authentication for every registered provider.
        let authenticate_provider_tasks = cx.update(|cx| {
            LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
                registry
                    .providers()
                    .iter()
                    .map(|p| p.authenticate(cx))
                    .collect::<Vec<_>>()
            })
        });
        // Wait for all authentication attempts (their results are ignored
        // here), then resolve both models.
        let (model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    futures::future::join_all(authenticate_provider_tasks).await;
                    let model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (model.unwrap(), judge_model.unwrap())
                })
            })
            .await;

        let model_thinking_effort = model
            .default_effort_level()
            .map(|effort_level| effort_level.value.to_string());

        Self {
            fs,
            project,
            model,
            judge_model,
            model_thinking_effort,
        }
    }
 327
 328    async fn load_model(
 329        selected_model: &SelectedModel,
 330        cx: &mut AsyncApp,
 331    ) -> Result<Arc<dyn LanguageModel>> {
 332        cx.update(|cx| {
 333            let registry = LanguageModelRegistry::read_global(cx);
 334            let provider = registry
 335                .provider(&selected_model.provider)
 336                .expect("Provider not found");
 337            provider.authenticate(cx)
 338        })
 339        .await?;
 340        Ok(cx.update(|cx| {
 341            let models = LanguageModelRegistry::read_global(cx);
 342            models
 343                .available_models(cx)
 344                .find(|model| {
 345                    model.provider_id() == selected_model.provider
 346                        && model.id() == selected_model.model
 347                })
 348                .unwrap_or_else(|| panic!("Model {} not found", selected_model.model.0))
 349        }))
 350    }
 351
 352    /// Build the tool definitions for the model, replacing `edit_file` with the
 353    /// streaming edit file tool schema. In production the streaming tool is
 354    /// exposed under the name `"edit_file"` (see `Thread::enabled_tools`), so
 355    /// the model has never seen the name `"streaming_edit_file"`.
 356    fn build_tools() -> Vec<LanguageModelRequestTool> {
 357        let mut tools: Vec<LanguageModelRequestTool> = crate::built_in_tools()
 358            .filter(|tool| tool.name != EditFileTool::NAME)
 359            .collect();
 360        tools.push(LanguageModelRequestTool {
 361            name: EditFileTool::NAME.to_string(),
 362            description: StreamingEditFileTool::description().to_string(),
 363            input_schema: StreamingEditFileTool::input_schema(
 364                LanguageModelToolSchemaFormat::JsonSchema,
 365            )
 366            .to_value(),
 367            use_input_streaming: StreamingEditFileTool::supports_input_streaming(),
 368        });
 369        tools
 370    }
 371
    /// Runs one eval case end to end.
    ///
    /// Steps: write the input file into the fake file system, build the
    /// request (prepending a rendered system prompt unless the conversation
    /// already starts with one), ask the model for an `edit_file` tool call,
    /// run `StreamingEditFileTool` with that input, and score the resulting
    /// text with the eval's assertion.
    ///
    /// # Errors
    ///
    /// Fails when the conversation is empty, the system prompt cannot be
    /// rendered, no tool call can be extracted, the tool reports an error or
    /// a non-success output, or the assertion itself fails.
    async fn eval(
        &self,
        mut eval: EvalInput,
        cx: &mut TestAppContext,
    ) -> Result<StreamingEditEvalOutput> {
        // Flag the final conversation message as cacheable.
        eval.conversation
            .last_mut()
            .context("Conversation must not be empty")?
            .cache = true;

        // Populate the FakeFs so `resolve_path` / `entry_for_path` can find
        // the file in the worktree.
        if let Some(input_content) = eval.input_content.as_deref() {
            // Paths are given as `root/...`; map them under the `/root` worktree.
            let abs_path = Path::new("/root").join(
                eval.input_file_path
                    .strip_prefix("root")
                    .unwrap_or(&eval.input_file_path),
            );
            self.fs.insert_file(&abs_path, input_content.into()).await;

            // Wait for the worktree to pick up the new file.
            cx.run_until_parked();
        }

        let tools = Self::build_tools();

        let system_prompt = {
            // NOTE(review): the prompt advertises `/path/to/root`, while the
            // fake project lives at `/root` — confirm this is intentional.
            let worktrees = vec![WorktreeContext {
                root_name: "root".to_string(),
                abs_path: Path::new("/path/to/root").into(),
                rules_file: None,
            }];
            let project_context = ProjectContext::new(worktrees, Vec::default());
            let tool_names = tools
                .iter()
                .map(|tool| tool.name.clone().into())
                .collect::<Vec<_>>();
            let template = crate::SystemPromptTemplate {
                project: &project_context,
                available_tools: tool_names,
                model_name: None,
            };
            let templates = Templates::new();
            template.render(&templates)?
        };

        // Only prepend the rendered system prompt when the eval did not
        // supply its own.
        let has_system_prompt = eval
            .conversation
            .first()
            .is_some_and(|msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
                reasoning_details: None,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let request = LanguageModelRequest {
            messages,
            tools,
            thinking_allowed: true,
            thinking_effort: self.model_thinking_effort.clone(),
            ..Default::default()
        };

        // The model will call the tool as "edit_file" (the production-visible
        // name), but the schema is from StreamingEditFileTool.
        let tool_input =
            retry_on_rate_limit(async || self.extract_tool_use(request.clone(), cx).await).await?;

        let language_registry = self
            .project
            .read_with(cx, |project, _cx| project.languages().clone());

        // Build a fresh thread and tool instance for this run.
        let context_server_registry = cx
            .new(|cx| ContextServerRegistry::new(self.project.read(cx).context_server_store(), cx));
        let thread = cx.new(|cx| {
            Thread::new(
                self.project.clone(),
                cx.new(|_cx| ProjectContext::default()),
                context_server_registry,
                Templates::new(),
                Some(self.model.clone()),
                cx,
            )
        });
        let action_log = thread.read_with(cx, |thread, _| thread.action_log().clone());

        let tool = Arc::new(StreamingEditFileTool::new(
            self.project.clone(),
            thread.downgrade(),
            action_log,
            language_registry,
        ));

        // Execute the tool with the already-complete input from the model.
        let result = cx
            .update(|cx| {
                tool.clone().run(
                    ToolInput::resolved(tool_input.clone()),
                    ToolCallEventStream::test().0,
                    cx,
                )
            })
            .await;

        let output = match result {
            Ok(output) => output,
            Err(output) => {
                anyhow::bail!("Tool returned error: {}", output);
            }
        };

        // Any output variant other than `Success` is treated as a failure.
        let StreamingEditFileToolOutput::Success { new_text, .. } = &output else {
            anyhow::bail!("Tool returned error output: {}", output);
        };

        let sample = EvalSample {
            tool_input,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                new_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: new_text.clone(),
        };

        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(StreamingEditEvalOutput { assertion, sample })
    }
 512
 513    /// Stream the model completion and extract the first complete tool use
 514    /// whose name matches `EditFileTool::NAME` (the production-visible name
 515    /// for the streaming edit tool), parsed as `StreamingEditFileToolInput`.
 516    async fn extract_tool_use(
 517        &self,
 518        request: LanguageModelRequest,
 519        cx: &mut TestAppContext,
 520    ) -> Result<StreamingEditFileToolInput> {
 521        let model = self.model.clone();
 522        let events = cx
 523            .update(|cx| {
 524                let async_cx = cx.to_async();
 525                cx.foreground_executor()
 526                    .spawn(async move { model.stream_completion(request, &async_cx).await })
 527            })
 528            .await
 529            .map_err(|err| anyhow::anyhow!("completion error: {}", err))?;
 530
 531        let mut streamed_text = String::new();
 532        let mut stop_reason = None;
 533        let mut parse_errors = Vec::new();
 534
 535        let mut events = events.fuse();
 536        while let Some(event) = events.next().await {
 537            match event {
 538                Ok(LanguageModelCompletionEvent::ToolUse(tool_use))
 539                    if tool_use.is_input_complete
 540                        && tool_use.name.as_ref() == EditFileTool::NAME =>
 541                {
 542                    let input: StreamingEditFileToolInput = serde_json::from_value(tool_use.input)
 543                        .context("Failed to parse tool input as StreamingEditFileToolInput")?;
 544                    return Ok(input);
 545                }
 546                Ok(LanguageModelCompletionEvent::Text(text)) => {
 547                    if streamed_text.len() < 2_000 {
 548                        streamed_text.push_str(&text);
 549                    }
 550                }
 551                Ok(LanguageModelCompletionEvent::Stop(reason)) => {
 552                    stop_reason = Some(reason);
 553                }
 554                Ok(LanguageModelCompletionEvent::ToolUseJsonParseError {
 555                    tool_name,
 556                    raw_input,
 557                    json_parse_error,
 558                    ..
 559                }) if tool_name.as_ref() == EditFileTool::NAME => {
 560                    parse_errors.push(format!("{json_parse_error}\nRaw input:\n{raw_input:?}"));
 561                }
 562                Err(err) => {
 563                    return Err(anyhow::anyhow!("completion error: {}", err));
 564                }
 565                _ => {}
 566            }
 567        }
 568
 569        let streamed_text = streamed_text.trim();
 570        let streamed_text_suffix = if streamed_text.is_empty() {
 571            String::new()
 572        } else {
 573            format!("\nStreamed text:\n{streamed_text}")
 574        };
 575        let stop_reason_suffix = stop_reason
 576            .map(|reason| format!("\nStop reason: {reason:?}"))
 577            .unwrap_or_default();
 578        let parse_errors_suffix = if parse_errors.is_empty() {
 579            String::new()
 580        } else {
 581            format!("\nTool parse errors:\n{}", parse_errors.join("\n"))
 582        };
 583
 584        anyhow::bail!(
 585            "Stream ended without an edit_file tool use{stop_reason_suffix}{parse_errors_suffix}{streamed_text_suffix}"
 586        )
 587    }
 588}
 589
 590fn run_eval(eval: EvalInput) -> eval_utils::EvalOutput<()> {
 591    let dispatcher = gpui::TestDispatcher::new(rand::random());
 592    let mut cx = TestAppContext::build(dispatcher, None);
 593    let foreground_executor = cx.foreground_executor().clone();
 594    let result = foreground_executor.block_test(async {
 595        let test = StreamingEditToolTest::new(&mut cx).await;
 596        let result = test.eval(eval, &mut cx).await;
 597        drop(test);
 598        cx.run_until_parked();
 599        result
 600    });
 601    cx.quit();
 602    match result {
 603        Ok(output) => eval_utils::EvalOutput {
 604            data: output.to_string(),
 605            outcome: if output.assertion.score < 80 {
 606                eval_utils::OutcomeKind::Failed
 607            } else {
 608                eval_utils::OutcomeKind::Passed
 609            },
 610            metadata: (),
 611        },
 612        Err(err) => eval_utils::EvalOutput {
 613            data: format!("{err:?}"),
 614            outcome: eval_utils::OutcomeKind::Error,
 615            metadata: (),
 616        },
 617    }
 618}
 619
 620fn message(
 621    role: Role,
 622    contents: impl IntoIterator<Item = MessageContent>,
 623) -> LanguageModelRequestMessage {
 624    LanguageModelRequestMessage {
 625        role,
 626        content: contents.into_iter().collect(),
 627        cache: false,
 628        reasoning_details: None,
 629    }
 630}
 631
 632fn text(text: impl Into<String>) -> MessageContent {
 633    MessageContent::Text(text.into())
 634}
 635
 636fn lines(input: &str, range: std::ops::Range<usize>) -> String {
 637    input
 638        .lines()
 639        .skip(range.start)
 640        .take(range.len())
 641        .collect::<Vec<_>>()
 642        .join("\n")
 643}
 644
 645fn tool_use(
 646    id: impl Into<Arc<str>>,
 647    name: impl Into<Arc<str>>,
 648    input: impl Serialize,
 649) -> MessageContent {
 650    MessageContent::ToolUse(LanguageModelToolUse {
 651        id: LanguageModelToolUseId::from(id.into()),
 652        name: name.into(),
 653        raw_input: serde_json::to_string_pretty(&input).unwrap(),
 654        input: serde_json::to_value(input).unwrap(),
 655        is_input_complete: true,
 656        thought_signature: None,
 657    })
 658}
 659
 660fn tool_result(
 661    id: impl Into<Arc<str>>,
 662    name: impl Into<Arc<str>>,
 663    result: impl Into<Arc<str>>,
 664) -> MessageContent {
 665    MessageContent::ToolResult(LanguageModelToolResult {
 666        tool_use_id: LanguageModelToolUseId::from(id.into()),
 667        tool_name: name.into(),
 668        is_error: false,
 669        content: LanguageModelToolResultContent::Text(result.into()),
 670        output: None,
 671    })
 672}
 673
 674fn strip_empty_lines(text: &str) -> String {
 675    text.lines()
 676        .filter(|line| !line.trim().is_empty())
 677        .collect::<Vec<_>>()
 678        .join("\n")
 679}
 680
 681async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
 682    const MAX_RETRIES: usize = 20;
 683    let mut attempt = 0;
 684
 685    loop {
 686        attempt += 1;
 687        let response = request().await;
 688
 689        if attempt >= MAX_RETRIES {
 690            return response;
 691        }
 692
 693        let retry_delay = match &response {
 694            Ok(_) => None,
 695            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
 696                Some(err) => match &err {
 697                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
 698                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
 699                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
 700                    }
 701                    LanguageModelCompletionError::UpstreamProviderError {
 702                        status,
 703                        retry_after,
 704                        ..
 705                    } => {
 706                        let should_retry = matches!(
 707                            *status,
 708                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
 709                        ) || status.as_u16() == 529;
 710
 711                        if should_retry {
 712                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
 713                        } else {
 714                            None
 715                        }
 716                    }
 717                    LanguageModelCompletionError::ApiReadResponseError { .. }
 718                    | LanguageModelCompletionError::ApiInternalServerError { .. }
 719                    | LanguageModelCompletionError::HttpSend { .. } => {
 720                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
 721                    }
 722                    _ => None,
 723                },
 724                _ => None,
 725            },
 726        };
 727
 728        if let Some(retry_after) = retry_delay {
 729            let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
 730            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
 731            #[allow(clippy::disallowed_methods)]
 732            smol::Timer::after(retry_after + jitter).await;
 733        } else {
 734            return response;
 735        }
 736    }
 737}
 738
 739#[test]
 740#[cfg_attr(not(feature = "unit-eval"), ignore)]
 741fn eval_delete_function() {
 742    let input_file_path = "root/blame.rs";
 743    let input_file_content = include_str!("fixtures/delete_run_git_blame/before.rs");
 744    let output_file_content = include_str!("fixtures/delete_run_git_blame/after.rs");
 745    let possible_diffs = vec![
 746        language::unified_diff(input_file_content, output_file_content),
 747        language::unified_diff(
 748            input_file_content,
 749            &output_file_content
 750                .replace(
 751                    "const GIT_BLAME_NO_COMMIT_ERROR: &str = \"fatal: no such ref: HEAD\";\n",
 752                    "",
 753                )
 754                .replace(
 755                    "const GIT_BLAME_NO_PATH: &str = \"fatal: no such path\";\n",
 756                    "",
 757                ),
 758        ),
 759    ];
 760
 761    eval_utils::eval(100, 0.95, eval_utils::NoProcessor, move || {
 762        run_eval(EvalInput::new(
 763            vec![
 764                message(
 765                    User,
 766                    [text(indoc::formatdoc! {"
 767                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
 768                        one function, not its usages.
 769                    "})],
 770                ),
 771                message(
 772                    Assistant,
 773                    [tool_use(
 774                        "tool_1",
 775                        ReadFileTool::NAME,
 776                        ReadFileToolInput {
 777                            path: input_file_path.into(),
 778                            start_line: None,
 779                            end_line: None,
 780                        },
 781                    )],
 782                ),
 783                message(
 784                    User,
 785                    [tool_result(
 786                        "tool_1",
 787                        ReadFileTool::NAME,
 788                        input_file_content,
 789                    )],
 790                ),
 791            ],
 792            input_file_path,
 793            Some(input_file_content.into()),
 794            EvalAssertion::assert_diff_any(possible_diffs.clone()),
 795        ))
 796    });
 797}
 798
 799#[test]
 800#[cfg_attr(not(feature = "unit-eval"), ignore)]
 801fn eval_extract_handle_command_output() {
 802    let input_file_path = "root/blame.rs";
 803    let input_file_content = include_str!("fixtures/extract_handle_command_output/before.rs");
 804    let possible_diffs = vec![
 805        include_str!("fixtures/extract_handle_command_output/possible-01.diff"),
 806        include_str!("fixtures/extract_handle_command_output/possible-02.diff"),
 807        include_str!("fixtures/extract_handle_command_output/possible-03.diff"),
 808        include_str!("fixtures/extract_handle_command_output/possible-04.diff"),
 809        include_str!("fixtures/extract_handle_command_output/possible-05.diff"),
 810        include_str!("fixtures/extract_handle_command_output/possible-06.diff"),
 811        include_str!("fixtures/extract_handle_command_output/possible-07.diff"),
 812        include_str!("fixtures/extract_handle_command_output/possible-08.diff"),
 813        include_str!("fixtures/extract_handle_command_output/possible-09.diff"),
 814    ];
 815
 816    eval_utils::eval(100, 0.95, eval_utils::NoProcessor, move || {
 817        run_eval(EvalInput::new(
 818            vec![
 819                message(
 820                    User,
 821                    [text(indoc::formatdoc! {"
 822                        Read the `{input_file_path}` file and extract a method in
 823                        the final stanza of `run_git_blame` to deal with command failures,
 824                        call it `handle_command_output` and take the std::process::Output as the only parameter.
 825                        Do not document the method and do not add any comments.
 826
 827                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
 828                    "})],
 829                ),
 830                message(
 831                    Assistant,
 832                    [tool_use(
 833                        "tool_1",
 834                        ReadFileTool::NAME,
 835                        ReadFileToolInput {
 836                            path: input_file_path.into(),
 837                            start_line: None,
 838                            end_line: None,
 839                        },
 840                    )],
 841                ),
 842                message(
 843                    User,
 844                    [tool_result(
 845                        "tool_1",
 846                        ReadFileTool::NAME,
 847                        input_file_content,
 848                    )],
 849                ),
 850            ],
 851            input_file_path,
 852            Some(input_file_content.into()),
 853            EvalAssertion::assert_diff_any(possible_diffs.clone()),
 854        ))
 855    });
 856}
 857
 858#[test]
 859#[cfg_attr(not(feature = "unit-eval"), ignore)]
 860fn eval_translate_doc_comments() {
 861    let input_file_path = "root/canvas.rs";
 862    let input_file_content = include_str!("fixtures/translate_doc_comments/before.rs");
 863
 864    eval_utils::eval(200, 1., eval_utils::NoProcessor, move || {
 865        run_eval(EvalInput::new(
 866            vec![
 867                message(
 868                    User,
 869                    [text(indoc::formatdoc! {"
 870                        Read the `{input_file_path}` file and edit it (without overwriting it),
 871                        translating all the doc comments to italian.
 872                    "})],
 873                ),
 874                message(
 875                    Assistant,
 876                    [tool_use(
 877                        "tool_1",
 878                        ReadFileTool::NAME,
 879                        ReadFileToolInput {
 880                            path: input_file_path.into(),
 881                            start_line: None,
 882                            end_line: None,
 883                        },
 884                    )],
 885                ),
 886                message(
 887                    User,
 888                    [tool_result(
 889                        "tool_1",
 890                        ReadFileTool::NAME,
 891                        input_file_content,
 892                    )],
 893                ),
 894            ],
 895            input_file_path,
 896            Some(input_file_content.into()),
 897            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
 898        ))
 899    });
 900}
 901
 902#[test]
 903#[cfg_attr(not(feature = "unit-eval"), ignore)]
 904fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
 905    let input_file_path = "root/lib.rs";
 906    let input_file_content =
 907        include_str!("fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
 908
 909    eval_utils::eval(100, 0.95, eval_utils::NoProcessor, move || {
 910        run_eval(EvalInput::new(
 911            vec![
 912                message(
 913                    User,
 914                    [text(indoc::formatdoc! {"
 915                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
 916                        Use `ureq` to download the SDK for the current platform and architecture.
 917                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
 918                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
 919                        that's inside of the archive.
 920                        Don't re-download the SDK if that executable already exists.
 921
 922                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}
 923
 924                        Here are the available wasi-sdk assets:
 925                        - wasi-sdk-25.0-x86_64-macos.tar.gz
 926                        - wasi-sdk-25.0-arm64-macos.tar.gz
 927                        - wasi-sdk-25.0-x86_64-linux.tar.gz
 928                        - wasi-sdk-25.0-arm64-linux.tar.gz
 929                        - wasi-sdk-25.0-x86_64-linux.tar.gz
 930                        - wasi-sdk-25.0-arm64-linux.tar.gz
 931                        - wasi-sdk-25.0-x86_64-windows.tar.gz
 932                    "})],
 933                ),
 934                message(
 935                    Assistant,
 936                    [tool_use(
 937                        "tool_1",
 938                        ReadFileTool::NAME,
 939                        ReadFileToolInput {
 940                            path: input_file_path.into(),
 941                            start_line: Some(971),
 942                            end_line: Some(1050),
 943                        },
 944                    )],
 945                ),
 946                message(
 947                    User,
 948                    [tool_result(
 949                        "tool_1",
 950                        ReadFileTool::NAME,
 951                        lines(input_file_content, 971..1050),
 952                    )],
 953                ),
 954                message(
 955                    Assistant,
 956                    [tool_use(
 957                        "tool_2",
 958                        ReadFileTool::NAME,
 959                        ReadFileToolInput {
 960                            path: input_file_path.into(),
 961                            start_line: Some(1050),
 962                            end_line: Some(1100),
 963                        },
 964                    )],
 965                ),
 966                message(
 967                    User,
 968                    [tool_result(
 969                        "tool_2",
 970                        ReadFileTool::NAME,
 971                        lines(input_file_content, 1050..1100),
 972                    )],
 973                ),
 974                message(
 975                    Assistant,
 976                    [tool_use(
 977                        "tool_3",
 978                        ReadFileTool::NAME,
 979                        ReadFileToolInput {
 980                            path: input_file_path.into(),
 981                            start_line: Some(1100),
 982                            end_line: Some(1150),
 983                        },
 984                    )],
 985                ),
 986                message(
 987                    User,
 988                    [tool_result(
 989                        "tool_3",
 990                        ReadFileTool::NAME,
 991                        lines(input_file_content, 1100..1150),
 992                    )],
 993                ),
 994            ],
 995            input_file_path,
 996            Some(input_file_content.into()),
 997            EvalAssertion::judge_diff(indoc::indoc! {"
 998                    - The compile_parser_to_wasm method has been changed to use wasi-sdk
 999                    - ureq is used to download the SDK for current platform and architecture
1000                "}),
1001        ))
1002    });
1003}
1004
/// Eval: comment out the code that interacts with the `BlinkManager`.
///
/// The scripted conversation starts with a research request, replays a
/// `GrepTool` search for "blink" whose result is assembled from eight slices
/// of the `before.rs` fixture, and ends with the actual editing instruction.
/// The sample is checked with `EvalAssertion::assert_diff_any` against the
/// four recorded candidate diffs.
///
/// Ignored unless the `unit-eval` feature is enabled.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_disable_cursor_blinking() {
    let file_path = "root/editor.rs";
    let before = include_str!("fixtures/disable_cursor_blinking/before.rs");
    let accepted_diffs = vec![
        include_str!("fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("fixtures/disable_cursor_blinking/possible-04.diff"),
    ];

    // NOTE(review): 100 and 0.51 look like the iteration count and the required
    // pass ratio — confirm against `eval_utils::eval`.
    eval_utils::eval(100, 0.51, eval_utils::NoProcessor, move || {
        let research_request =
            message(User, [text("Let's research how to cursor blinking works.")]);

        let grep_call = message(
            Assistant,
            [tool_use(
                "tool_1",
                GrepTool::NAME,
                GrepToolInput {
                    regex: "blink".into(),
                    include_pattern: None,
                    offset: 0,
                    case_sensitive: false,
                },
            )],
        );

        // The slices are joined with the "Match found:" text used as a
        // separator, so it appears between slices only.
        let grep_output = [
            lines(before, 100..400),
            lines(before, 800..1300),
            lines(before, 1600..2000),
            lines(before, 5000..5500),
            lines(before, 8000..9000),
            lines(before, 18455..18470),
            lines(before, 20000..20500),
            lines(before, 21000..21300),
        ]
        .join("Match found:\n\n");
        let grep_result = message(
            User,
            [tool_result("tool_1", GrepTool::NAME, grep_output)],
        );

        let edit_request = message(
            User,
            [text(indoc::indoc! {"
                Comment out the lines that interact with the BlinkManager.
                Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                Don't add additional comments.
            "})],
        );

        run_eval(EvalInput::new(
            vec![research_request, grep_call, grep_result, edit_request],
            file_path,
            Some(before.into()),
            EvalAssertion::assert_diff_any(accepted_diffs.clone()),
        ))
    });
}
1067
/// Eval: add a `from_pixels` constructor to `Canvas` together with tests.
///
/// The scripted conversation replays a full-file `ReadFileTool` call answered
/// with the `before.rs` fixture, followed by three `GrepTool` searches: two
/// for `mod tests` that report no matches and one for `#[test]` that returns
/// a canned six-match report. The outcome is scored through
/// `EvalAssertion::judge_diff`.
///
/// Ignored unless the `unit-eval` feature is enabled.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_from_pixels_constructor() {
    let file_path = "root/canvas.rs";
    let before = include_str!("fixtures/from_pixels_constructor/before.rs");

    // NOTE(review): 100 and 0.95 look like the iteration count and the required
    // pass ratio — confirm against `eval_utils::eval`.
    eval_utils::eval(100, 0.95, eval_utils::NoProcessor, move || {
        // All three grep calls share the same offset and case sensitivity.
        let grep_input = |regex: &'static str, include_pattern: &'static str| GrepToolInput {
            regex: regex.into(),
            include_pattern: Some(include_pattern.into()),
            offset: 0,
            case_sensitive: false,
        };

        // Canned output of the `#[test]` search.
        let test_matches_report = indoc::indoc! {"
            Found 6 matches:

            ## Matches in font-kit/src/loaders/core_text.rs

            ### mod test › L926-936
            ```
            mod test {
                use super::Font;
                use crate::properties::{Stretch, Weight};

                #[cfg(feature = \"source\")]
                use crate::source::SystemSource;

                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                #[cfg(feature = \"source\")]
                #[test]
            ```

            55 lines remaining in ancestor node. Read the file to see all.

            ### mod test › L947-951
            ```
                }

                #[test]
                fn test_core_text_to_css_font_weight() {
                    // Exact matches
            ```

            ### mod test › L959-963
            ```
                }

                #[test]
                fn test_core_text_to_css_font_stretch() {
                    // Exact matches
            ```

            ## Matches in font-kit/src/loaders/freetype.rs

            ### mod test › L1238-1248
            ```
            mod test {
                use crate::loaders::freetype::Font;

                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                #[test]
                fn get_pcf_postscript_name() {
                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                }
            ```

            1 lines remaining in ancestor node. Read the file to see all.

            ## Matches in font-kit/src/sources/core_text.rs

            ### mod test › L265-275
            ```
            mod test {
                use crate::properties::{Stretch, Weight};

                #[test]
                fn test_css_to_core_text_font_weight() {
                    // Exact matches
                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

            ```

            27 lines remaining in ancestor node. Read the file to see all.

            ### mod test › L278-282
            ```
                }

                #[test]
                fn test_css_to_core_text_font_stretch() {
                    // Exact matches
            ```
        "};

        let conversation = vec![
            message(
                User,
                [text(indoc::indoc! {"
                    Introduce a new `from_pixels` constructor in Canvas and
                    also add tests for it in the same file.
                "})],
            ),
            // Full read of the input file.
            message(
                Assistant,
                [tool_use(
                    "tool_1",
                    ReadFileTool::NAME,
                    ReadFileToolInput {
                        path: file_path.into(),
                        start_line: None,
                        end_line: None,
                    },
                )],
            ),
            message(
                User,
                [tool_result("tool_1", ReadFileTool::NAME, before)],
            ),
            // `mod tests` search limited to the canvas file.
            message(
                Assistant,
                [tool_use(
                    "tool_2",
                    GrepTool::NAME,
                    grep_input("mod\\s+tests", "font-kit/src/canvas.rs"),
                )],
            ),
            message(
                User,
                [tool_result("tool_2", GrepTool::NAME, "No matches found")],
            ),
            // `mod tests` search across all Rust sources.
            message(
                Assistant,
                [tool_use(
                    "tool_3",
                    GrepTool::NAME,
                    grep_input("mod\\s+tests", "font-kit/src/**/*.rs"),
                )],
            ),
            message(
                User,
                [tool_result("tool_3", GrepTool::NAME, "No matches found")],
            ),
            // `#[test]` search across all Rust sources.
            message(
                Assistant,
                [tool_use(
                    "tool_4",
                    GrepTool::NAME,
                    grep_input("#\\[test\\]", "font-kit/src/**/*.rs"),
                )],
            ),
            message(
                User,
                [tool_result("tool_4", GrepTool::NAME, test_matches_report)],
            ),
        ];

        run_eval(EvalInput::new(
            conversation,
            file_path,
            Some(before.into()),
            EvalAssertion::judge_diff(indoc::indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ))
    });
}
1255
/// Eval: create `root/zode.py` from the `fixtures/zode/prompt.md` request.
///
/// The scripted conversation replays two `ReadFileTool` calls issued in a
/// single assistant message (`react.py` and `react_test.py`) together with
/// their fixture-backed results. No initial file content is supplied.
///
/// The custom assertion scores a sample 0 when `sample.text_after` starts
/// with a space, a backtick, or a newline, and 100 otherwise.
///
/// Ignored unless the `unit-eval` feature is enabled.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_zode() {
    let file_path = "root/zode.py";

    // NOTE(review): 50 and 1. look like the iteration count and the required
    // pass ratio — confirm against `eval_utils::eval`.
    eval_utils::eval(50, 1., eval_utils::NoProcessor, move || {
        let request = message(User, [text(include_str!("fixtures/zode/prompt.md"))]);

        let read_calls = message(
            Assistant,
            [
                tool_use(
                    "tool_1",
                    ReadFileTool::NAME,
                    ReadFileToolInput {
                        path: "root/eval/react.py".into(),
                        start_line: None,
                        end_line: None,
                    },
                ),
                tool_use(
                    "tool_2",
                    ReadFileTool::NAME,
                    ReadFileToolInput {
                        path: "root/eval/react_test.py".into(),
                        start_line: None,
                        end_line: None,
                    },
                ),
            ],
        );

        let read_results = message(
            User,
            [
                tool_result(
                    "tool_1",
                    ReadFileTool::NAME,
                    include_str!("fixtures/zode/react.py"),
                ),
                tool_result(
                    "tool_2",
                    ReadFileTool::NAME,
                    include_str!("fixtures/zode/react_test.py"),
                ),
            ],
        );

        let assertion = EvalAssertion::new(async move |sample, _, _cx| {
            // Only the first offending prefix (in this order) is reported.
            let offending_start = [' ', '`', '\n']
                .into_iter()
                .find(|&start| sample.text_after.starts_with(start));

            Ok(match offending_start {
                Some(start) => EvalAssertionOutcome {
                    score: 0,
                    message: Some(format!("The sample starts with a {:?}", start)),
                },
                None => EvalAssertionOutcome {
                    score: 100,
                    message: None,
                },
            })
        });

        run_eval(EvalInput::new(
            vec![request, read_calls, read_results],
            file_path,
            None,
            assertion,
        ))
    });
}
1333
/// Eval: add a test to `action_log.rs` that covers overwriting an existing file.
///
/// The scripted conversation replays four `ReadFileTool` calls: a full-file
/// read answered with a canned symbol outline, then three ranged reads
/// answered with slices of the `before.rs` fixture. The outcome is scored
/// through `EvalAssertion::judge_diff`.
///
/// Ignored unless the `unit-eval` feature is enabled.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_add_overwrite_test() {
    let file_path = "root/action_log.rs";
    let before = include_str!("fixtures/add_overwrite_test/before.rs");

    // NOTE(review): 200 and 0.5 look like the iteration count and the required
    // pass ratio — confirm against `eval_utils::eval`.
    eval_utils::eval(200, 0.5, eval_utils::NoProcessor, move || {
        // Canned result of the first (unbounded) read: a symbol outline.
        let outline = indoc::indoc! {"
            pub struct ActionLog [L13-20]
             tracked_buffers [L15]
             edited_since_project_diagnostics_check [L17]
             project [L19]
            impl ActionLog [L22-498]
             pub fn new [L24-30]
             pub fn project [L32-34]
             pub fn checked_project_diagnostics [L37-39]
             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
             fn track_buffer_internal [L46-101]
             fn handle_buffer_event [L103-116]
             fn handle_buffer_edited [L118-123]
             fn handle_buffer_file_changed [L125-158]
             async fn maintain_diff [L160-264]
             pub fn buffer_read [L267-269]
             pub fn buffer_created [L272-276]
             pub fn buffer_edited [L279-287]
             pub fn will_delete_buffer [L289-304]
             pub fn keep_edits_in_range [L306-364]
             pub fn reject_edits_in_ranges [L366-459]
             pub fn keep_all_edits [L461-473]
             pub fn changed_buffers [L476-482]
             pub fn stale_buffers [L485-497]
            fn apply_non_conflicting_edits [L500-561]
            fn diff_snapshots [L563-585]
            fn point_to_row_edit [L587-614]
            enum ChangeAuthor [L617-620]
             User [L618]
             Agent [L619]
            enum TrackedBufferStatus [L623-627]
             Created [L624]
             Modified [L625]
             Deleted [L626]
            struct TrackedBuffer [L629-641]
             buffer [L630]
             base_text [L631]
             unreviewed_changes [L632]
             status [L633]
             version [L634]
             diff [L635]
             snapshot [L636]
             diff_update [L637]
             _open_lsp_handle [L638]
             _maintain_diff [L639]
             _subscription [L640]
            impl TrackedBuffer [L643-657]
             fn has_changes [L644-650]
             fn schedule_diff_update [L652-656]
            pub struct ChangedBuffer [L659-661]
             pub diff [L660]
            mod tests [L664-1574]
             fn init_logger [L678-682]
             fn init_test [L684-691]
             async fn test_keep_edits [L694-769]
             async fn test_deletions [L772-854]
             async fn test_overlapping_user_edits [L857-951]
             async fn test_creating_files [L954-1010]
             async fn test_deleting_files [L1013-1120]
             async fn test_reject_edits [L1123-1255]
             async fn test_reject_multiple_edits [L1258-1331]
             async fn test_reject_deleted_file [L1334-1388]
             async fn test_reject_created_file [L1391-1443]
             async fn test_random_diffs [L1446-1535]
              fn quiesce [L1510-1534]
             struct HunkStatus [L1538-1542]
              range [L1539]
              diff_status [L1540]
              old_text [L1541]
             fn unreviewed_hunks [L1544-1573]

            Showing symbols 1-69 (total symbols: 69)

            Using the line numbers in this outline, you can call this tool again while specifying
            the start_line and end_line fields to see the implementations of symbols in the outline.
        "};

        let conversation = vec![
            message(
                User,
                [text(indoc::indoc! {"
                    Introduce a new test in `action_log.rs` to test overwriting a file.
                    That is, a file already exists, but we call `buffer_created` as if the file were new.
                    Take inspiration from all the other tests in the file.
                "})],
            ),
            // Step 1: unbounded read, answered with the outline.
            message(
                Assistant,
                [tool_use(
                    "tool_1",
                    ReadFileTool::NAME,
                    ReadFileToolInput {
                        path: file_path.into(),
                        start_line: None,
                        end_line: None,
                    },
                )],
            ),
            message(
                User,
                [tool_result("tool_1", ReadFileTool::NAME, outline)],
            ),
            // Step 2: ranged read 953-1010.
            message(
                Assistant,
                [
                    text(
                        "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                    ),
                    tool_use(
                        "tool_2",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: file_path.into(),
                            start_line: Some(953),
                            end_line: Some(1010),
                        },
                    ),
                ],
            ),
            message(
                User,
                [tool_result(
                    "tool_2",
                    ReadFileTool::NAME,
                    lines(before, 953..1010),
                )],
            ),
            // Step 3: ranged read 1012-1120.
            message(
                Assistant,
                [
                    text(
                        "Now let's look at another relevant test to understand how they're structured:",
                    ),
                    tool_use(
                        "tool_3",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: file_path.into(),
                            start_line: Some(1012),
                            end_line: Some(1120),
                        },
                    ),
                ],
            ),
            message(
                User,
                [tool_result(
                    "tool_3",
                    ReadFileTool::NAME,
                    lines(before, 1012..1120),
                )],
            ),
            // Step 4: ranged read 271-276.
            message(
                Assistant,
                [
                    text("Now let's look at how `buffer_created` is implemented:"),
                    tool_use(
                        "tool_4",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: file_path.into(),
                            start_line: Some(271),
                            end_line: Some(276),
                        },
                    ),
                ],
            ),
            message(
                User,
                [tool_result(
                    "tool_4",
                    ReadFileTool::NAME,
                    lines(before, 271..276),
                )],
            ),
        ];

        run_eval(EvalInput::new(
            conversation,
            file_path,
            Some(before.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ))
    });
}
1528
/// Eval: create a second, empty todo file at `root/TODO3`.
///
/// The scripted conversation replays a `ListDirectoryTool` call on `root`
/// whose result lists `TODO`, `TODO2` and `new.txt`. No initial file content
/// is supplied, and the sample is checked with `EvalAssertion::assert_eq`
/// against an empty string.
///
/// Ignored unless the `unit-eval` feature is enabled.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_create_empty_file() {
    let file_path = "root/TODO3";

    // NOTE(review): 100 and 0.99 look like the iteration count and the required
    // pass ratio — confirm against `eval_utils::eval`.
    eval_utils::eval(100, 0.99, eval_utils::NoProcessor, move || {
        let request = message(User, [text("Create a second empty todo file ")]);

        let list_call = message(
            Assistant,
            [
                text(indoc::formatdoc! {"
                    I'll help you create a second empty todo file.
                    First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                    "}),
                tool_use(
                    "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                    ListDirectoryTool::NAME,
                    ListDirectoryToolInput {
                        path: "root".to_string(),
                    },
                ),
            ],
        );

        let list_result = message(
            User,
            [tool_result(
                "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                ListDirectoryTool::NAME,
                "root/TODO\nroot/TODO2\nroot/new.txt\n",
            )],
        );

        run_eval(EvalInput::new(
            vec![request, list_call, list_result],
            file_path,
            None,
            EvalAssertion::assert_eq(String::new()),
        ))
    });
}