streaming_edit_file.rs

   1use crate::tools::streaming_edit_file_tool::*;
   2use crate::{
   3    AgentTool, ContextServerRegistry, EditFileTool, GrepTool, GrepToolInput, ListDirectoryTool,
   4    ListDirectoryToolInput, ReadFileTool, ReadFileToolInput, StreamingEditFileTool, Template,
   5    Templates, Thread, ToolCallEventStream, ToolInput,
   6};
   7use Role::*;
   8use anyhow::{Context as _, Result};
   9use client::{Client, UserStore};
  10use fs::FakeFs;
  11use futures::{FutureExt, StreamExt, future::LocalBoxFuture};
  12use gpui::{AppContext as _, AsyncApp, Entity, TestAppContext, UpdateGlobal as _};
  13use http_client::StatusCode;
  14use language::language_settings::FormatOnSave;
  15use language_model::{
  16    LanguageModel, LanguageModelCompletionError, LanguageModelCompletionEvent,
  17    LanguageModelRegistry, LanguageModelRequest, LanguageModelRequestMessage,
  18    LanguageModelRequestTool, LanguageModelToolResult, LanguageModelToolResultContent,
  19    LanguageModelToolSchemaFormat, LanguageModelToolUse, LanguageModelToolUseId, MessageContent,
  20    Role, SelectedModel,
  21};
  22use project::Project;
  23use prompt_store::{ProjectContext, WorktreeContext};
  24use rand::prelude::*;
  25use reqwest_client::ReqwestClient;
  26use serde::Serialize;
  27use serde_json::json;
  28use settings::SettingsStore;
  29use std::{
  30    fmt::{self, Display},
  31    path::{Path, PathBuf},
  32    str::FromStr,
  33    sync::Arc,
  34    time::Duration,
  35};
  36use util::path;
  37
/// Data rendered into the `diff_judge.hbs` prompt that is sent to the judge
/// model when scoring a diff (see `EvalAssertion::judge_diff`).
#[derive(Serialize)]
struct DiffJudgeTemplate {
    /// Diff text the judge is asked to evaluate.
    diff: String,
    /// Assertions the judge should check the diff against.
    assertions: &'static str,
}

impl Template for DiffJudgeTemplate {
    // Name of the template this struct is rendered with.
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}
  47
  48#[derive(Clone)]
  49struct EvalInput {
  50    conversation: Vec<LanguageModelRequestMessage>,
  51    input_file_path: PathBuf,
  52    input_content: Option<String>,
  53    assertion: EvalAssertion,
  54}
  55
  56impl EvalInput {
  57    fn new(
  58        conversation: Vec<LanguageModelRequestMessage>,
  59        input_file_path: impl Into<PathBuf>,
  60        input_content: Option<String>,
  61        assertion: EvalAssertion,
  62    ) -> Self {
  63        EvalInput {
  64            conversation,
  65            input_file_path: input_file_path.into(),
  66            input_content,
  67            assertion,
  68        }
  69    }
  70}
  71
/// Result of one edit attempt, handed to an assertion for scoring.
#[derive(Clone)]
struct EvalSample {
    /// File content before the edit (empty when there was no input content).
    text_before: String,
    /// File content reported by the tool after the edit.
    text_after: String,
    /// Tool input the model produced for the edit.
    tool_input: StreamingEditFileToolInput,
    /// Unified diff between `text_before` and `text_after`.
    diff: String,
}
  79
/// Object-safe wrapper around an async scoring function so that assertions
/// can be stored behind `Arc<dyn AssertionFn>`.
trait AssertionFn: 'static + Send + Sync {
    /// Scores `sample`, optionally consulting `judge_model`, and returns the
    /// outcome as a boxed (non-`Send`) future.
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}

// Blanket impl: any suitable async closure can be used as an assertion.
impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        // Call the closure and box its future so the trait stays object-safe.
        (self)(sample, judge_model, cx).boxed_local()
    }
}
 109
/// Cheaply clonable handle to a scoring function for eval samples.
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);
 112
 113impl EvalAssertion {
 114    fn new<F>(f: F) -> Self
 115    where
 116        F: 'static
 117            + Send
 118            + Sync
 119            + AsyncFn(
 120                &EvalSample,
 121                Arc<dyn LanguageModel>,
 122                &mut TestAppContext,
 123            ) -> Result<EvalAssertionOutcome>,
 124    {
 125        EvalAssertion(Arc::new(f))
 126    }
 127
 128    fn assert_eq(expected: impl Into<String>) -> Self {
 129        let expected = expected.into();
 130        Self::new(async move |sample, _judge, _cx| {
 131            Ok(EvalAssertionOutcome {
 132                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
 133                    100
 134                } else {
 135                    0
 136                },
 137                message: None,
 138            })
 139        })
 140    }
 141
 142    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
 143        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
 144        Self::new(async move |sample, _judge, _cx| {
 145            let matches = expected_diffs.iter().any(|possible_diff| {
 146                language::apply_diff_patch(&sample.text_before, possible_diff)
 147                    .map(|expected| {
 148                        strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
 149                    })
 150                    .unwrap_or(false)
 151            });
 152
 153            Ok(EvalAssertionOutcome {
 154                score: if matches { 100 } else { 0 },
 155                message: None,
 156            })
 157        })
 158    }
 159
 160    fn judge_diff(assertions: &'static str) -> Self {
 161        Self::new(async move |sample, judge, cx| {
 162            let prompt = DiffJudgeTemplate {
 163                diff: sample.diff.clone(),
 164                assertions,
 165            }
 166            .render(&Templates::new())
 167            .context("Failed to render diff judge template")?;
 168
 169            let request = LanguageModelRequest {
 170                messages: vec![LanguageModelRequestMessage {
 171                    role: Role::User,
 172                    content: vec![prompt.into()],
 173                    cache: false,
 174                    reasoning_details: None,
 175                }],
 176                thinking_allowed: true,
 177                thinking_effort: judge
 178                    .default_effort_level()
 179                    .map(|effort_level| effort_level.value.to_string()),
 180                ..Default::default()
 181            };
 182            let mut response = retry_on_rate_limit(async || {
 183                Ok(judge
 184                    .stream_completion_text(request.clone(), &cx.to_async())
 185                    .await?)
 186            })
 187            .await?;
 188            let mut output = String::new();
 189            while let Some(chunk) = response.stream.next().await {
 190                let chunk = chunk?;
 191                output.push_str(&chunk);
 192            }
 193
 194            let re = regex::Regex::new(r"<score>(\d+)</score>")
 195                .context("Failed to compile score regex")?;
 196            if let Some(captures) = re.captures(&output)
 197                && let Some(score_match) = captures.get(1)
 198            {
 199                let score = score_match.as_str().parse().unwrap_or(0);
 200                return Ok(EvalAssertionOutcome {
 201                    score,
 202                    message: Some(output),
 203                });
 204            }
 205
 206            anyhow::bail!("No score found in response. Raw output: {output}");
 207        })
 208    }
 209
 210    async fn run(
 211        &self,
 212        input: &EvalSample,
 213        judge_model: Arc<dyn LanguageModel>,
 214        cx: &mut TestAppContext,
 215    ) -> Result<EvalAssertionOutcome> {
 216        self.0.assert(input, judge_model, cx).await
 217    }
 218}
 219
 220#[derive(Clone)]
 221struct StreamingEditEvalOutput {
 222    sample: EvalSample,
 223    assertion: EvalAssertionOutcome,
 224}
 225
 226impl Display for StreamingEditEvalOutput {
 227    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 228        writeln!(f, "Score: {:?}", self.assertion.score)?;
 229        if let Some(message) = self.assertion.message.as_ref() {
 230            writeln!(f, "Message: {}", message)?;
 231        }
 232        writeln!(f, "Diff:\n{}", self.sample.diff)?;
 233        writeln!(f, "Tool Input:\n{:#?}", self.sample.tool_input)?;
 234        Ok(())
 235    }
 236}
 237
/// Outcome of running an assertion against a sample.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    /// Score for the sample; the built-in exact-match assertions produce 0 or
    /// 100, and `run_eval` treats anything below 80 as a failure.
    score: usize,
    /// Optional explanation (the judge model's raw reply for `judge_diff`).
    message: Option<String>,
}
 243
/// Shared fixture for one eval run: a fake filesystem, a test project rooted
/// in it, and the models used to produce and to judge edits.
struct StreamingEditToolTest {
    /// In-memory filesystem the project reads from.
    fs: Arc<FakeFs>,
    /// Test project whose worktree is `/root` on `fs`.
    project: Entity<Project>,
    /// Model under evaluation (produces the edit tool call).
    model: Arc<dyn LanguageModel>,
    /// Model passed to assertions for judging.
    judge_model: Arc<dyn LanguageModel>,
    /// Default effort level of `model`, if it reports one, as a string.
    model_thinking_effort: Option<String>,
}
 251
 252impl StreamingEditToolTest {
    /// Builds the fixture: initializes settings and language-model globals,
    /// creates an empty `/root` project on a fake filesystem, and loads the
    /// agent and judge models.
    ///
    /// The models are chosen via the `ZED_AGENT_MODEL` and `ZED_JUDGE_MODEL`
    /// environment variables, each falling back to a built-in default.
    ///
    /// # Panics
    ///
    /// Panics if a model identifier cannot be parsed or a model fails to load.
    async fn new(cx: &mut TestAppContext) -> Self {
        // NOTE(review): presumably required because real model requests block
        // on network I/O — confirm.
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor());
        cx.update(|cx| {
            let settings_store = SettingsStore::test(cx);
            cx.set_global(settings_store);
            // Turn off save-time rewriting so it does not alter the edited text.
            SettingsStore::update_global(cx, |store: &mut SettingsStore, cx| {
                store.update_user_settings(cx, |settings| {
                    settings
                        .project
                        .all_languages
                        .defaults
                        .ensure_final_newline_on_save = Some(false);
                    settings.project.all_languages.defaults.format_on_save =
                        Some(FormatOnSave::Off);
                });
            });

            // Wire up a real HTTP client and the language-model registries.
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
            language_model::init(user_store.clone(), client.clone(), cx);
            language_models::init(user_store, client, cx);
        });

        // Start from an empty worktree; `eval` inserts the input file later.
        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-sonnet-4-6-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-sonnet-4-6-latest".into()),
        )
        .unwrap();

        // Kick off authentication for every registered provider.
        let authenticate_provider_tasks = cx.update(|cx| {
            LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
                registry
                    .providers()
                    .iter()
                    .map(|p| p.authenticate(cx))
                    .collect::<Vec<_>>()
            })
        });
        let (model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    // Individual authentication results are discarded here;
                    // `load_model` authenticates the selected provider again
                    // and surfaces its error.
                    futures::future::join_all(authenticate_provider_tasks).await;
                    let model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (model.unwrap(), judge_model.unwrap())
                })
            })
            .await;

        let model_thinking_effort = model
            .default_effort_level()
            .map(|effort_level| effort_level.value.to_string());

        Self {
            fs,
            project,
            model,
            judge_model,
            model_thinking_effort,
        }
    }
 326
 327    async fn load_model(
 328        selected_model: &SelectedModel,
 329        cx: &mut AsyncApp,
 330    ) -> Result<Arc<dyn LanguageModel>> {
 331        cx.update(|cx| {
 332            let registry = LanguageModelRegistry::read_global(cx);
 333            let provider = registry
 334                .provider(&selected_model.provider)
 335                .expect("Provider not found");
 336            provider.authenticate(cx)
 337        })
 338        .await?;
 339        Ok(cx.update(|cx| {
 340            let models = LanguageModelRegistry::read_global(cx);
 341            models
 342                .available_models(cx)
 343                .find(|model| {
 344                    model.provider_id() == selected_model.provider
 345                        && model.id() == selected_model.model
 346                })
 347                .unwrap_or_else(|| panic!("Model {} not found", selected_model.model.0))
 348        }))
 349    }
 350
 351    /// Build the tool definitions for the model, replacing `edit_file` with the
 352    /// streaming edit file tool schema. In production the streaming tool is
 353    /// exposed under the name `"edit_file"` (see `Thread::enabled_tools`), so
 354    /// the model has never seen the name `"streaming_edit_file"`.
 355    fn build_tools() -> Vec<LanguageModelRequestTool> {
 356        let mut tools: Vec<LanguageModelRequestTool> = crate::built_in_tools()
 357            .filter(|tool| tool.name != EditFileTool::NAME)
 358            .collect();
 359        tools.push(LanguageModelRequestTool {
 360            name: EditFileTool::NAME.to_string(),
 361            description: StreamingEditFileTool::description().to_string(),
 362            input_schema: StreamingEditFileTool::input_schema(
 363                LanguageModelToolSchemaFormat::JsonSchema,
 364            )
 365            .to_value(),
 366            use_input_streaming: StreamingEditFileTool::supports_input_streaming(),
 367        });
 368        tools
 369    }
 370
    /// Runs one eval end to end: seeds the input file, asks the model for an
    /// edit tool call, executes that call with `StreamingEditFileTool`, and
    /// scores the result with the eval's assertion.
    ///
    /// # Errors
    ///
    /// Fails when the conversation is empty, the system prompt cannot be
    /// rendered, no usable tool call is produced, the tool reports an error,
    /// or the assertion itself fails.
    async fn eval(
        &self,
        mut eval: EvalInput,
        cx: &mut TestAppContext,
    ) -> Result<StreamingEditEvalOutput> {
        // Flag the final conversation message as cacheable.
        eval.conversation
            .last_mut()
            .context("Conversation must not be empty")?
            .cache = true;

        // Populate the FakeFs so `resolve_path` / `entry_for_path` can find
        // the file in the worktree.
        if let Some(input_content) = eval.input_content.as_deref() {
            // Map a worktree-relative path such as `root/blame.rs` onto the
            // `/root` directory of the fake filesystem.
            let abs_path = Path::new("/root").join(
                eval.input_file_path
                    .strip_prefix("root")
                    .unwrap_or(&eval.input_file_path),
            );
            self.fs.insert_file(&abs_path, input_content.into()).await;

            // Wait for the worktree to pick up the new file.
            cx.run_until_parked();
        }

        let tools = Self::build_tools();

        // Render the system prompt; it is only used below when the
        // conversation does not already start with a system message.
        let system_prompt = {
            // NOTE(review): the prompt advertises `/path/to/root` while the
            // fake filesystem uses `/root` — confirm this is intentional.
            let worktrees = vec![WorktreeContext {
                root_name: "root".to_string(),
                abs_path: Path::new("/path/to/root").into(),
                rules_file: None,
            }];
            let project_context = ProjectContext::new(worktrees, Vec::default());
            let tool_names = tools
                .iter()
                .map(|tool| tool.name.clone().into())
                .collect::<Vec<_>>();
            let template = crate::SystemPromptTemplate {
                project: &project_context,
                available_tools: tool_names,
                model_name: None,
            };
            let templates = Templates::new();
            template.render(&templates)?
        };

        let has_system_prompt = eval
            .conversation
            .first()
            .is_some_and(|msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            // Prepend the rendered system prompt to the conversation.
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
                reasoning_details: None,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let request = LanguageModelRequest {
            messages,
            tools,
            thinking_allowed: true,
            thinking_effort: self.model_thinking_effort.clone(),
            ..Default::default()
        };

        // The model will call the tool as "edit_file" (the production-visible
        // name), but the schema is from StreamingEditFileTool.
        let tool_input =
            retry_on_rate_limit(async || self.extract_tool_use(request.clone(), cx).await).await?;

        let language_registry = self
            .project
            .read_with(cx, |project, _cx| project.languages().clone());

        // Build the thread and action log needed to construct the tool.
        let context_server_registry = cx
            .new(|cx| ContextServerRegistry::new(self.project.read(cx).context_server_store(), cx));
        let thread = cx.new(|cx| {
            Thread::new(
                self.project.clone(),
                cx.new(|_cx| ProjectContext::default()),
                context_server_registry,
                Templates::new(),
                Some(self.model.clone()),
                cx,
            )
        });
        let action_log = thread.read_with(cx, |thread, _| thread.action_log().clone());

        let tool = Arc::new(StreamingEditFileTool::new(
            self.project.clone(),
            thread.downgrade(),
            action_log,
            language_registry,
        ));

        // Execute the tool with the fully resolved input the model produced.
        let result = cx
            .update(|cx| {
                tool.clone().run(
                    ToolInput::resolved(tool_input.clone()),
                    ToolCallEventStream::test().0,
                    cx,
                )
            })
            .await;

        let output = match result {
            Ok(output) => output,
            Err(output) => {
                anyhow::bail!("Tool returned error: {}", output);
            }
        };

        // Any output variant other than `Success` is reported as an error.
        let StreamingEditFileToolOutput::Success { new_text, .. } = &output else {
            anyhow::bail!("Tool returned error output: {}", output);
        };

        let sample = EvalSample {
            tool_input,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                new_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: new_text.clone(),
        };

        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(StreamingEditEvalOutput { assertion, sample })
    }
 511
 512    /// Stream the model completion and extract the first complete tool use
 513    /// whose name matches `EditFileTool::NAME` (the production-visible name
 514    /// for the streaming edit tool), parsed as `StreamingEditFileToolInput`.
 515    async fn extract_tool_use(
 516        &self,
 517        request: LanguageModelRequest,
 518        cx: &mut TestAppContext,
 519    ) -> Result<StreamingEditFileToolInput> {
 520        let model = self.model.clone();
 521        let events = cx
 522            .update(|cx| {
 523                let async_cx = cx.to_async();
 524                cx.foreground_executor()
 525                    .spawn(async move { model.stream_completion(request, &async_cx).await })
 526            })
 527            .await
 528            .map_err(|err| anyhow::anyhow!("completion error: {}", err))?;
 529
 530        let mut streamed_text = String::new();
 531        let mut stop_reason = None;
 532        let mut parse_errors = Vec::new();
 533
 534        let mut events = events.fuse();
 535        while let Some(event) = events.next().await {
 536            match event {
 537                Ok(LanguageModelCompletionEvent::ToolUse(tool_use))
 538                    if tool_use.is_input_complete
 539                        && tool_use.name.as_ref() == EditFileTool::NAME =>
 540                {
 541                    let input: StreamingEditFileToolInput = serde_json::from_value(tool_use.input)
 542                        .context("Failed to parse tool input as StreamingEditFileToolInput")?;
 543                    return Ok(input);
 544                }
 545                Ok(LanguageModelCompletionEvent::Text(text)) => {
 546                    if streamed_text.len() < 2_000 {
 547                        streamed_text.push_str(&text);
 548                    }
 549                }
 550                Ok(LanguageModelCompletionEvent::Stop(reason)) => {
 551                    stop_reason = Some(reason);
 552                }
 553                Ok(LanguageModelCompletionEvent::ToolUseJsonParseError {
 554                    tool_name,
 555                    raw_input,
 556                    json_parse_error,
 557                    ..
 558                }) if tool_name.as_ref() == EditFileTool::NAME => {
 559                    parse_errors.push(format!("{json_parse_error}\nRaw input:\n{raw_input:?}"));
 560                }
 561                Err(err) => {
 562                    return Err(anyhow::anyhow!("completion error: {}", err));
 563                }
 564                _ => {}
 565            }
 566        }
 567
 568        let streamed_text = streamed_text.trim();
 569        let streamed_text_suffix = if streamed_text.is_empty() {
 570            String::new()
 571        } else {
 572            format!("\nStreamed text:\n{streamed_text}")
 573        };
 574        let stop_reason_suffix = stop_reason
 575            .map(|reason| format!("\nStop reason: {reason:?}"))
 576            .unwrap_or_default();
 577        let parse_errors_suffix = if parse_errors.is_empty() {
 578            String::new()
 579        } else {
 580            format!("\nTool parse errors:\n{}", parse_errors.join("\n"))
 581        };
 582
 583        anyhow::bail!(
 584            "Stream ended without an edit_file tool use{stop_reason_suffix}{parse_errors_suffix}{streamed_text_suffix}"
 585        )
 586    }
 587}
 588
 589fn run_eval(eval: EvalInput) -> eval_utils::EvalOutput<()> {
 590    let dispatcher = gpui::TestDispatcher::new(rand::random());
 591    let mut cx = TestAppContext::build(dispatcher, None);
 592    let foreground_executor = cx.foreground_executor().clone();
 593    let result = foreground_executor.block_test(async {
 594        let test = StreamingEditToolTest::new(&mut cx).await;
 595        let result = test.eval(eval, &mut cx).await;
 596        drop(test);
 597        cx.run_until_parked();
 598        result
 599    });
 600    cx.quit();
 601    match result {
 602        Ok(output) => eval_utils::EvalOutput {
 603            data: output.to_string(),
 604            outcome: if output.assertion.score < 80 {
 605                eval_utils::OutcomeKind::Failed
 606            } else {
 607                eval_utils::OutcomeKind::Passed
 608            },
 609            metadata: (),
 610        },
 611        Err(err) => eval_utils::EvalOutput {
 612            data: format!("{err:?}"),
 613            outcome: eval_utils::OutcomeKind::Error,
 614            metadata: (),
 615        },
 616    }
 617}
 618
 619fn message(
 620    role: Role,
 621    contents: impl IntoIterator<Item = MessageContent>,
 622) -> LanguageModelRequestMessage {
 623    LanguageModelRequestMessage {
 624        role,
 625        content: contents.into_iter().collect(),
 626        cache: false,
 627        reasoning_details: None,
 628    }
 629}
 630
 631fn text(text: impl Into<String>) -> MessageContent {
 632    MessageContent::Text(text.into())
 633}
 634
 635fn lines(input: &str, range: std::ops::Range<usize>) -> String {
 636    input
 637        .lines()
 638        .skip(range.start)
 639        .take(range.len())
 640        .collect::<Vec<_>>()
 641        .join("\n")
 642}
 643
 644fn tool_use(
 645    id: impl Into<Arc<str>>,
 646    name: impl Into<Arc<str>>,
 647    input: impl Serialize,
 648) -> MessageContent {
 649    MessageContent::ToolUse(LanguageModelToolUse {
 650        id: LanguageModelToolUseId::from(id.into()),
 651        name: name.into(),
 652        raw_input: serde_json::to_string_pretty(&input).unwrap(),
 653        input: serde_json::to_value(input).unwrap(),
 654        is_input_complete: true,
 655        thought_signature: None,
 656    })
 657}
 658
 659fn tool_result(
 660    id: impl Into<Arc<str>>,
 661    name: impl Into<Arc<str>>,
 662    result: impl Into<Arc<str>>,
 663) -> MessageContent {
 664    MessageContent::ToolResult(LanguageModelToolResult {
 665        tool_use_id: LanguageModelToolUseId::from(id.into()),
 666        tool_name: name.into(),
 667        is_error: false,
 668        content: LanguageModelToolResultContent::Text(result.into()),
 669        output: None,
 670    })
 671}
 672
 673fn strip_empty_lines(text: &str) -> String {
 674    text.lines()
 675        .filter(|line| !line.trim().is_empty())
 676        .collect::<Vec<_>>()
 677        .join("\n")
 678}
 679
 680async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
 681    const MAX_RETRIES: usize = 20;
 682    let mut attempt = 0;
 683
 684    loop {
 685        attempt += 1;
 686        let response = request().await;
 687
 688        if attempt >= MAX_RETRIES {
 689            return response;
 690        }
 691
 692        let retry_delay = match &response {
 693            Ok(_) => None,
 694            Err(err) => match err.downcast_ref::<LanguageModelCompletionError>() {
 695                Some(err) => match &err {
 696                    LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
 697                    | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
 698                        Some(retry_after.unwrap_or(Duration::from_secs(5)))
 699                    }
 700                    LanguageModelCompletionError::UpstreamProviderError {
 701                        status,
 702                        retry_after,
 703                        ..
 704                    } => {
 705                        let should_retry = matches!(
 706                            *status,
 707                            StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
 708                        ) || status.as_u16() == 529;
 709
 710                        if should_retry {
 711                            Some(retry_after.unwrap_or(Duration::from_secs(5)))
 712                        } else {
 713                            None
 714                        }
 715                    }
 716                    LanguageModelCompletionError::ApiReadResponseError { .. }
 717                    | LanguageModelCompletionError::ApiInternalServerError { .. }
 718                    | LanguageModelCompletionError::HttpSend { .. } => {
 719                        Some(Duration::from_secs(2_u64.pow((attempt - 1) as u32).min(30)))
 720                    }
 721                    _ => None,
 722                },
 723                _ => None,
 724            },
 725        };
 726
 727        if let Some(retry_after) = retry_delay {
 728            let jitter = retry_after.mul_f64(rand::rng().random_range(0.0..1.0));
 729            eprintln!("Attempt #{attempt}: Retry after {retry_after:?} + jitter of {jitter:?}");
 730            #[allow(clippy::disallowed_methods)]
 731            smol::Timer::after(retry_after + jitter).await;
 732        } else {
 733            return response;
 734        }
 735    }
 736}
 737
/// Eval: the model is asked to delete the `run_git_blame` function (and only
/// that function) from a file it has already read.
///
/// An edit passes when the result matches the `after.rs` fixture, or the same
/// fixture with the two `GIT_BLAME_*` constants also removed. Ignored unless
/// the `unit-eval` feature is enabled.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_delete_function() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("fixtures/delete_run_git_blame/after.rs");
    let possible_diffs = vec![
        // Exact expected result.
        language::unified_diff(input_file_content, output_file_content),
        // Also accept removing the two constants along with the function.
        language::unified_diff(
            input_file_content,
            &output_file_content
                .replace(
                    "const GIT_BLAME_NO_COMMIT_ERROR: &str = \"fatal: no such ref: HEAD\";\n",
                    "",
                )
                .replace(
                    "const GIT_BLAME_NO_PATH: &str = \"fatal: no such path\";\n",
                    "",
                ),
        ),
    ];

    eval_utils::eval(100, 0.95, eval_utils::NoProcessor, move || {
        run_eval(EvalInput::new(
            // Conversation: user request, then a pre-recorded read of the file.
            vec![
                message(
                    User,
                    [text(indoc::formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
            ],
            input_file_path,
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
 797
/// Eval: the model is asked to extract the final stanza of `run_git_blame`
/// into a new `handle_command_output` method placed next to it.
///
/// An edit passes when it matches any of the `possible-*.diff` fixtures.
/// Ignored unless the `unit-eval` feature is enabled.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_extract_handle_command_output() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("fixtures/extract_handle_command_output/before.rs");
    // Accepted variations of the expected edit.
    let possible_diffs = vec![
        include_str!("fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-07.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-08.diff"),
        include_str!("fixtures/extract_handle_command_output/possible-09.diff"),
    ];

    eval_utils::eval(100, 0.95, eval_utils::NoProcessor, move || {
        run_eval(EvalInput::new(
            // Conversation: user request, then a pre-recorded read of the file.
            vec![
                message(
                    User,
                    [text(indoc::formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        ReadFileTool::NAME,
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        ReadFileTool::NAME,
                        input_file_content,
                    )],
                ),
            ],
            input_file_path,
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs.clone()),
        ))
    });
}
 856
 857#[test]
 858#[cfg_attr(not(feature = "unit-eval"), ignore)]
 859fn eval_translate_doc_comments() {
 860    let input_file_path = "root/canvas.rs";
 861    let input_file_content = include_str!("fixtures/translate_doc_comments/before.rs");
 862
 863    eval_utils::eval(200, 1., eval_utils::NoProcessor, move || {
 864        run_eval(EvalInput::new(
 865            vec![
 866                message(
 867                    User,
 868                    [text(indoc::formatdoc! {"
 869                        Read the `{input_file_path}` file and edit it (without overwriting it),
 870                        translating all the doc comments to italian.
 871                    "})],
 872                ),
 873                message(
 874                    Assistant,
 875                    [tool_use(
 876                        "tool_1",
 877                        ReadFileTool::NAME,
 878                        ReadFileToolInput {
 879                            path: input_file_path.into(),
 880                            start_line: None,
 881                            end_line: None,
 882                        },
 883                    )],
 884                ),
 885                message(
 886                    User,
 887                    [tool_result(
 888                        "tool_1",
 889                        ReadFileTool::NAME,
 890                        input_file_content,
 891                    )],
 892                ),
 893            ],
 894            input_file_path,
 895            Some(input_file_content.into()),
 896            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
 897        ))
 898    });
 899}
 900
 901#[test]
 902#[cfg_attr(not(feature = "unit-eval"), ignore)]
 903fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
 904    let input_file_path = "root/lib.rs";
 905    let input_file_content =
 906        include_str!("fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
 907
 908    eval_utils::eval(100, 0.95, eval_utils::NoProcessor, move || {
 909        run_eval(EvalInput::new(
 910            vec![
 911                message(
 912                    User,
 913                    [text(indoc::formatdoc! {"
 914                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
 915                        Use `ureq` to download the SDK for the current platform and architecture.
 916                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
 917                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
 918                        that's inside of the archive.
 919                        Don't re-download the SDK if that executable already exists.
 920
 921                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}
 922
 923                        Here are the available wasi-sdk assets:
 924                        - wasi-sdk-25.0-x86_64-macos.tar.gz
 925                        - wasi-sdk-25.0-arm64-macos.tar.gz
 926                        - wasi-sdk-25.0-x86_64-linux.tar.gz
 927                        - wasi-sdk-25.0-arm64-linux.tar.gz
 928                        - wasi-sdk-25.0-x86_64-linux.tar.gz
 929                        - wasi-sdk-25.0-arm64-linux.tar.gz
 930                        - wasi-sdk-25.0-x86_64-windows.tar.gz
 931                    "})],
 932                ),
 933                message(
 934                    Assistant,
 935                    [tool_use(
 936                        "tool_1",
 937                        ReadFileTool::NAME,
 938                        ReadFileToolInput {
 939                            path: input_file_path.into(),
 940                            start_line: Some(971),
 941                            end_line: Some(1050),
 942                        },
 943                    )],
 944                ),
 945                message(
 946                    User,
 947                    [tool_result(
 948                        "tool_1",
 949                        ReadFileTool::NAME,
 950                        lines(input_file_content, 971..1050),
 951                    )],
 952                ),
 953                message(
 954                    Assistant,
 955                    [tool_use(
 956                        "tool_2",
 957                        ReadFileTool::NAME,
 958                        ReadFileToolInput {
 959                            path: input_file_path.into(),
 960                            start_line: Some(1050),
 961                            end_line: Some(1100),
 962                        },
 963                    )],
 964                ),
 965                message(
 966                    User,
 967                    [tool_result(
 968                        "tool_2",
 969                        ReadFileTool::NAME,
 970                        lines(input_file_content, 1050..1100),
 971                    )],
 972                ),
 973                message(
 974                    Assistant,
 975                    [tool_use(
 976                        "tool_3",
 977                        ReadFileTool::NAME,
 978                        ReadFileToolInput {
 979                            path: input_file_path.into(),
 980                            start_line: Some(1100),
 981                            end_line: Some(1150),
 982                        },
 983                    )],
 984                ),
 985                message(
 986                    User,
 987                    [tool_result(
 988                        "tool_3",
 989                        ReadFileTool::NAME,
 990                        lines(input_file_content, 1100..1150),
 991                    )],
 992                ),
 993            ],
 994            input_file_path,
 995            Some(input_file_content.into()),
 996            EvalAssertion::judge_diff(indoc::indoc! {"
 997                    - The compile_parser_to_wasm method has been changed to use wasi-sdk
 998                    - ureq is used to download the SDK for current platform and architecture
 999                "}),
1000        ))
1001    });
1002}
1003
/// Eval: commenting out the code that interacts with `BlinkManager`.
///
/// The scripted conversation replays a grep for `blink` whose result is
/// stitched together from eight excerpts of `root/editor.rs`, followed by
/// the actual instruction. The outcome is checked with
/// `EvalAssertion::assert_diff_any` against the `possible-*.diff` fixtures.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_disable_cursor_blinking() {
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("fixtures/disable_cursor_blinking/before.rs");
    let accepted_diffs = vec![
        include_str!("fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("fixtures/disable_cursor_blinking/possible-04.diff"),
    ];
    let instruction = indoc::indoc! {"
        Comment out the lines that interact with the BlinkManager.
        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
        Don't add additional comments.
    "};

    // NOTE(review): `100` and `0.51` are presumably the sample count and the
    // required pass ratio — confirm against `eval_utils::eval`.
    eval_utils::eval(100, 0.51, eval_utils::NoProcessor, move || {
        let grep_for_blink = GrepToolInput {
            regex: "blink".into(),
            include_pattern: None,
            offset: 0,
            case_sensitive: false,
        };

        // `join` inserts the separator only *between* excerpts, so the first
        // excerpt is not preceded by a "Match found:" header.
        let grep_output = [
            lines(input_file_content, 100..400),
            lines(input_file_content, 800..1300),
            lines(input_file_content, 1600..2000),
            lines(input_file_content, 5000..5500),
            lines(input_file_content, 8000..9000),
            lines(input_file_content, 18455..18470),
            lines(input_file_content, 20000..20500),
            lines(input_file_content, 21000..21300),
        ]
        .join("Match found:\n\n");

        let conversation = vec![
            message(User, [text("Let's research how to cursor blinking works.")]),
            message(
                Assistant,
                [tool_use("tool_1", GrepTool::NAME, grep_for_blink)],
            ),
            message(User, [tool_result("tool_1", GrepTool::NAME, grep_output)]),
            message(User, [text(instruction)]),
        ];

        run_eval(EvalInput::new(
            conversation,
            input_file_path,
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(accepted_diffs.clone()),
        ))
    });
}
1066
/// Eval: adding a `from_pixels` constructor (plus tests) to `Canvas`.
///
/// The scripted conversation replays a full read of `root/canvas.rs` and
/// three greps for existing tests: two that report no matches and one whose
/// canned output shows `#[test]` usages in other files. The resulting diff
/// is handed to `EvalAssertion::judge_diff` with two criteria.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_from_pixels_constructor() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("fixtures/from_pixels_constructor/before.rs");

    let request = indoc::indoc! {"
        Introduce a new `from_pixels` constructor in Canvas and
        also add tests for it in the same file.
    "};

    // Canned output of the `#\[test\]` grep (tool_4).
    let test_attr_matches = indoc::indoc! {"
        Found 6 matches:

        ## Matches in font-kit/src/loaders/core_text.rs

        ### mod test › L926-936
        ```
        mod test {
            use super::Font;
            use crate::properties::{Stretch, Weight};

            #[cfg(feature = \"source\")]
            use crate::source::SystemSource;

            static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

            #[cfg(feature = \"source\")]
            #[test]
        ```

        55 lines remaining in ancestor node. Read the file to see all.

        ### mod test › L947-951
        ```
            }

            #[test]
            fn test_core_text_to_css_font_weight() {
                // Exact matches
        ```

        ### mod test › L959-963
        ```
            }

            #[test]
            fn test_core_text_to_css_font_stretch() {
                // Exact matches
        ```

        ## Matches in font-kit/src/loaders/freetype.rs

        ### mod test › L1238-1248
        ```
        mod test {
            use crate::loaders::freetype::Font;

            static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
            static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

            #[test]
            fn get_pcf_postscript_name() {
                let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
            }
        ```

        1 lines remaining in ancestor node. Read the file to see all.

        ## Matches in font-kit/src/sources/core_text.rs

        ### mod test › L265-275
        ```
        mod test {
            use crate::properties::{Stretch, Weight};

            #[test]
            fn test_css_to_core_text_font_weight() {
                // Exact matches
                assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

        ```

        27 lines remaining in ancestor node. Read the file to see all.

        ### mod test › L278-282
        ```
            }

            #[test]
            fn test_css_to_core_text_font_stretch() {
                // Exact matches
        ```
    "};

    let criteria = indoc::indoc! {"
        - The diff contains a new `from_pixels` constructor
        - The diff contains new tests for the `from_pixels` constructor
    "};

    // NOTE(review): `100` and `0.95` are presumably the sample count and the
    // required pass ratio — confirm against `eval_utils::eval`.
    eval_utils::eval(100, 0.95, eval_utils::NoProcessor, move || {
        // All three greps share `offset: 0` and `case_sensitive: false`;
        // only the id, the regex and the include pattern vary.
        let grep_call = |id: &'static str, regex: &'static str, include: &'static str| {
            tool_use(
                id,
                GrepTool::NAME,
                GrepToolInput {
                    regex: regex.into(),
                    include_pattern: Some(include.into()),
                    offset: 0,
                    case_sensitive: false,
                },
            )
        };

        // The assistant reads the whole file (no line range).
        let read_whole_file = ReadFileToolInput {
            path: input_file_path.into(),
            start_line: None,
            end_line: None,
        };

        let conversation = vec![
            message(User, [text(request)]),
            message(
                Assistant,
                [tool_use("tool_1", ReadFileTool::NAME, read_whole_file)],
            ),
            message(
                User,
                [tool_result("tool_1", ReadFileTool::NAME, input_file_content)],
            ),
            message(
                Assistant,
                [grep_call("tool_2", "mod\\s+tests", "font-kit/src/canvas.rs")],
            ),
            message(
                User,
                [tool_result("tool_2", GrepTool::NAME, "No matches found")],
            ),
            message(
                Assistant,
                [grep_call("tool_3", "mod\\s+tests", "font-kit/src/**/*.rs")],
            ),
            message(
                User,
                [tool_result("tool_3", GrepTool::NAME, "No matches found")],
            ),
            message(
                Assistant,
                [grep_call("tool_4", "#\\[test\\]", "font-kit/src/**/*.rs")],
            ),
            message(
                User,
                [tool_result("tool_4", GrepTool::NAME, test_attr_matches)],
            ),
        ];

        run_eval(EvalInput::new(
            conversation,
            input_file_path,
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(criteria),
        ))
    });
}
1254
/// Eval: creating `root/zode.py` from scratch (there is no initial content).
///
/// The scripted conversation carries the prompt fixture and replays reads of
/// `root/eval/react.py` and `root/eval/react_test.py`. The custom assertion
/// only inspects the first character of the produced text: it scores 0 when
/// the text starts with a space, a backtick or a newline, and 100 otherwise.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_zode() {
    let input_file_path = "root/zode.py";
    let input_content = None;

    // NOTE(review): `50` and `1.` are presumably the sample count and the
    // required pass ratio — confirm against `eval_utils::eval`.
    eval_utils::eval(50, 1., eval_utils::NoProcessor, move || {
        run_eval(EvalInput::new(
            vec![
                message(User, [text(include_str!("fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            ReadFileTool::NAME,
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            ReadFileTool::NAME,
                            include_str!("fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            ReadFileTool::NAME,
                            include_str!("fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
            ],
            input_file_path,
            input_content.clone(),
            EvalAssertion::new(async move |sample, _, _cx| {
                // Only the first offending prefix is reported, in the same
                // priority order as before: space, backtick, newline.
                let offending_start = [' ', '`', '\n']
                    .into_iter()
                    .find(|&start| sample.text_after.starts_with(start));

                Ok(match offending_start {
                    Some(start) => EvalAssertionOutcome {
                        score: 0,
                        message: Some(format!("The sample starts with a {:?}", start)),
                    },
                    None => EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    },
                })
            }),
        ))
    });
}
1332
/// Eval: adding a test to `action_log.rs` for overwriting an existing file.
///
/// The scripted conversation replays an initial read that is answered with a
/// symbol outline, then three ranged reads (953..1010, 1012..1120 and
/// 271..276) of `root/action_log.rs`. The resulting diff is handed to
/// `EvalAssertion::judge_diff` with a single criterion.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_add_overwrite_test() {
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("fixtures/add_overwrite_test/before.rs");

    let request = indoc::indoc! {"
        Introduce a new test in `action_log.rs` to test overwriting a file.
        That is, a file already exists, but we call `buffer_created` as if the file were new.
        Take inspiration from all the other tests in the file.
    "};

    // Canned reply to the first, range-less read: a symbol outline. The one-
    // and two-space indents inside the outline are significant.
    let outline = indoc::indoc! {"
        pub struct ActionLog [L13-20]
         tracked_buffers [L15]
         edited_since_project_diagnostics_check [L17]
         project [L19]
        impl ActionLog [L22-498]
         pub fn new [L24-30]
         pub fn project [L32-34]
         pub fn checked_project_diagnostics [L37-39]
         pub fn has_edited_files_since_project_diagnostics_check [L42-44]
         fn track_buffer_internal [L46-101]
         fn handle_buffer_event [L103-116]
         fn handle_buffer_edited [L118-123]
         fn handle_buffer_file_changed [L125-158]
         async fn maintain_diff [L160-264]
         pub fn buffer_read [L267-269]
         pub fn buffer_created [L272-276]
         pub fn buffer_edited [L279-287]
         pub fn will_delete_buffer [L289-304]
         pub fn keep_edits_in_range [L306-364]
         pub fn reject_edits_in_ranges [L366-459]
         pub fn keep_all_edits [L461-473]
         pub fn changed_buffers [L476-482]
         pub fn stale_buffers [L485-497]
        fn apply_non_conflicting_edits [L500-561]
        fn diff_snapshots [L563-585]
        fn point_to_row_edit [L587-614]
        enum ChangeAuthor [L617-620]
         User [L618]
         Agent [L619]
        enum TrackedBufferStatus [L623-627]
         Created [L624]
         Modified [L625]
         Deleted [L626]
        struct TrackedBuffer [L629-641]
         buffer [L630]
         base_text [L631]
         unreviewed_changes [L632]
         status [L633]
         version [L634]
         diff [L635]
         snapshot [L636]
         diff_update [L637]
         _open_lsp_handle [L638]
         _maintain_diff [L639]
         _subscription [L640]
        impl TrackedBuffer [L643-657]
         fn has_changes [L644-650]
         fn schedule_diff_update [L652-656]
        pub struct ChangedBuffer [L659-661]
         pub diff [L660]
        mod tests [L664-1574]
         fn init_logger [L678-682]
         fn init_test [L684-691]
         async fn test_keep_edits [L694-769]
         async fn test_deletions [L772-854]
         async fn test_overlapping_user_edits [L857-951]
         async fn test_creating_files [L954-1010]
         async fn test_deleting_files [L1013-1120]
         async fn test_reject_edits [L1123-1255]
         async fn test_reject_multiple_edits [L1258-1331]
         async fn test_reject_deleted_file [L1334-1388]
         async fn test_reject_created_file [L1391-1443]
         async fn test_random_diffs [L1446-1535]
          fn quiesce [L1510-1534]
         struct HunkStatus [L1538-1542]
          range [L1539]
          diff_status [L1540]
          old_text [L1541]
         fn unreviewed_hunks [L1544-1573]

        Showing symbols 1-69 (total symbols: 69)

        Using the line numbers in this outline, you can call this tool again while specifying
        the start_line and end_line fields to see the implementations of symbols in the outline.
    "};

    let criterion =
        "A new test for overwritten files was created, without changing any previous test";

    // NOTE(review): `200` and `0.5` are presumably the sample count and the
    // required pass ratio — confirm against `eval_utils::eval`.
    eval_utils::eval(200, 0.5, eval_utils::NoProcessor, move || {
        // tool_1: whole-file read, answered with the outline.
        let read_whole_file = ReadFileToolInput {
            path: input_file_path.into(),
            start_line: None,
            end_line: None,
        };

        // tool_2: ranged read around the file-creation test.
        let read_creation_test = ReadFileToolInput {
            path: input_file_path.into(),
            start_line: Some(953),
            end_line: Some(1010),
        };
        let creation_test_excerpt = lines(input_file_content, 953..1010);

        // tool_3: ranged read of a second test.
        let read_second_test = ReadFileToolInput {
            path: input_file_path.into(),
            start_line: Some(1012),
            end_line: Some(1120),
        };
        let second_test_excerpt = lines(input_file_content, 1012..1120);

        // tool_4: ranged read of `buffer_created`.
        let read_buffer_created = ReadFileToolInput {
            path: input_file_path.into(),
            start_line: Some(271),
            end_line: Some(276),
        };
        let buffer_created_excerpt = lines(input_file_content, 271..276);

        let conversation = vec![
            message(User, [text(request)]),
            message(
                Assistant,
                [tool_use("tool_1", ReadFileTool::NAME, read_whole_file)],
            ),
            message(User, [tool_result("tool_1", ReadFileTool::NAME, outline)]),
            message(
                Assistant,
                [
                    text(
                        "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                    ),
                    tool_use("tool_2", ReadFileTool::NAME, read_creation_test),
                ],
            ),
            message(
                User,
                [tool_result(
                    "tool_2",
                    ReadFileTool::NAME,
                    creation_test_excerpt,
                )],
            ),
            message(
                Assistant,
                [
                    text(
                        "Now let's look at another relevant test to understand how they're structured:",
                    ),
                    tool_use("tool_3", ReadFileTool::NAME, read_second_test),
                ],
            ),
            message(
                User,
                [tool_result(
                    "tool_3",
                    ReadFileTool::NAME,
                    second_test_excerpt,
                )],
            ),
            message(
                Assistant,
                [
                    text("Now let's look at how `buffer_created` is implemented:"),
                    tool_use("tool_4", ReadFileTool::NAME, read_buffer_created),
                ],
            ),
            message(
                User,
                [tool_result(
                    "tool_4",
                    ReadFileTool::NAME,
                    buffer_created_excerpt,
                )],
            ),
        ];

        run_eval(EvalInput::new(
            conversation,
            input_file_path,
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(criterion),
        ))
    });
}
1527
/// Eval: creating an empty `root/TODO3` file.
///
/// There is no initial content. The scripted conversation replays a
/// directory listing of `root` that already contains `TODO` and `TODO2`,
/// and the outcome is checked with `EvalAssertion::assert_eq` against an
/// empty string.
#[test]
#[cfg_attr(not(feature = "unit-eval"), ignore)]
fn eval_create_empty_file() {
    let input_file_path = "root/TODO3";
    let input_file_content = None;
    let expected_output_content = String::new();

    // NOTE(review): `100` and `0.99` are presumably the sample count and the
    // required pass ratio — confirm against `eval_utils::eval`.
    eval_utils::eval(100, 0.99, eval_utils::NoProcessor, move || {
        let assistant_preamble = indoc::formatdoc! {"
            I'll help you create a second empty todo file.
            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
        "};
        let list_root = ListDirectoryToolInput {
            path: "root".to_string(),
        };

        let conversation = vec![
            // The trailing space in the request is part of the eval input.
            message(User, [text("Create a second empty todo file ")]),
            message(
                Assistant,
                [
                    text(assistant_preamble),
                    tool_use(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        ListDirectoryTool::NAME,
                        list_root,
                    ),
                ],
            ),
            message(
                User,
                [tool_result(
                    "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                    ListDirectoryTool::NAME,
                    "root/TODO\nroot/TODO2\nroot/new.txt\n",
                )],
            ),
        ];

        run_eval(EvalInput::new(
            conversation,
            input_file_path,
            input_file_content.clone(),
            EvalAssertion::assert_eq(expected_output_content.clone()),
        ))
    });
}