zeta2: Improve error reporting and eval purity (#42470)

Ben Kunkle and Agus created 2 weeks ago

Closes #ISSUE

Improves error reporting for various failure modes of zeta2, including
failing to parse the `<old_text>`/`<new_text>` pattern, and the contents
of `<old_text>` failing to match.

Additionally, evals are now checked out into a worktree named after the
_repo_ instead of the _example_, so that the eval name has no influence
on the model's prediction. The repo-named worktrees are still namespaced
by the example name, as `{example_name}/{repo_name}`, to ensure that
evals pointing to the same repo do not conflict.

Release Notes:

- N/A

---------

Co-authored-by: Agus <agus@zed.dev>

Change summary

Cargo.lock                           |  1 
crates/zeta2/Cargo.toml              |  3 +
crates/zeta2/src/retrieval_search.rs | 42 ++++++++++++----------
crates/zeta2/src/xml_edits.rs        | 56 ++++++++++++++++++++++++++++-
crates/zeta_cli/src/example.rs       | 16 +++++---
5 files changed, 89 insertions(+), 29 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -21719,6 +21719,7 @@ dependencies = [
  "serde_json",
  "settings",
  "smol",
+ "strsim",
  "thiserror 2.0.17",
  "util",
  "uuid",

crates/zeta2/Cargo.toml 🔗

@@ -37,11 +37,13 @@ release_channel.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 smol.workspace = true
+strsim.workspace = true
 thiserror.workspace = true
 util.workspace = true
 uuid.workspace = true
 workspace.workspace = true
 worktree.workspace = true
+pretty_assertions.workspace = true
 
 [dev-dependencies]
 clock = { workspace = true, features = ["test-support"] }
@@ -51,7 +53,6 @@ lsp.workspace = true
 indoc.workspace = true
 language = { workspace = true, features = ["test-support"] }
 language_model = { workspace = true, features = ["test-support"] }
-pretty_assertions.workspace = true
 project = { workspace = true, features = ["test-support"] }
 settings = { workspace = true, features = ["test-support"] }
 zlog.workspace = true

crates/zeta2/src/retrieval_search.rs 🔗

@@ -81,25 +81,7 @@ pub async fn run_retrieval_searches(
 
         for (buffer, ranges) in results.iter_mut() {
             if let Some(snapshot) = snapshots.get(&buffer.entity_id()) {
-                ranges.sort_unstable_by(|a, b| {
-                    a.start
-                        .cmp(&b.start, snapshot)
-                        .then(b.end.cmp(&b.end, snapshot))
-                });
-
-                let mut index = 1;
-                while index < ranges.len() {
-                    if ranges[index - 1]
-                        .end
-                        .cmp(&ranges[index].start, snapshot)
-                        .is_gt()
-                    {
-                        let removed = ranges.remove(index);
-                        ranges[index - 1].end = removed.end;
-                    } else {
-                        index += 1;
-                    }
-                }
+                merge_anchor_ranges(ranges, snapshot);
             }
         }
 
@@ -108,6 +90,28 @@ pub async fn run_retrieval_searches(
     .await
 }
 
+/// Sorts `ranges` and merges every pair that overlaps or touches, leaving
+/// disjoint ranges in ascending order.
+///
+/// All anchor comparisons are performed against `snapshot`.
+fn merge_anchor_ranges(ranges: &mut Vec<Range<Anchor>>, snapshot: &BufferSnapshot) {
+    // Start ascending, then end descending, so that among ranges sharing a
+    // start the widest one comes first.
+    ranges.sort_unstable_by(|a, b| {
+        a.start
+            .cmp(&b.start, snapshot)
+            .then(b.end.cmp(&a.end, snapshot))
+    });
+
+    let mut index = 1;
+    while index < ranges.len() {
+        if ranges[index - 1]
+            .end
+            .cmp(&ranges[index].start, snapshot)
+            .is_ge()
+        {
+            let removed = ranges.remove(index);
+            // Only ever grow the surviving range: `removed` may be nested
+            // inside it, and taking its end then would drop covered text.
+            if removed.end.cmp(&ranges[index - 1].end, snapshot).is_gt() {
+                ranges[index - 1].end = removed.end;
+            }
+        } else {
+            index += 1;
+        }
+    }
+}
+
 const MAX_EXCERPT_LEN: usize = 768;
 const MAX_RESULTS_LEN: usize = MAX_EXCERPT_LEN * 5;

crates/zeta2/src/xml_edits.rs 🔗

@@ -5,6 +5,15 @@ use std::path::Path;
 use std::sync::Arc;
 
 pub async fn parse_xml_edits<'a>(
+    input: &'a str,
+    get_buffer: impl Fn(&Path) -> Option<(&'a BufferSnapshot, &'a [Range<Anchor>])> + Send,
+) -> Result<(&'a BufferSnapshot, Vec<(Range<Anchor>, Arc<str>)>)> {
+    parse_xml_edits_inner(input, get_buffer)
+        .await
+        .with_context(|| format!("Failed to parse XML edits:\n{input}"))
+}
+
+async fn parse_xml_edits_inner<'a>(
     mut input: &'a str,
     get_buffer: impl Fn(&Path) -> Option<(&'a BufferSnapshot, &'a [Range<Anchor>])> + Send,
 ) -> Result<(&'a BufferSnapshot, Vec<(Range<Anchor>, Arc<str>)>)> {
@@ -56,13 +65,29 @@ fn resolve_new_text_old_text_in_buffer(
             let range = range.to_offset(buffer);
             let text = buffer.text_for_range(range.clone()).collect::<String>();
             for (match_offset, _) in text.match_indices(old_text) {
-                if offset.is_some() {
-                    anyhow::bail!("old_text is not unique enough:\n{}", old_text);
+                if let Some(offset) = offset {
+                    let offset_match_point = buffer.offset_to_point(offset);
+                    let second_match_point = buffer.offset_to_point(range.start + match_offset);
+                    anyhow::bail!(
+                        "old_text is not unique enough:\n{}\nFound at {:?} and {:?}",
+                        old_text,
+                        offset_match_point,
+                        second_match_point
+                    );
                 }
                 offset = Some(range.start + match_offset);
             }
         }
-        offset.ok_or_else(|| anyhow!("Failed to match old_text:\n{}", old_text))
+        offset.ok_or_else(|| {
+            #[cfg(debug_assertions)]
+            if let Some(closest_match) = closest_old_text_match(buffer, old_text) {
+                log::info!(
+                    "Closest `old_text` match: {}",
+                    pretty_assertions::StrComparison::new(old_text, &closest_match)
+                )
+            }
+            anyhow!("Failed to match old_text:\n{}", old_text)
+        })
     }?;
 
     let edits_within_hunk = language::text_diff(&old_text, &new_text);
@@ -77,6 +102,31 @@ fn resolve_new_text_old_text_in_buffer(
         }))
 }
 
+/// Finds the substring of `buffer` that is closest to `old_text`.
+///
+/// Every window of `old_text.len()` bytes in the buffer text is scored with
+/// `strsim::levenshtein`; the earliest window with the lowest score is
+/// returned. Returns `None` when no window exists, e.g. when the buffer text
+/// is shorter than `old_text`.
+///
+/// Only compiled into debug builds.
+#[cfg(debug_assertions)]
+fn closest_old_text_match(buffer: &TextBufferSnapshot, old_text: &str) -> Option<String> {
+    let buffer_text = buffer.text();
+    let len = old_text.len();
+    // No window fits when the buffer text is shorter than `old_text`.
+    let last_start = buffer_text.len().checked_sub(len)?;
+
+    (0..=last_start)
+        // `str::get` yields `None` for windows whose edges fall inside a
+        // multi-byte character, so those are skipped instead of panicking.
+        .filter_map(|start| buffer_text.get(start..start + len))
+        // `min_by_key` keeps the first of several equally good candidates.
+        .min_by_key(|candidate| strsim::levenshtein(candidate, old_text))
+        .map(str::to_string)
+}
+
 struct ParsedTag<'a> {
     attributes: &'a str,
     body: &'a str,

crates/zeta_cli/src/example.rs 🔗

@@ -315,9 +315,6 @@ impl NamedExample {
         let (repo_owner, repo_name) = self.repo_name()?;
         let file_name = self.file_name();
 
-        fs::create_dir_all(&*REPOS_DIR)?;
-        fs::create_dir_all(&*WORKTREES_DIR)?;
-
         let repo_dir = REPOS_DIR.join(repo_owner.as_ref()).join(repo_name.as_ref());
         let repo_lock = lock_repo(&repo_dir).await;
 
@@ -332,7 +329,14 @@ impl NamedExample {
         }
 
         // Resolve the example to a revision, fetching it if needed.
-        let revision = run_git(&repo_dir, &["rev-parse", &self.example.revision]).await;
+        let revision = run_git(
+            &repo_dir,
+            &[
+                "rev-parse",
+                &format!("{}^{{commit}}", self.example.revision),
+            ],
+        )
+        .await;
         let revision = if let Ok(revision) = revision {
             revision
         } else {
@@ -349,7 +353,7 @@ impl NamedExample {
         };
 
         // Create the worktree for this example if needed.
-        let worktree_path = WORKTREES_DIR.join(&file_name);
+        let worktree_path = WORKTREES_DIR.join(&file_name).join(repo_name.as_ref());
         if worktree_path.is_dir() {
             run_git(&worktree_path, &["clean", "--force", "-d"]).await?;
             run_git(&worktree_path, &["reset", "--hard", "HEAD"]).await?;
@@ -477,7 +481,7 @@ impl NamedExample {
             let mut matches = text.match_indices(&cursor_excerpt);
             let Some((excerpt_offset, _)) = matches.next() else {
                 anyhow::bail!(
-                    "Cursor excerpt did not exist in buffer.\nExcerpt:\n\n{cursor_excerpt}\nBuffer text:\n{text}\n"
+                    "\nExcerpt:\n\n{cursor_excerpt}\nBuffer text:\n{text}\n.Cursor excerpt did not exist in buffer."
                 );
             };
             assert!(matches.next().is_none());