From 6c0069ca983128bb879c186297fdc721920ef522 Mon Sep 17 00:00:00 2001 From: Ben Kunkle Date: Wed, 12 Nov 2025 09:52:11 -0800 Subject: [PATCH] zeta2: Improve error reporting and eval purity (#42470) Closes #ISSUE Improves error reporting for various failure modes of zeta2, including failing to parse the ``/`` pattern, and the contents of `` failing to match. Additionally, makes it so that evals are checked out into a worktree with the _repo_ name instead of the _example_ name, in order to make sure that the eval name has no influence on the models prediction. The repo name worktrees are still namespaced by the example name like `{example_name}/{repo_name}` to ensure evals pointing to the same repo do not conflict. Release Notes: - N/A *or* Added/Fixed/Improved ... --------- Co-authored-by: Agus --- Cargo.lock | 1 + crates/zeta2/Cargo.toml | 3 +- crates/zeta2/src/retrieval_search.rs | 42 +++++++++++---------- crates/zeta2/src/xml_edits.rs | 56 ++++++++++++++++++++++++++-- crates/zeta_cli/src/example.rs | 16 +++++--- 5 files changed, 89 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 25bb2b4063ce437f5de1411a04d129a8de06aff0..d2c799dc41d03e8ed961f5d854ac74797efd01ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -21719,6 +21719,7 @@ dependencies = [ "serde_json", "settings", "smol", + "strsim", "thiserror 2.0.17", "util", "uuid", diff --git a/crates/zeta2/Cargo.toml b/crates/zeta2/Cargo.toml index 1cb3a866065748f8e39dee7a980b99ea0b6c63fa..0360a74e65a109a0c95ea4787a0df1c61b375615 100644 --- a/crates/zeta2/Cargo.toml +++ b/crates/zeta2/Cargo.toml @@ -37,11 +37,13 @@ release_channel.workspace = true serde.workspace = true serde_json.workspace = true smol.workspace = true +strsim.workspace = true thiserror.workspace = true util.workspace = true uuid.workspace = true workspace.workspace = true worktree.workspace = true +pretty_assertions.workspace = true [dev-dependencies] clock = { workspace = true, features = ["test-support"] } @@ -51,7 +53,6 @@ lsp.workspace = true indoc.workspace = true language = { workspace = true, features = ["test-support"] } language_model = { workspace = true, features = ["test-support"] } -pretty_assertions.workspace = true project = { workspace = true, features = ["test-support"] } settings = { workspace = true, features = ["test-support"] } zlog.workspace = true diff --git a/crates/zeta2/src/retrieval_search.rs b/crates/zeta2/src/retrieval_search.rs index f735f44cad9623711e5ed9a1293a74e34e084888..fe28976bb27e27cd6355d3efa13e0a1bf26d5962 100644 --- a/crates/zeta2/src/retrieval_search.rs +++ b/crates/zeta2/src/retrieval_search.rs @@ -81,25 +81,7 @@ pub async fn run_retrieval_searches( for (buffer, ranges) in results.iter_mut() { if let Some(snapshot) = snapshots.get(&buffer.entity_id()) { - ranges.sort_unstable_by(|a, b| { - a.start - .cmp(&b.start, snapshot) - .then(b.end.cmp(&b.end, snapshot)) - }); - - let mut index = 1; - while index < ranges.len() { - if ranges[index - 1] - .end - .cmp(&ranges[index].start, snapshot) - .is_gt() - { - let removed = ranges.remove(index); - ranges[index - 1].end = removed.end; - } else { - index += 1; - } - } + merge_anchor_ranges(ranges, snapshot); } } @@ -108,6 +90,28 @@ pub async fn run_retrieval_searches( .await } +fn merge_anchor_ranges(ranges: &mut Vec>, snapshot: &BufferSnapshot) { + ranges.sort_unstable_by(|a, b| { + a.start + .cmp(&b.start, snapshot) + .then(b.end.cmp(&b.end, snapshot)) + }); + + let mut index = 1; + while index < ranges.len() { + if ranges[index - 1] + .end + .cmp(&ranges[index].start, snapshot) + .is_ge() + { + let removed = ranges.remove(index); + ranges[index - 1].end = removed.end; + } else { + index += 1; + } + } +} + const MAX_EXCERPT_LEN: usize = 768; const MAX_RESULTS_LEN: usize = MAX_EXCERPT_LEN * 5; diff --git a/crates/zeta2/src/xml_edits.rs b/crates/zeta2/src/xml_edits.rs index e8bcc4b1ba7eb2d00cd73b0b2e8d1638a5b00e32..6c9b5a97f6398cc00eaca08f9af6c4c9de991785 100644 --- a/crates/zeta2/src/xml_edits.rs +++ b/crates/zeta2/src/xml_edits.rs @@ -5,6 +5,15 @@ use std::path::Path; use std::sync::Arc; pub async fn parse_xml_edits<'a>( + input: &'a str, + get_buffer: impl Fn(&Path) -> Option<(&'a BufferSnapshot, &'a [Range])> + Send, +) -> Result<(&'a BufferSnapshot, Vec<(Range, Arc)>)> { + parse_xml_edits_inner(input, get_buffer) + .await + .with_context(|| format!("Failed to parse XML edits:\n{input}")) +} + +async fn parse_xml_edits_inner<'a>( mut input: &'a str, get_buffer: impl Fn(&Path) -> Option<(&'a BufferSnapshot, &'a [Range])> + Send, ) -> Result<(&'a BufferSnapshot, Vec<(Range, Arc)>)> { @@ -56,13 +65,29 @@ fn resolve_new_text_old_text_in_buffer( let range = range.to_offset(buffer); let text = buffer.text_for_range(range.clone()).collect::(); for (match_offset, _) in text.match_indices(old_text) { - if offset.is_some() { - anyhow::bail!("old_text is not unique enough:\n{}", old_text); + if let Some(offset) = offset { + let offset_match_point = buffer.offset_to_point(offset); + let second_match_point = buffer.offset_to_point(range.start + match_offset); + anyhow::bail!( + "old_text is not unique enough:\n{}\nFound at {:?} and {:?}", + old_text, + offset_match_point, + second_match_point + ); } offset = Some(range.start + match_offset); } } - offset.ok_or_else(|| anyhow!("Failed to match old_text:\n{}", old_text)) + offset.ok_or_else(|| { + #[cfg(debug_assertions)] + if let Some(closest_match) = closest_old_text_match(buffer, old_text) { + log::info!( + "Closest `old_text` match: {}", + pretty_assertions::StrComparison::new(old_text, &closest_match) + ) + } + anyhow!("Failed to match old_text:\n{}", old_text) + }) }?; let edits_within_hunk = language::text_diff(&old_text, &new_text); @@ -77,6 +102,31 @@ fn resolve_new_text_old_text_in_buffer( })) } +#[cfg(debug_assertions)] +fn closest_old_text_match(buffer: &TextBufferSnapshot, old_text: &str) -> Option { + let buffer_text = buffer.text(); + let mut cursor = 0; + let len = old_text.len(); + + let mut min_score = usize::MAX; + let mut min_start = 0; + + while cursor + len <= buffer_text.len() { + let candidate = &buffer_text[cursor..cursor + len]; + let score = strsim::levenshtein(candidate, old_text); + if score < min_score { + min_score = score; + min_start = cursor; + } + cursor += 1; + } + if min_score != usize::MAX { + Some(buffer_text[min_start..min_start + len].to_string()) + } else { + None + } +} + struct ParsedTag<'a> { attributes: &'a str, body: &'a str, diff --git a/crates/zeta_cli/src/example.rs b/crates/zeta_cli/src/example.rs index a470effa575f5e8ece3c59781dc09d9d1c5e822e..20176fbb5d73de83b90b8edb2831104ecddc8ef0 100644 --- a/crates/zeta_cli/src/example.rs +++ b/crates/zeta_cli/src/example.rs @@ -315,9 +315,6 @@ impl NamedExample { let (repo_owner, repo_name) = self.repo_name()?; let file_name = self.file_name(); - fs::create_dir_all(&*REPOS_DIR)?; - fs::create_dir_all(&*WORKTREES_DIR)?; - let repo_dir = REPOS_DIR.join(repo_owner.as_ref()).join(repo_name.as_ref()); let repo_lock = lock_repo(&repo_dir).await; @@ -332,7 +329,14 @@ impl NamedExample { } // Resolve the example to a revision, fetching it if needed. - let revision = run_git(&repo_dir, &["rev-parse", &self.example.revision]).await; + let revision = run_git( + &repo_dir, + &[ + "rev-parse", + &format!("{}^{{commit}}", self.example.revision), + ], + ) + .await; let revision = if let Ok(revision) = revision { revision } else { @@ -349,7 +353,7 @@ impl NamedExample { }; // Create the worktree for this example if needed. - let worktree_path = WORKTREES_DIR.join(&file_name); + let worktree_path = WORKTREES_DIR.join(&file_name).join(repo_name.as_ref()); if worktree_path.is_dir() { run_git(&worktree_path, &["clean", "--force", "-d"]).await?; run_git(&worktree_path, &["reset", "--hard", "HEAD"]).await?; @@ -477,7 +481,7 @@ impl NamedExample { let mut matches = text.match_indices(&cursor_excerpt); let Some((excerpt_offset, _)) = matches.next() else { anyhow::bail!( - "Cursor excerpt did not exist in buffer.\nExcerpt:\n\n{cursor_excerpt}\nBuffer text:\n{text}\n" + "\nExcerpt:\n\n{cursor_excerpt}\nBuffer text:\n{text}\n.Cursor excerpt did not exist in buffer." ); }; assert!(matches.next().is_none());