From bf0dd4057ce262f6213a685fb73ae5239494673b Mon Sep 17 00:00:00 2001 From: Ben Kunkle Date: Tue, 18 Nov 2025 09:36:37 -0800 Subject: [PATCH] zeta2: Make `new_text`/`old_text` parsing more robust (#42997) Closes #ISSUE The model often uses the wrong closing tag, or has spaces around the closing tag name. This PR makes it so that opening tags are treated as authoritative and any closing tag with the name `new_text` `old_text` or `edits` is accepted based on depth. This has the additional benefit that the parsing is more robust with contents that contain `new_text` `old_text` or `edits. I.e. the following test passes ```rust #[test] fn test_extract_xml_edits_with_conflicting_content() { let input = indoc! {r#" "#}; let result = extract_xml_replacements(input).unwrap(); assert_eq!(result.file_path, "component.tsx"); assert_eq!(result.replacements.len(), 1); assert_eq!(result.replacements[0].0, ""); assert_eq!(result.replacements[0].1, ""); } ``` Release Notes: - N/A *or* Added/Fixed/Improved ... --- crates/zeta2/src/xml_edits.rs | 318 +++++++++++++++++++++++++++------- 1 file changed, 257 insertions(+), 61 deletions(-) diff --git a/crates/zeta2/src/xml_edits.rs b/crates/zeta2/src/xml_edits.rs index 468efa8b202141c4cca04459233ea91c5bff9d44..ee8dd47cb25ad3dcd2c3d7d172b62e724b41c22d 100644 --- a/crates/zeta2/src/xml_edits.rs +++ b/crates/zeta2/src/xml_edits.rs @@ -2,6 +2,11 @@ use anyhow::{Context as _, Result}; use language::{Anchor, BufferSnapshot, OffsetRangeExt as _, Point}; use std::{cmp, ops::Range, path::Path, sync::Arc}; +const EDITS_TAG_NAME: &'static str = "edits"; +const OLD_TEXT_TAG_NAME: &'static str = "old_text"; +const NEW_TEXT_TAG_NAME: &'static str = "new_text"; +const XML_TAGS: &[&str] = &[EDITS_TAG_NAME, OLD_TEXT_TAG_NAME, NEW_TEXT_TAG_NAME]; + pub async fn parse_xml_edits<'a>( input: &'a str, get_buffer: impl Fn(&Path) -> Option<(&'a BufferSnapshot, &'a [Range])> + Send, @@ -12,38 +17,22 @@ pub async fn parse_xml_edits<'a>( } async fn parse_xml_edits_inner<'a>( - mut input: &'a str, + input: &'a str, get_buffer: impl Fn(&Path) -> Option<(&'a BufferSnapshot, &'a [Range])> + Send, ) -> Result<(&'a BufferSnapshot, Vec<(Range, Arc)>)> { - let edits_tag = parse_tag(&mut input, "edits")?.context("No edits tag")?; - - input = edits_tag.body; - - let file_path = edits_tag - .attributes - .trim_start() - .strip_prefix("path") - .context("no file attribute on edits tag")? - .trim_end() - .strip_prefix('=') - .context("no value for path attribute")? - .trim() - .trim_start_matches('"') - .trim_end_matches('"'); + let xml_edits = extract_xml_replacements(input)?; - let (buffer, context_ranges) = get_buffer(file_path.as_ref()) - .with_context(|| format!("no buffer for file {file_path}"))?; + let (buffer, context_ranges) = get_buffer(xml_edits.file_path.as_ref()) + .with_context(|| format!("no buffer for file {}", xml_edits.file_path))?; - let mut edits = vec![]; - while let Some(old_text_tag) = parse_tag(&mut input, "old_text")? { - let new_text_tag = - parse_tag(&mut input, "new_text")?.context("no new_text tag following old_text")?; - let match_range = fuzzy_match_in_ranges(old_text_tag.body, buffer, context_ranges)?; - let old_text = buffer + let mut all_edits = vec![]; + for (old_text, new_text) in xml_edits.replacements { + let match_range = fuzzy_match_in_ranges(old_text, buffer, context_ranges)?; + let matched_old_text = buffer .text_for_range(match_range.clone()) .collect::(); - let edits_within_hunk = language::text_diff(&old_text, &new_text_tag.body); - edits.extend( + let edits_within_hunk = language::text_diff(&matched_old_text, new_text); + all_edits.extend( edits_within_hunk .into_iter() .map(move |(inner_range, inner_text)| { @@ -56,7 +45,7 @@ async fn parse_xml_edits_inner<'a>( ); } - Ok((buffer, edits)) + Ok((buffer, all_edits)) } fn fuzzy_match_in_ranges( @@ -110,32 +99,128 @@ fn fuzzy_match_in_ranges( ); } -struct ParsedTag<'a> { - attributes: &'a str, - body: &'a str, +#[derive(Debug)] +struct XmlEdits<'a> { + file_path: &'a str, + /// Vec of (old_text, new_text) pairs + replacements: Vec<(&'a str, &'a str)>, } -fn parse_tag<'a>(input: &mut &'a str, tag: &str) -> Result>> { - let open_tag = format!("<{}", tag); - let close_tag = format!("", tag); - let Some(start_ix) = input.find(&open_tag) else { - return Ok(None); - }; - let start_ix = start_ix + open_tag.len(); - let closing_bracket_ix = start_ix - + input[start_ix..] +fn extract_xml_replacements(input: &str) -> Result> { + let mut cursor = 0; + + let (edits_body_start, edits_attrs) = + find_tag_open(input, &mut cursor, EDITS_TAG_NAME)?.context("No edits tag found")?; + + let file_path = edits_attrs + .trim_start() + .strip_prefix("path") + .context("no path attribute on edits tag")? + .trim_end() + .strip_prefix('=') + .context("no value for path attribute")? + .trim() + .trim_start_matches('"') + .trim_end_matches('"'); + + cursor = edits_body_start; + let mut edits_list = Vec::new(); + + while let Some((old_body_start, _)) = find_tag_open(input, &mut cursor, OLD_TEXT_TAG_NAME)? { + let old_body_end = find_tag_close(input, &mut cursor)?; + let old_text = trim_surrounding_newlines(&input[old_body_start..old_body_end]); + + let (new_body_start, _) = find_tag_open(input, &mut cursor, NEW_TEXT_TAG_NAME)? + .context("no new_text tag following old_text")?; + let new_body_end = find_tag_close(input, &mut cursor)?; + let new_text = trim_surrounding_newlines(&input[new_body_start..new_body_end]); + + edits_list.push((old_text, new_text)); + } + + Ok(XmlEdits { + file_path, + replacements: edits_list, + }) +} + +/// Trims a single leading and trailing newline +fn trim_surrounding_newlines(input: &str) -> &str { + let start = input.strip_prefix('\n').unwrap_or(input); + let end = start.strip_suffix('\n').unwrap_or(start); + end +} + +fn find_tag_open<'a>( + input: &'a str, + cursor: &mut usize, + expected_tag: &str, +) -> Result> { + let mut search_pos = *cursor; + + while search_pos < input.len() { + let Some(tag_start) = input[search_pos..].find("<") else { + break; + }; + let tag_start = search_pos + tag_start; + if !input[tag_start + 1..].starts_with(expected_tag) { + search_pos = search_pos + tag_start + 1; + continue; + }; + + let after_tag_name = tag_start + expected_tag.len() + 1; + let close_bracket = input[after_tag_name..] .find('>') - .with_context(|| format!("missing > after {tag}"))?; - let attributes = &input[start_ix..closing_bracket_ix].trim(); - let end_ix = closing_bracket_ix - + input[closing_bracket_ix..] - .find(&close_tag) - .with_context(|| format!("no `{close_tag}` tag"))?; - let body = &input[closing_bracket_ix + '>'.len_utf8()..end_ix]; - let body = body.strip_prefix('\n').unwrap_or(body); - let body = body.strip_suffix('\n').unwrap_or(body); - *input = &input[end_ix + close_tag.len()..]; - Ok(Some(ParsedTag { attributes, body })) + .with_context(|| format!("missing > after <{}", expected_tag))?; + let attrs_end = after_tag_name + close_bracket; + let body_start = attrs_end + 1; + + let attributes = input[after_tag_name..attrs_end].trim(); + *cursor = body_start; + + return Ok(Some((body_start, attributes))); + } + + Ok(None) +} + +fn find_tag_close(input: &str, cursor: &mut usize) -> Result { + let mut depth = 1; + let mut search_pos = *cursor; + + while search_pos < input.len() && depth > 0 { + let Some(bracket_offset) = input[search_pos..].find('<') else { + break; + }; + let bracket_pos = search_pos + bracket_offset; + + if input[bracket_pos..].starts_with("') + { + let close_start = bracket_pos + 2; + let tag_name = input[close_start..close_start + close_end].trim(); + + if XML_TAGS.contains(&tag_name) { + depth -= 1; + if depth == 0 { + *cursor = close_start + close_end + 1; + return Ok(bracket_pos); + } + } + search_pos = close_start + close_end + 1; + continue; + } else if let Some(close_bracket_offset) = input[bracket_pos..].find('>') { + let close_bracket_pos = bracket_pos + close_bracket_offset; + let tag_name = &input[bracket_pos + 1..close_bracket_pos].trim(); + if XML_TAGS.contains(&tag_name) { + depth += 1; + } + } + + search_pos = bracket_pos + 1; + } + + anyhow::bail!("no closing tag found") } const REPLACEMENT_COST: u32 = 1; @@ -357,17 +442,128 @@ mod tests { use util::path; #[test] - fn test_parse_tags() { - let mut input = indoc! {r#" - Prelude - - tag value - - "# }; - let parsed = parse_tag(&mut input, "tag").unwrap().unwrap(); - assert_eq!(parsed.attributes, "attr=\"foo\""); - assert_eq!(parsed.body, "tag value"); - assert_eq!(input, "\n"); + fn test_extract_xml_edits() { + let input = indoc! {r#" + + + old content + + + new content + + + "#}; + + let result = extract_xml_replacements(input).unwrap(); + assert_eq!(result.file_path, "test.rs"); + assert_eq!(result.replacements.len(), 1); + assert_eq!(result.replacements[0].0, "old content"); + assert_eq!(result.replacements[0].1, "new content"); + } + + #[test] + fn test_extract_xml_edits_with_wrong_closing_tags() { + let input = indoc! {r#" + + + old content + + + new content + + + "#}; + + let result = extract_xml_replacements(input).unwrap(); + assert_eq!(result.file_path, "test.rs"); + assert_eq!(result.replacements.len(), 1); + assert_eq!(result.replacements[0].0, "old content"); + assert_eq!(result.replacements[0].1, "new content"); + } + + #[test] + fn test_extract_xml_edits_with_xml_like_content() { + let input = indoc! {r#" + + + + + + + + + "#}; + + let result = extract_xml_replacements(input).unwrap(); + assert_eq!(result.file_path, "component.tsx"); + assert_eq!(result.replacements.len(), 1); + assert_eq!(result.replacements[0].0, ""); + assert_eq!( + result.replacements[0].1, + "" + ); + } + + #[test] + fn test_extract_xml_edits_with_conflicting_content() { + let input = indoc! {r#" + + + + + + + + + "#}; + + let result = extract_xml_replacements(input).unwrap(); + assert_eq!(result.file_path, "component.tsx"); + assert_eq!(result.replacements.len(), 1); + assert_eq!(result.replacements[0].0, ""); + assert_eq!(result.replacements[0].1, ""); + } + + #[test] + fn test_extract_xml_edits_multiple_pairs() { + let input = indoc! {r#" + Some reasoning before edits. Lots of thinking going on here + + + + first old + + + first new + + + second old + + + second new + + + "#}; + + let result = extract_xml_replacements(input).unwrap(); + assert_eq!(result.file_path, "test.rs"); + assert_eq!(result.replacements.len(), 2); + assert_eq!(result.replacements[0].0, "first old"); + assert_eq!(result.replacements[0].1, "first new"); + assert_eq!(result.replacements[1].0, "second old"); + assert_eq!(result.replacements[1].1, "second new"); + } + + #[test] + fn test_extract_xml_edits_unexpected_eof() { + let input = indoc! {r#" + + + first old +