diff --git a/crates/zeta2/src/xml_edits.rs b/crates/zeta2/src/xml_edits.rs index 468efa8b202141c4cca04459233ea91c5bff9d44..ee8dd47cb25ad3dcd2c3d7d172b62e724b41c22d 100644 --- a/crates/zeta2/src/xml_edits.rs +++ b/crates/zeta2/src/xml_edits.rs @@ -2,6 +2,11 @@ use anyhow::{Context as _, Result}; use language::{Anchor, BufferSnapshot, OffsetRangeExt as _, Point}; use std::{cmp, ops::Range, path::Path, sync::Arc}; +const EDITS_TAG_NAME: &'static str = "edits"; +const OLD_TEXT_TAG_NAME: &'static str = "old_text"; +const NEW_TEXT_TAG_NAME: &'static str = "new_text"; +const XML_TAGS: &[&str] = &[EDITS_TAG_NAME, OLD_TEXT_TAG_NAME, NEW_TEXT_TAG_NAME]; + pub async fn parse_xml_edits<'a>( input: &'a str, get_buffer: impl Fn(&Path) -> Option<(&'a BufferSnapshot, &'a [Range])> + Send, @@ -12,38 +17,22 @@ pub async fn parse_xml_edits<'a>( } async fn parse_xml_edits_inner<'a>( - mut input: &'a str, + input: &'a str, get_buffer: impl Fn(&Path) -> Option<(&'a BufferSnapshot, &'a [Range])> + Send, ) -> Result<(&'a BufferSnapshot, Vec<(Range, Arc)>)> { - let edits_tag = parse_tag(&mut input, "edits")?.context("No edits tag")?; - - input = edits_tag.body; - - let file_path = edits_tag - .attributes - .trim_start() - .strip_prefix("path") - .context("no file attribute on edits tag")? - .trim_end() - .strip_prefix('=') - .context("no value for path attribute")? - .trim() - .trim_start_matches('"') - .trim_end_matches('"'); + let xml_edits = extract_xml_replacements(input)?; - let (buffer, context_ranges) = get_buffer(file_path.as_ref()) - .with_context(|| format!("no buffer for file {file_path}"))?; + let (buffer, context_ranges) = get_buffer(xml_edits.file_path.as_ref()) + .with_context(|| format!("no buffer for file {}", xml_edits.file_path))?; - let mut edits = vec![]; - while let Some(old_text_tag) = parse_tag(&mut input, "old_text")? { - let new_text_tag = - parse_tag(&mut input, "new_text")?.context("no new_text tag following old_text")?; - let match_range = fuzzy_match_in_ranges(old_text_tag.body, buffer, context_ranges)?; - let old_text = buffer + let mut all_edits = vec![]; + for (old_text, new_text) in xml_edits.replacements { + let match_range = fuzzy_match_in_ranges(old_text, buffer, context_ranges)?; + let matched_old_text = buffer .text_for_range(match_range.clone()) .collect::(); - let edits_within_hunk = language::text_diff(&old_text, &new_text_tag.body); - edits.extend( + let edits_within_hunk = language::text_diff(&matched_old_text, new_text); + all_edits.extend( edits_within_hunk .into_iter() .map(move |(inner_range, inner_text)| { @@ -56,7 +45,7 @@ async fn parse_xml_edits_inner<'a>( ); } - Ok((buffer, edits)) + Ok((buffer, all_edits)) } fn fuzzy_match_in_ranges( @@ -110,32 +99,128 @@ fn fuzzy_match_in_ranges( ); } -struct ParsedTag<'a> { - attributes: &'a str, - body: &'a str, +#[derive(Debug)] +struct XmlEdits<'a> { + file_path: &'a str, + /// Vec of (old_text, new_text) pairs + replacements: Vec<(&'a str, &'a str)>, } -fn parse_tag<'a>(input: &mut &'a str, tag: &str) -> Result>> { - let open_tag = format!("<{}", tag); - let close_tag = format!("", tag); - let Some(start_ix) = input.find(&open_tag) else { - return Ok(None); - }; - let start_ix = start_ix + open_tag.len(); - let closing_bracket_ix = start_ix - + input[start_ix..] +fn extract_xml_replacements(input: &str) -> Result> { + let mut cursor = 0; + + let (edits_body_start, edits_attrs) = + find_tag_open(input, &mut cursor, EDITS_TAG_NAME)?.context("No edits tag found")?; + + let file_path = edits_attrs + .trim_start() + .strip_prefix("path") + .context("no path attribute on edits tag")? + .trim_end() + .strip_prefix('=') + .context("no value for path attribute")? + .trim() + .trim_start_matches('"') + .trim_end_matches('"'); + + cursor = edits_body_start; + let mut edits_list = Vec::new(); + + while let Some((old_body_start, _)) = find_tag_open(input, &mut cursor, OLD_TEXT_TAG_NAME)? { + let old_body_end = find_tag_close(input, &mut cursor)?; + let old_text = trim_surrounding_newlines(&input[old_body_start..old_body_end]); + + let (new_body_start, _) = find_tag_open(input, &mut cursor, NEW_TEXT_TAG_NAME)? + .context("no new_text tag following old_text")?; + let new_body_end = find_tag_close(input, &mut cursor)?; + let new_text = trim_surrounding_newlines(&input[new_body_start..new_body_end]); + + edits_list.push((old_text, new_text)); + } + + Ok(XmlEdits { + file_path, + replacements: edits_list, + }) +} + +/// Trims a single leading and trailing newline +fn trim_surrounding_newlines(input: &str) -> &str { + let start = input.strip_prefix('\n').unwrap_or(input); + let end = start.strip_suffix('\n').unwrap_or(start); + end +} + +fn find_tag_open<'a>( + input: &'a str, + cursor: &mut usize, + expected_tag: &str, +) -> Result> { + let mut search_pos = *cursor; + + while search_pos < input.len() { + let Some(tag_start) = input[search_pos..].find("<") else { + break; + }; + let tag_start = search_pos + tag_start; + if !input[tag_start + 1..].starts_with(expected_tag) { + search_pos = search_pos + tag_start + 1; + continue; + }; + + let after_tag_name = tag_start + expected_tag.len() + 1; + let close_bracket = input[after_tag_name..] .find('>') - .with_context(|| format!("missing > after {tag}"))?; - let attributes = &input[start_ix..closing_bracket_ix].trim(); - let end_ix = closing_bracket_ix - + input[closing_bracket_ix..] - .find(&close_tag) - .with_context(|| format!("no `{close_tag}` tag"))?; - let body = &input[closing_bracket_ix + '>'.len_utf8()..end_ix]; - let body = body.strip_prefix('\n').unwrap_or(body); - let body = body.strip_suffix('\n').unwrap_or(body); - *input = &input[end_ix + close_tag.len()..]; - Ok(Some(ParsedTag { attributes, body })) + .with_context(|| format!("missing > after <{}", expected_tag))?; + let attrs_end = after_tag_name + close_bracket; + let body_start = attrs_end + 1; + + let attributes = input[after_tag_name..attrs_end].trim(); + *cursor = body_start; + + return Ok(Some((body_start, attributes))); + } + + Ok(None) +} + +fn find_tag_close(input: &str, cursor: &mut usize) -> Result { + let mut depth = 1; + let mut search_pos = *cursor; + + while search_pos < input.len() && depth > 0 { + let Some(bracket_offset) = input[search_pos..].find('<') else { + break; + }; + let bracket_pos = search_pos + bracket_offset; + + if input[bracket_pos..].starts_with("') + { + let close_start = bracket_pos + 2; + let tag_name = input[close_start..close_start + close_end].trim(); + + if XML_TAGS.contains(&tag_name) { + depth -= 1; + if depth == 0 { + *cursor = close_start + close_end + 1; + return Ok(bracket_pos); + } + } + search_pos = close_start + close_end + 1; + continue; + } else if let Some(close_bracket_offset) = input[bracket_pos..].find('>') { + let close_bracket_pos = bracket_pos + close_bracket_offset; + let tag_name = &input[bracket_pos + 1..close_bracket_pos].trim(); + if XML_TAGS.contains(&tag_name) { + depth += 1; + } + } + + search_pos = bracket_pos + 1; + } + + anyhow::bail!("no closing tag found") } const REPLACEMENT_COST: u32 = 1; @@ -357,17 +442,128 @@ mod tests { use util::path; #[test] - fn test_parse_tags() { - let mut input = indoc! {r#" - Prelude - - tag value - - "# }; - let parsed = parse_tag(&mut input, "tag").unwrap().unwrap(); - assert_eq!(parsed.attributes, "attr=\"foo\""); - assert_eq!(parsed.body, "tag value"); - assert_eq!(input, "\n"); + fn test_extract_xml_edits() { + let input = indoc! {r#" + + + old content + + + new content + + + "#}; + + let result = extract_xml_replacements(input).unwrap(); + assert_eq!(result.file_path, "test.rs"); + assert_eq!(result.replacements.len(), 1); + assert_eq!(result.replacements[0].0, "old content"); + assert_eq!(result.replacements[0].1, "new content"); + } + + #[test] + fn test_extract_xml_edits_with_wrong_closing_tags() { + let input = indoc! {r#" + + + old content + + + new content + + + "#}; + + let result = extract_xml_replacements(input).unwrap(); + assert_eq!(result.file_path, "test.rs"); + assert_eq!(result.replacements.len(), 1); + assert_eq!(result.replacements[0].0, "old content"); + assert_eq!(result.replacements[0].1, "new content"); + } + + #[test] + fn test_extract_xml_edits_with_xml_like_content() { + let input = indoc! {r#" + + + + + + + + + "#}; + + let result = extract_xml_replacements(input).unwrap(); + assert_eq!(result.file_path, "component.tsx"); + assert_eq!(result.replacements.len(), 1); + assert_eq!(result.replacements[0].0, ""); + assert_eq!( + result.replacements[0].1, + "" + ); + } + + #[test] + fn test_extract_xml_edits_with_conflicting_content() { + let input = indoc! {r#" + + + + + + + + + "#}; + + let result = extract_xml_replacements(input).unwrap(); + assert_eq!(result.file_path, "component.tsx"); + assert_eq!(result.replacements.len(), 1); + assert_eq!(result.replacements[0].0, ""); + assert_eq!(result.replacements[0].1, ""); + } + + #[test] + fn test_extract_xml_edits_multiple_pairs() { + let input = indoc! {r#" + Some reasoning before edits. Lots of thinking going on here + + + + first old + + + first new + + + second old + + + second new + + + "#}; + + let result = extract_xml_replacements(input).unwrap(); + assert_eq!(result.file_path, "test.rs"); + assert_eq!(result.replacements.len(), 2); + assert_eq!(result.replacements[0].0, "first old"); + assert_eq!(result.replacements[0].1, "first new"); + assert_eq!(result.replacements[1].0, "second old"); + assert_eq!(result.replacements[1].1, "second new"); + } + + #[test] + fn test_extract_xml_edits_unexpected_eof() { + let input = indoc! {r#" + + + first old +