zeta2: Make `new_text`/`old_text` parsing more robust (#42997)

crates/zeta2/src/xml_edits.rs 🔗

@@ -2,6 +2,11 @@ use anyhow::{Context as _, Result};
 use language::{Anchor, BufferSnapshot, OffsetRangeExt as _, Point};
 use std::{cmp, ops::Range, path::Path, sync::Arc};
 
+const EDITS_TAG_NAME: &'static str = "edits";
+const OLD_TEXT_TAG_NAME: &'static str = "old_text";
+const NEW_TEXT_TAG_NAME: &'static str = "new_text";
+const XML_TAGS: &[&str] = &[EDITS_TAG_NAME, OLD_TEXT_TAG_NAME, NEW_TEXT_TAG_NAME];
+
 pub async fn parse_xml_edits<'a>(
     input: &'a str,
     get_buffer: impl Fn(&Path) -> Option<(&'a BufferSnapshot, &'a [Range<Anchor>])> + Send,
@@ -12,38 +17,22 @@ pub async fn parse_xml_edits<'a>(
 }
 
 async fn parse_xml_edits_inner<'a>(
-    mut input: &'a str,
+    input: &'a str,
     get_buffer: impl Fn(&Path) -> Option<(&'a BufferSnapshot, &'a [Range<Anchor>])> + Send,
 ) -> Result<(&'a BufferSnapshot, Vec<(Range<Anchor>, Arc<str>)>)> {
-    let edits_tag = parse_tag(&mut input, "edits")?.context("No edits tag")?;
-
-    input = edits_tag.body;
-
-    let file_path = edits_tag
-        .attributes
-        .trim_start()
-        .strip_prefix("path")
-        .context("no file attribute on edits tag")?
-        .trim_end()
-        .strip_prefix('=')
-        .context("no value for path attribute")?
-        .trim()
-        .trim_start_matches('"')
-        .trim_end_matches('"');
+    let xml_edits = extract_xml_replacements(input)?;
 
-    let (buffer, context_ranges) = get_buffer(file_path.as_ref())
-        .with_context(|| format!("no buffer for file {file_path}"))?;
+    let (buffer, context_ranges) = get_buffer(xml_edits.file_path.as_ref())
+        .with_context(|| format!("no buffer for file {}", xml_edits.file_path))?;
 
-    let mut edits = vec![];
-    while let Some(old_text_tag) = parse_tag(&mut input, "old_text")? {
-        let new_text_tag =
-            parse_tag(&mut input, "new_text")?.context("no new_text tag following old_text")?;
-        let match_range = fuzzy_match_in_ranges(old_text_tag.body, buffer, context_ranges)?;
-        let old_text = buffer
+    let mut all_edits = vec![];
+    for (old_text, new_text) in xml_edits.replacements {
+        let match_range = fuzzy_match_in_ranges(old_text, buffer, context_ranges)?;
+        let matched_old_text = buffer
             .text_for_range(match_range.clone())
             .collect::<String>();
-        let edits_within_hunk = language::text_diff(&old_text, &new_text_tag.body);
-        edits.extend(
+        let edits_within_hunk = language::text_diff(&matched_old_text, new_text);
+        all_edits.extend(
             edits_within_hunk
                 .into_iter()
                 .map(move |(inner_range, inner_text)| {
@@ -56,7 +45,7 @@ async fn parse_xml_edits_inner<'a>(
         );
     }
 
-    Ok((buffer, edits))
+    Ok((buffer, all_edits))
 }
 
 fn fuzzy_match_in_ranges(
@@ -110,32 +99,128 @@ fn fuzzy_match_in_ranges(
     );
 }
 
-struct ParsedTag<'a> {
-    attributes: &'a str,
-    body: &'a str,
+#[derive(Debug)]
+struct XmlEdits<'a> {
+    file_path: &'a str,
+    /// Vec of (old_text, new_text) pairs
+    replacements: Vec<(&'a str, &'a str)>,
 }
 
-fn parse_tag<'a>(input: &mut &'a str, tag: &str) -> Result<Option<ParsedTag<'a>>> {
-    let open_tag = format!("<{}", tag);
-    let close_tag = format!("</{}>", tag);
-    let Some(start_ix) = input.find(&open_tag) else {
-        return Ok(None);
-    };
-    let start_ix = start_ix + open_tag.len();
-    let closing_bracket_ix = start_ix
-        + input[start_ix..]
+fn extract_xml_replacements(input: &str) -> Result<XmlEdits<'_>> {
+    let mut cursor = 0;
+
+    let (edits_body_start, edits_attrs) =
+        find_tag_open(input, &mut cursor, EDITS_TAG_NAME)?.context("No edits tag found")?;
+
+    let file_path = edits_attrs
+        .trim_start()
+        .strip_prefix("path")
+        .context("no path attribute on edits tag")?
+        .trim_end()
+        .strip_prefix('=')
+        .context("no value for path attribute")?
+        .trim()
+        .trim_start_matches('"')
+        .trim_end_matches('"');
+
+    cursor = edits_body_start;
+    let mut edits_list = Vec::new();
+
+    while let Some((old_body_start, _)) = find_tag_open(input, &mut cursor, OLD_TEXT_TAG_NAME)? {
+        let old_body_end = find_tag_close(input, &mut cursor)?;
+        let old_text = trim_surrounding_newlines(&input[old_body_start..old_body_end]);
+
+        let (new_body_start, _) = find_tag_open(input, &mut cursor, NEW_TEXT_TAG_NAME)?
+            .context("no new_text tag following old_text")?;
+        let new_body_end = find_tag_close(input, &mut cursor)?;
+        let new_text = trim_surrounding_newlines(&input[new_body_start..new_body_end]);
+
+        edits_list.push((old_text, new_text));
+    }
+
+    Ok(XmlEdits {
+        file_path,
+        replacements: edits_list,
+    })
+}
+
+/// Trims a single leading and trailing newline
+fn trim_surrounding_newlines(input: &str) -> &str {
+    let start = input.strip_prefix('\n').unwrap_or(input);
+    let end = start.strip_suffix('\n').unwrap_or(start);
+    end
+}
+
+fn find_tag_open<'a>(
+    input: &'a str,
+    cursor: &mut usize,
+    expected_tag: &str,
+) -> Result<Option<(usize, &'a str)>> {
+    let mut search_pos = *cursor;
+
+    while search_pos < input.len() {
+        let Some(tag_start) = input[search_pos..].find("<") else {
+            break;
+        };
+        let tag_start = search_pos + tag_start;
+        if !input[tag_start + 1..].starts_with(expected_tag) {
+            search_pos = search_pos + tag_start + 1;
+            continue;
+        };
+
+        let after_tag_name = tag_start + expected_tag.len() + 1;
+        let close_bracket = input[after_tag_name..]
             .find('>')
-            .with_context(|| format!("missing > after {tag}"))?;
-    let attributes = &input[start_ix..closing_bracket_ix].trim();
-    let end_ix = closing_bracket_ix
-        + input[closing_bracket_ix..]
-            .find(&close_tag)
-            .with_context(|| format!("no `{close_tag}` tag"))?;
-    let body = &input[closing_bracket_ix + '>'.len_utf8()..end_ix];
-    let body = body.strip_prefix('\n').unwrap_or(body);
-    let body = body.strip_suffix('\n').unwrap_or(body);
-    *input = &input[end_ix + close_tag.len()..];
-    Ok(Some(ParsedTag { attributes, body }))
+            .with_context(|| format!("missing > after <{}", expected_tag))?;
+        let attrs_end = after_tag_name + close_bracket;
+        let body_start = attrs_end + 1;
+
+        let attributes = input[after_tag_name..attrs_end].trim();
+        *cursor = body_start;
+
+        return Ok(Some((body_start, attributes)));
+    }
+
+    Ok(None)
+}
+
+fn find_tag_close(input: &str, cursor: &mut usize) -> Result<usize> {
+    let mut depth = 1;
+    let mut search_pos = *cursor;
+
+    while search_pos < input.len() && depth > 0 {
+        let Some(bracket_offset) = input[search_pos..].find('<') else {
+            break;
+        };
+        let bracket_pos = search_pos + bracket_offset;
+
+        if input[bracket_pos..].starts_with("</")
+            && let Some(close_end) = input[bracket_pos + 2..].find('>')
+        {
+            let close_start = bracket_pos + 2;
+            let tag_name = input[close_start..close_start + close_end].trim();
+
+            if XML_TAGS.contains(&tag_name) {
+                depth -= 1;
+                if depth == 0 {
+                    *cursor = close_start + close_end + 1;
+                    return Ok(bracket_pos);
+                }
+            }
+            search_pos = close_start + close_end + 1;
+            continue;
+        } else if let Some(close_bracket_offset) = input[bracket_pos..].find('>') {
+            let close_bracket_pos = bracket_pos + close_bracket_offset;
+            let tag_name = &input[bracket_pos + 1..close_bracket_pos].trim();
+            if XML_TAGS.contains(&tag_name) {
+                depth += 1;
+            }
+        }
+
+        search_pos = bracket_pos + 1;
+    }
+
+    anyhow::bail!("no closing tag found")
 }
 
 const REPLACEMENT_COST: u32 = 1;
@@ -357,17 +442,128 @@ mod tests {
     use util::path;
 
     #[test]
-    fn test_parse_tags() {
-        let mut input = indoc! {r#"
-            Prelude
-            <tag attr="foo">
-            tag value
-            </tag>
-            "# };
-        let parsed = parse_tag(&mut input, "tag").unwrap().unwrap();
-        assert_eq!(parsed.attributes, "attr=\"foo\"");
-        assert_eq!(parsed.body, "tag value");
-        assert_eq!(input, "\n");
+    fn test_extract_xml_edits() {
+        let input = indoc! {r#"
+            <edits path="test.rs">
+            <old_text>
+            old content
+            </old_text>
+            <new_text>
+            new content
+            </new_text>
+            </edits>
+        "#};
+
+        let result = extract_xml_replacements(input).unwrap();
+        assert_eq!(result.file_path, "test.rs");
+        assert_eq!(result.replacements.len(), 1);
+        assert_eq!(result.replacements[0].0, "old content");
+        assert_eq!(result.replacements[0].1, "new content");
+    }
+
+    #[test]
+    fn test_extract_xml_edits_with_wrong_closing_tags() {
+        let input = indoc! {r#"
+            <edits path="test.rs">
+            <old_text>
+            old content
+            </new_text>
+            <new_text>
+            new content
+            </old_text>
+            </ edits >
+        "#};
+
+        let result = extract_xml_replacements(input).unwrap();
+        assert_eq!(result.file_path, "test.rs");
+        assert_eq!(result.replacements.len(), 1);
+        assert_eq!(result.replacements[0].0, "old content");
+        assert_eq!(result.replacements[0].1, "new content");
+    }
+
+    #[test]
+    fn test_extract_xml_edits_with_xml_like_content() {
+        let input = indoc! {r#"
+            <edits path="component.tsx">
+            <old_text>
+            <foo><bar></bar></foo>
+            </old_text>
+            <new_text>
+            <foo><bar><baz></baz></bar></foo>
+            </new_text>
+            </edits>
+        "#};
+
+        let result = extract_xml_replacements(input).unwrap();
+        assert_eq!(result.file_path, "component.tsx");
+        assert_eq!(result.replacements.len(), 1);
+        assert_eq!(result.replacements[0].0, "<foo><bar></bar></foo>");
+        assert_eq!(
+            result.replacements[0].1,
+            "<foo><bar><baz></baz></bar></foo>"
+        );
+    }
+
+    #[test]
+    fn test_extract_xml_edits_with_conflicting_content() {
+        let input = indoc! {r#"
+            <edits path="component.tsx">
+            <old_text>
+            <new_text></new_text>
+            </old_text>
+            <new_text>
+            <old_text></old_text>
+            </new_text>
+            </edits>
+        "#};
+
+        let result = extract_xml_replacements(input).unwrap();
+        assert_eq!(result.file_path, "component.tsx");
+        assert_eq!(result.replacements.len(), 1);
+        assert_eq!(result.replacements[0].0, "<new_text></new_text>");
+        assert_eq!(result.replacements[0].1, "<old_text></old_text>");
+    }
+
+    #[test]
+    fn test_extract_xml_edits_multiple_pairs() {
+        let input = indoc! {r#"
+            Some reasoning before edits. Lots of thinking going on here
+
+            <edits path="test.rs">
+            <old_text>
+            first old
+            </old_text>
+            <new_text>
+            first new
+            </new_text>
+            <old_text>
+            second old
+            </edits>
+            <new_text>
+            second new
+            </old_text>
+            </edits>
+        "#};
+
+        let result = extract_xml_replacements(input).unwrap();
+        assert_eq!(result.file_path, "test.rs");
+        assert_eq!(result.replacements.len(), 2);
+        assert_eq!(result.replacements[0].0, "first old");
+        assert_eq!(result.replacements[0].1, "first new");
+        assert_eq!(result.replacements[1].0, "second old");
+        assert_eq!(result.replacements[1].1, "second new");
+    }
+
+    #[test]
+    fn test_extract_xml_edits_unexpected_eof() {
+        let input = indoc! {r#"
+            <edits path="test.rs">
+            <old_text>
+            first old
+            </
+        "#};
+
+        extract_xml_replacements(input).expect_err("Unexpected end of file");
     }
 
     #[gpui::test]
zeta2: Make `new_text`/`old_text` parsing more robust (#42997)

Change summary

Detailed changes

crates/zeta2/src/xml_edits.rs 🔗