ep cli: Make the udiff parsing more resilient (#46264)

Agus Zubiaga created

We will now properly handle `\ No newline at the end of file` after
context lines and repeated occurrences. Also, when line numbers are
present in hunk headers, we will use them to disambiguate the location
when the context is not unique enough.

Release Notes:

- N/A

Change summary

crates/edit_prediction/src/udiff.rs | 296 +++++++++++++++++++++++++++---
1 file changed, 258 insertions(+), 38 deletions(-)

Detailed changes

crates/edit_prediction/src/udiff.rs 🔗

@@ -230,6 +230,26 @@ pub fn strip_diff_metadata(diff: &str) -> String {
     result
 }
 
+/// Given multiple candidate offsets where context matches, use line numbers to disambiguate.
+/// Returns the offset that matches the expected line, or None if no match or no line number available.
+fn disambiguate_by_line_number(
+    candidates: &[usize],
+    expected_line: Option<u32>,
+    offset_to_line: impl Fn(usize) -> u32,
+) -> Option<usize> {
+    match candidates.len() {
+        0 => None,
+        1 => Some(candidates[0]),
+        _ => {
+            let expected = expected_line?;
+            candidates
+                .iter()
+                .copied()
+                .find(|&offset| offset_to_line(offset) == expected)
+        }
+    }
+}
+
 pub fn apply_diff_to_string(diff_str: &str, text: &str) -> Result<String> {
     let mut diff = DiffParser::new(diff_str);
 
@@ -242,9 +262,18 @@ pub fn apply_diff_to_string(diff_str: &str, text: &str) -> Result<String> {
                 path: _,
                 is_new_file: _,
             } => {
-                let hunk_offset = text
-                    .find(&hunk.context)
+                // Find all matches of the context in the text
+                let candidates: Vec<usize> = text
+                    .match_indices(&hunk.context)
+                    .map(|(offset, _)| offset)
+                    .collect();
+
+                let hunk_offset =
+                    disambiguate_by_line_number(&candidates, hunk.start_line, |offset| {
+                        text[..offset].matches('\n').count() as u32
+                    })
                     .ok_or_else(|| anyhow!("couldn't resolve hunk {:?}", hunk.context))?;
+
                 for edit in hunk.edits.iter().rev() {
                     let range = (hunk_offset + edit.range.start)..(hunk_offset + edit.range.end);
                     text.replace_range(range, &edit.text);
@@ -276,17 +305,20 @@ pub fn edits_for_diff(content: &str, diff_str: &str) -> Result<Vec<(Range<usize>
                     return Ok(Vec::new());
                 }
 
-                // Find the context in the content
-                let first_match = content.find(&hunk.context);
-                let Some(context_offset) = first_match else {
+                // Find all matches of the context in the content
+                let candidates: Vec<usize> = content
+                    .match_indices(&hunk.context)
+                    .map(|(offset, _)| offset)
+                    .collect();
+
+                let Some(context_offset) =
+                    disambiguate_by_line_number(&candidates, hunk.start_line, |offset| {
+                        content[..offset].matches('\n').count() as u32
+                    })
+                else {
                     return Ok(Vec::new());
                 };
 
-                // Check for ambiguity - if context appears more than once, reject
-                if content[context_offset + 1..].contains(&hunk.context) {
-                    return Ok(Vec::new());
-                }
-
                 // Use sub-line diffing to find precise edit positions
                 for edit in &hunk.edits {
                     let old_text = &content
@@ -316,6 +348,18 @@ struct DiffParser<'a> {
     current_line: Option<(&'a str, DiffLine<'a>)>,
     hunk: Hunk,
     diff: std::str::Lines<'a>,
+    pending_start_line: Option<u32>,
+    processed_no_newline: bool,
+    last_diff_op: LastDiffOp,
+}
+
+#[derive(Clone, Copy, Default)]
+enum LastDiffOp {
+    #[default]
+    None,
+    Context,
+    Deletion,
+    Addition,
 }
 
 #[derive(Debug, PartialEq)]
@@ -334,6 +378,7 @@ enum DiffEvent<'a> {
 struct Hunk {
     context: String,
     edits: Vec<Edit>,
+    start_line: Option<u32>,
 }
 
 impl Hunk {
@@ -357,6 +402,9 @@ impl<'a> DiffParser<'a> {
             hunk: Hunk::default(),
             current_line,
             diff,
+            pending_start_line: None,
+            processed_no_newline: false,
+            last_diff_op: LastDiffOp::None,
         }
     }
 
@@ -378,9 +426,13 @@ impl<'a> DiffParser<'a> {
                     } else {
                         file.old_path.clone()
                     };
+                    let mut hunk = mem::take(&mut self.hunk);
+                    hunk.start_line = self.pending_start_line.take();
+                    self.processed_no_newline = false;
+                    self.last_diff_op = LastDiffOp::None;
                     return Ok(Some(DiffEvent::Hunk {
                         path,
-                        hunk: mem::take(&mut self.hunk),
+                        hunk,
                         is_new_file,
                     }));
                 }
@@ -415,10 +467,15 @@ impl<'a> DiffParser<'a> {
                             current_file.new_path = path
                         }
                     }
-                    DiffLine::HunkHeader(_) => {}
+                    DiffLine::HunkHeader(location) => {
+                        if let Some(loc) = location {
+                            self.pending_start_line = Some(loc.start_line_old);
+                        }
+                    }
                     DiffLine::Context(ctx) => {
                         if self.current_file.is_some() {
                             writeln!(&mut self.hunk.context, "{ctx}")?;
+                            self.last_diff_op = LastDiffOp::Context;
                         }
                     }
                     DiffLine::Deletion(del) => {
@@ -436,6 +493,7 @@ impl<'a> DiffParser<'a> {
                                 });
                             }
                             writeln!(&mut self.hunk.context, "{del}")?;
+                            self.last_diff_op = LastDiffOp::Deletion;
                         }
                     }
                     DiffLine::Addition(add) => {
@@ -451,23 +509,31 @@ impl<'a> DiffParser<'a> {
                                     text: format!("{add}\n"),
                                 });
                             }
+                            self.last_diff_op = LastDiffOp::Addition;
                         }
                     }
                     DiffLine::NoNewlineAtEOF => {
-                        if let Some(last_edit) = self.hunk.edits.last_mut() {
-                            if last_edit.text.ends_with('\n') {
-                                // Previous line was an addition (has trailing newline in text)
-                                last_edit.text.pop();
-                            } else if !last_edit.range.is_empty()
-                                && last_edit.range.end == self.hunk.context.len()
-                            {
-                                // Previous line was a deletion (non-empty range at end of context)
-                                self.hunk.context.pop();
-                                last_edit.range.end -= 1;
+                        if !self.processed_no_newline {
+                            self.processed_no_newline = true;
+                            match self.last_diff_op {
+                                LastDiffOp::Addition => {
+                                    // Remove trailing newline from the last addition
+                                    if let Some(last_edit) = self.hunk.edits.last_mut() {
+                                        last_edit.text.pop();
+                                    }
+                                }
+                                LastDiffOp::Deletion => {
+                                    // Remove trailing newline from context (which includes the deletion)
+                                    self.hunk.context.pop();
+                                    if let Some(last_edit) = self.hunk.edits.last_mut() {
+                                        last_edit.range.end -= 1;
+                                    }
+                                }
+                                LastDiffOp::Context | LastDiffOp::None => {
+                                    // Remove trailing newline from context
+                                    self.hunk.context.pop();
+                                }
                             }
-                        } else {
-                            // Previous line was context (no edits)
-                            self.hunk.context.pop();
                         }
                     }
                     DiffLine::Garbage(_) => {}
@@ -491,27 +557,32 @@ fn resolve_hunk_edits_in_buffer(
     is_new_file: bool,
 ) -> Result<impl Iterator<Item = (Range<Anchor>, Arc<str>)>, anyhow::Error> {
     let context_offset = if is_new_file || hunk.context.is_empty() {
-        Ok(0)
+        0
     } else {
-        let mut offset = None;
+        let mut candidates: Vec<usize> = Vec::new();
         for range in ranges {
             let range = range.to_offset(buffer);
             let text = buffer.text_for_range(range.clone()).collect::<String>();
             for (ix, _) in text.match_indices(&hunk.context) {
-                if offset.is_some() {
-                    anyhow::bail!("Context is not unique enough:\n{}", hunk.context);
-                }
-                offset = Some(range.start + ix);
+                candidates.push(range.start + ix);
             }
         }
-        offset.ok_or_else(|| {
-            anyhow!(
-                "Failed to match context:\n\n```\n{}```\n\nBuffer contents:\n\n```\n{}```",
-                hunk.context,
-                buffer.text()
-            )
+
+        disambiguate_by_line_number(&candidates, hunk.start_line, |offset| {
+            buffer.offset_to_point(offset).row
         })
-    }?;
+        .ok_or_else(|| {
+            if candidates.is_empty() {
+                anyhow!(
+                    "Failed to match context:\n\n```\n{}```\n\nBuffer contents:\n\n```\n{}```",
+                    hunk.context,
+                    buffer.text()
+                )
+            } else {
+                anyhow!("Context is not unique enough:\n{}", hunk.context)
+            }
+        })?
+    };
     let iter = hunk.edits.into_iter().flat_map(move |edit| {
         let old_text = buffer
             .text_for_range(context_offset + edit.range.start..context_offset + edit.range.end)
@@ -878,6 +949,7 @@ mod tests {
                             range: 4..4,
                             text: "AND\n".into()
                         }],
+                        start_line: None,
                     },
                     is_new_file: false,
                 },
@@ -928,6 +1000,7 @@ mod tests {
                             range: 80..203,
                             text: "".into()
                         }],
+                        start_line: Some(54), // @@ -55,7 -> line 54 (0-indexed)
                     },
                     is_new_file: false,
                 },
@@ -965,6 +1038,91 @@ mod tests {
                             range: 8..16,
                             text: "added line".into()
                         }],
+                        start_line: Some(0), // @@ -1,2 -> line 0 (0-indexed)
+                    },
+                    is_new_file: false,
+                },
+                DiffEvent::FileEnd { renamed_to: None }
+            ],
+        );
+    }
+
+    #[test]
+    fn test_double_no_newline_at_eof() {
+        // Two consecutive "no newline" markers - the second should be ignored
+        let diff = indoc! {"
+            --- a/file.txt
+            +++ b/file.txt
+            @@ -1,3 +1,3 @@
+             line1
+            -old
+            +new
+             line3
+            \\ No newline at end of file
+            \\ No newline at end of file
+        "};
+
+        let mut events = Vec::new();
+        let mut parser = DiffParser::new(diff);
+        while let Some(event) = parser.next().unwrap() {
+            events.push(event);
+        }
+
+        assert_eq!(
+            events,
+            &[
+                DiffEvent::Hunk {
+                    path: "file.txt".into(),
+                    hunk: Hunk {
+                        context: "line1\nold\nline3".into(), // Only one newline removed
+                        edits: vec![Edit {
+                            range: 6..10, // "old\n" is 4 bytes
+                            text: "new\n".into()
+                        }],
+                        start_line: Some(0),
+                    },
+                    is_new_file: false,
+                },
+                DiffEvent::FileEnd { renamed_to: None }
+            ],
+        );
+    }
+
+    #[test]
+    fn test_no_newline_after_context_not_addition() {
+        // "No newline" after context lines should remove newline from context,
+        // not from an earlier addition
+        let diff = indoc! {"
+            --- a/file.txt
+            +++ b/file.txt
+            @@ -1,4 +1,4 @@
+             line1
+            -old
+            +new
+             line3
+             line4
+            \\ No newline at end of file
+        "};
+
+        let mut events = Vec::new();
+        let mut parser = DiffParser::new(diff);
+        while let Some(event) = parser.next().unwrap() {
+            events.push(event);
+        }
+
+        assert_eq!(
+            events,
+            &[
+                DiffEvent::Hunk {
+                    path: "file.txt".into(),
+                    hunk: Hunk {
+                        // newline removed from line4 (context), not from "new" (addition)
+                        context: "line1\nold\nline3\nline4".into(),
+                        edits: vec![Edit {
+                            range: 6..10,         // "old\n" is 4 bytes
+                            text: "new\n".into()  // Still has newline
+                        }],
+                        start_line: Some(0),
                     },
                     is_new_file: false,
                 },
@@ -973,6 +1131,68 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_line_number_disambiguation() {
+        // Test that line numbers from hunk headers are used to disambiguate
+        // when context before the operation appears multiple times
+        let content = indoc! {"
+            repeated line
+            first unique
+            repeated line
+            second unique
+        "};
+
+        // Context "repeated line" appears twice - line number selects first occurrence
+        let diff = indoc! {"
+            --- a/file.txt
+            +++ b/file.txt
+            @@ -1,2 +1,2 @@
+             repeated line
+            -first unique
+            +REPLACED
+        "};
+
+        let result = edits_for_diff(content, diff).unwrap();
+        assert_eq!(result.len(), 1);
+
+        // The edit should replace "first unique" (after first "repeated line\n" at offset 14)
+        let (range, text) = &result[0];
+        assert_eq!(range.start, 14);
+        assert_eq!(range.end, 26); // "first unique" is 12 bytes
+        assert_eq!(text, "REPLACED");
+    }
+
+    #[test]
+    fn test_line_number_disambiguation_second_match() {
+        // Test disambiguation when the edit should apply to a later occurrence
+        let content = indoc! {"
+            repeated line
+            first unique
+            repeated line
+            second unique
+        "};
+
+        // Context "repeated line" appears twice - line number selects second occurrence
+        let diff = indoc! {"
+            --- a/file.txt
+            +++ b/file.txt
+            @@ -3,2 +3,2 @@
+             repeated line
+            -second unique
+            +REPLACED
+        "};
+
+        let result = edits_for_diff(content, diff).unwrap();
+        assert_eq!(result.len(), 1);
+
+        // The edit should replace "second unique" (after second "repeated line\n")
+        // Offset: "repeated line\n" (14) + "first unique\n" (13) + "repeated line\n" (14) = 41
+        let (range, text) = &result[0];
+        assert_eq!(range.start, 41);
+        assert_eq!(range.end, 54); // "second unique" is 13 bytes
+        assert_eq!(text, "REPLACED");
+    }
+
     #[gpui::test]
     async fn test_apply_diff_successful(cx: &mut TestAppContext) {
         let fs = init_test(cx);