word_diff.rs

//! Word-diff utilities for converting unified diffs to word-diff format.

/// Convert a unified diff to word-diff format.
///
/// This transforms line-based diffs into word-level diffs where:
/// - Deletions are marked with `[-...-]`
/// - Insertions are marked with `{+...+}`
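///
/// # Examples
///
/// A minimal usage sketch (marked `ignore` because the crate path is an
/// assumption; adjust the `use` line to the real crate name):
///
/// ```ignore
/// // Hypothetical crate name `word_diff`, used only for illustration.
/// use word_diff::unified_to_word_diff;
///
/// let unified = "@@ -1 +1 @@\n-old text\n+new text";
/// let result = unified_to_word_diff(unified);
/// assert!(result.contains("[-old-]"));
/// assert!(result.contains("{+new+}"));
/// ```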
pub fn unified_to_word_diff(unified_diff: &str) -> String {
    let mut result = String::new();
    let mut old_lines: Vec<&str> = Vec::new();
    let mut new_lines: Vec<&str> = Vec::new();

    let flush_changes =
        |old_lines: &mut Vec<&str>, new_lines: &mut Vec<&str>, result: &mut String| {
            if old_lines.is_empty() && new_lines.is_empty() {
                return;
            }

            // Strip the leading '-' or '+' from each line
            let old_text: String = old_lines
                .iter()
                .map(|line| if line.len() > 1 { &line[1..] } else { "" })
                .collect::<Vec<_>>()
                .join("\n");

            let new_text: String = new_lines
                .iter()
                .map(|line| if line.len() > 1 { &line[1..] } else { "" })
                .collect::<Vec<_>>()
                .join("\n");

            if !old_text.is_empty() || !new_text.is_empty() {
                let word_diff = compute_word_diff(&old_text, &new_text);
                result.push_str(&word_diff);
            }

            old_lines.clear();
            new_lines.clear();
        };

    for line in unified_diff.lines() {
        if line.starts_with("---") || line.starts_with("+++") || line.starts_with("@@") {
            // File headers and hunk headers pass through unchanged.
            flush_changes(&mut old_lines, &mut new_lines, &mut result);
            result.push_str(line);
            result.push('\n');
        } else if line.starts_with('-') {
            old_lines.push(line);
        } else if line.starts_with('+') {
            new_lines.push(line);
        } else {
            // Context lines, blank lines, and other headers
            // (diff --git, index, etc.) also pass through unchanged.
            flush_changes(&mut old_lines, &mut new_lines, &mut result);
            result.push_str(line);
            result.push('\n');
        }
    }

    flush_changes(&mut old_lines, &mut new_lines, &mut result);
    result
}

/// Compute a word-level diff between two text blocks.
///
/// Words and whitespace runs are treated as separate tokens. The output uses:
/// - `[-...-]` for deleted content
/// - `{+...+}` for inserted content
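///
/// For example, diffing `"a b c"` against `"a x c"` yields output along the
/// lines of `a [-b-]{+x+} c` followed by a trailing newline (illustrative of
/// this implementation, not a stable format guarantee).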
fn compute_word_diff(old_text: &str, new_text: &str) -> String {
    // Split into words while preserving whitespace
    let old_words = tokenize(old_text);
    let new_words = tokenize(new_text);

    let ops = diff_tokens(&old_words, &new_words);
    let mut result = String::new();

    for op in ops {
        match op {
            DiffOp::Equal(start, end) => {
                for token in &old_words[start..end] {
                    result.push_str(token);
                }
            }
            DiffOp::Delete(start, end) => {
                result.push_str("[-");
                for token in &old_words[start..end] {
                    result.push_str(token);
                }
                result.push_str("-]");
            }
            DiffOp::Insert(start, end) => {
                result.push_str("{+");
                for token in &new_words[start..end] {
                    result.push_str(token);
                }
                result.push_str("+}");
            }
            DiffOp::Replace {
                old_start,
                old_end,
                new_start,
                new_end,
            } => {
                result.push_str("[-");
                for token in &old_words[old_start..old_end] {
                    result.push_str(token);
                }
                result.push_str("-]");
                result.push_str("{+");
                for token in &new_words[new_start..new_end] {
                    result.push_str(token);
                }
                result.push_str("+}");
            }
        }
    }

    if !result.is_empty() && !result.ends_with('\n') {
        result.push('\n');
    }

    result
}

/// Tokenize text into words and whitespace sequences.
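///
/// For example, `"foo  bar"` would split into `["foo", "  ", "bar"]`
/// (consistent with the `test_tokenize` case below).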
fn tokenize(text: &str) -> Vec<&str> {
    let mut tokens = Vec::new();
    let mut chars = text.char_indices().peekable();

    while let Some((start, ch)) = chars.next() {
        if ch.is_whitespace() {
            // Collect contiguous whitespace
            let mut end = start + ch.len_utf8();
            while let Some(&(_, next_ch)) = chars.peek() {
                if next_ch.is_whitespace() {
                    end += next_ch.len_utf8();
                    chars.next();
                } else {
                    break;
                }
            }
            tokens.push(&text[start..end]);
        } else {
            // Collect contiguous non-whitespace
            let mut end = start + ch.len_utf8();
            while let Some(&(_, next_ch)) = chars.peek() {
                if !next_ch.is_whitespace() {
                    end += next_ch.len_utf8();
                    chars.next();
                } else {
                    break;
                }
            }
            tokens.push(&text[start..end]);
        }
    }

    tokens
}

/// A single edit operation over the token sequences. Index pairs are
/// half-open ranges: `Equal` and `Delete` index into the old tokens,
/// `Insert` indexes into the new tokens, and `Replace` carries both.
#[derive(Debug)]
enum DiffOp {
    Equal(usize, usize),
    Delete(usize, usize),
    Insert(usize, usize),
    Replace {
        old_start: usize,
        old_end: usize,
        new_start: usize,
        new_end: usize,
    },
}

/// Compute diff operations between two token sequences using a simple LCS-based algorithm.
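///
/// For example, diffing `["a", "b"]` against `["a", "c"]` is expected to
/// produce roughly `[Equal(0, 1), Replace { old_start: 1, old_end: 2,
/// new_start: 1, new_end: 2 }]`; the exact grouping is an implementation
/// detail.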
fn diff_tokens(old: &[&str], new: &[&str]) -> Vec<DiffOp> {
    let m = old.len();
    let n = new.len();

    if m == 0 && n == 0 {
        return vec![];
    }
    if m == 0 {
        return vec![DiffOp::Insert(0, n)];
    }
    if n == 0 {
        return vec![DiffOp::Delete(0, m)];
    }

    // LCS dynamic programming table: dp[i][j] is the LCS length of
    // old[..i] and new[..j].
    let mut dp = vec![vec![0usize; n + 1]; m + 1];
    for i in 1..=m {
        for j in 1..=n {
            if old[i - 1] == new[j - 1] {
                dp[i][j] = dp[i - 1][j - 1] + 1;
            } else {
                dp[i][j] = dp[i - 1][j].max(dp[i][j - 1]);
            }
        }
    }

    // Backtrack through the table to collect matched (old, new) index pairs.
    // They are found back-to-front, so reverse at the end.
    let mut matches: Vec<(usize, usize)> = Vec::new();
    let mut i = m;
    let mut j = n;
    while i > 0 && j > 0 {
        if old[i - 1] == new[j - 1] {
            matches.push((i - 1, j - 1));
            i -= 1;
            j -= 1;
        } else if dp[i - 1][j] >= dp[i][j - 1] {
            i -= 1;
        } else {
            j -= 1;
        }
    }
    matches.reverse();

    // Walk forward through the matches, grouping consecutive matched tokens
    // into Equal runs and emitting Delete/Insert/Replace for the gaps.
    let mut ops = Vec::new();
    let mut old_idx = 0;
    let mut new_idx = 0;
    let mut k = 0;

    while k < matches.len() {
        let (om, nm) = matches[k];

        // Handle any deletions/insertions before the match
        if old_idx < om && new_idx < nm {
            ops.push(DiffOp::Replace {
                old_start: old_idx,
                old_end: om,
                new_start: new_idx,
                new_end: nm,
            });
        } else if old_idx < om {
            ops.push(DiffOp::Delete(old_idx, om));
        } else if new_idx < nm {
            ops.push(DiffOp::Insert(new_idx, nm));
        }

        // Find the extent of the equal sequence
        let mut eq_end_old = om;
        let mut eq_end_new = nm;
        while k < matches.len() && matches[k] == (eq_end_old, eq_end_new) {
            eq_end_old += 1;
            eq_end_new += 1;
            k += 1;
        }

        ops.push(DiffOp::Equal(om, eq_end_old));
        old_idx = eq_end_old;
        new_idx = eq_end_new;
    }

    // Handle anything remaining after the last match
    if old_idx < m && new_idx < n {
        ops.push(DiffOp::Replace {
            old_start: old_idx,
            old_end: m,
            new_start: new_idx,
            new_end: n,
        });
    } else if old_idx < m {
        ops.push(DiffOp::Delete(old_idx, m));
    } else if new_idx < n {
        ops.push(DiffOp::Insert(new_idx, n));
    }

    ops
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize() {
        let tokens = tokenize("hello world");
        assert_eq!(tokens, vec!["hello", " ", "world"]);

        let tokens = tokenize("  multiple   spaces  ");
        assert_eq!(tokens, vec!["  ", "multiple", "   ", "spaces", "  "]);
    }

    #[test]
    fn test_compute_word_diff_simple() {
        let result = compute_word_diff("hello world", "hello there");
        assert!(result.contains("[-world-]"));
        assert!(result.contains("{+there+}"));
    }

    #[test]
    fn test_unified_to_word_diff() {
        let unified = "\
--- a/file.txt
+++ b/file.txt
@@ -1,3 +1,3 @@
 context line
-old text here
+new text here
 more context";

        let result = unified_to_word_diff(unified);
        assert!(result.contains("--- a/file.txt"));
        assert!(result.contains("+++ b/file.txt"));
        assert!(result.contains("@@"));
    }
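
    // Added sketch: exercises a word-level replacement with surrounding
    // context. Only containment is asserted, since the exact marker
    // placement is an implementation detail of compute_word_diff.
    #[test]
    fn test_compute_word_diff_keeps_context_words() {
        let result = compute_word_diff("keep this word", "keep that word");
        assert!(result.contains("keep"));
        assert!(result.contains("word"));
        assert!(result.contains("[-this-]"));
        assert!(result.contains("{+that+}"));
    }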
}