1//! Word-diff utilities for converting unified diffs to word-diff format.
2
3use similar::{DiffTag, TextDiff};
4
5/// Convert unified diff to word-diff format.
6///
7/// This transforms line-based diffs into word-level diffs where:
8/// - Deletions are marked with `[-...-]`
9/// - Insertions are marked with `{+...+}`
10pub fn unified_to_word_diff(unified_diff: &str) -> String {
11 let lines: Vec<&str> = unified_diff.lines().collect();
12 let mut result = String::new();
13 let mut old_lines: Vec<&str> = Vec::new();
14 let mut new_lines: Vec<&str> = Vec::new();
15
16 let flush_changes =
17 |old_lines: &mut Vec<&str>, new_lines: &mut Vec<&str>, result: &mut String| {
18 if old_lines.is_empty() && new_lines.is_empty() {
19 return;
20 }
21
22 // Strip the leading '-' or '+' from each line
23 let old_text: String = old_lines
24 .iter()
25 .map(|line| if line.len() > 1 { &line[1..] } else { "" })
26 .collect::<Vec<_>>()
27 .join("\n");
28
29 let new_text: String = new_lines
30 .iter()
31 .map(|line| if line.len() > 1 { &line[1..] } else { "" })
32 .collect::<Vec<_>>()
33 .join("\n");
34
35 if !old_text.is_empty() || !new_text.is_empty() {
36 let word_diff = compute_word_diff(&old_text, &new_text);
37 result.push_str(&word_diff);
38 }
39
40 old_lines.clear();
41 new_lines.clear();
42 };
43
44 for line in lines {
45 if line.starts_with("---") || line.starts_with("+++") {
46 flush_changes(&mut old_lines, &mut new_lines, &mut result);
47 result.push_str(line);
48 result.push('\n');
49 } else if line.starts_with("@@") {
50 flush_changes(&mut old_lines, &mut new_lines, &mut result);
51 result.push_str(line);
52 result.push('\n');
53 } else if line.starts_with('-') {
54 old_lines.push(line);
55 } else if line.starts_with('+') {
56 new_lines.push(line);
57 } else if line.starts_with(' ') || line.is_empty() {
58 flush_changes(&mut old_lines, &mut new_lines, &mut result);
59 result.push_str(line);
60 result.push('\n');
61 } else {
62 // Header lines (diff --git, index, etc.)
63 flush_changes(&mut old_lines, &mut new_lines, &mut result);
64 result.push_str(line);
65 result.push('\n');
66 }
67 }
68
69 flush_changes(&mut old_lines, &mut new_lines, &mut result);
70 result
71}
72
73/// Compute word-level diff between two text blocks.
74///
75/// Words and whitespace are treated as separate tokens. The output uses:
76/// - `[-...-]` for deleted content
77/// - `{+...+}` for inserted content
78fn compute_word_diff(old_text: &str, new_text: &str) -> String {
79 // Split into words while preserving whitespace
80 let old_words = tokenize(old_text);
81 let new_words = tokenize(new_text);
82
83 let ops = diff_tokens(&old_words, &new_words);
84 let mut result = String::new();
85
86 for op in ops {
87 match op {
88 DiffOp::Equal {
89 old_start, old_end, ..
90 } => {
91 for token in &old_words[old_start..old_end] {
92 result.push_str(token);
93 }
94 }
95 DiffOp::Delete(start, end) => {
96 result.push_str("[-");
97 for token in &old_words[start..end] {
98 result.push_str(token);
99 }
100 result.push_str("-]");
101 }
102 DiffOp::Insert(start, end) => {
103 result.push_str("{+");
104 for token in &new_words[start..end] {
105 result.push_str(token);
106 }
107 result.push_str("+}");
108 }
109 DiffOp::Replace {
110 old_start,
111 old_end,
112 new_start,
113 new_end,
114 } => {
115 result.push_str("[-");
116 for token in &old_words[old_start..old_end] {
117 result.push_str(token);
118 }
119 result.push_str("-]");
120 result.push_str("{+");
121 for token in &new_words[new_start..new_end] {
122 result.push_str(token);
123 }
124 result.push_str("+}");
125 }
126 }
127 }
128
129 if !result.is_empty() && !result.ends_with('\n') {
130 result.push('\n');
131 }
132
133 result
134}
135
/// Classify a character into one of three token classes:
/// - `0`: identifier (alphanumeric or `_`)
/// - `1`: whitespace
/// - `2`: punctuation (everything else; each such character becomes its own token)
fn char_class(ch: char) -> u8 {
    match ch {
        c if c.is_alphanumeric() || c == '_' => 0,
        c if c.is_whitespace() => 1,
        _ => 2,
    }
}
149
150/// Tokenize text into identifier words, whitespace runs, and individual punctuation characters.
151///
152/// This splitting aligns with the syntactic atoms of source code so that the
153/// LCS-based diff can produce fine-grained, meaningful change regions.
154pub(crate) fn tokenize(text: &str) -> Vec<&str> {
155 let mut tokens = Vec::new();
156 let mut chars = text.char_indices().peekable();
157
158 while let Some((start, ch)) = chars.next() {
159 let class = char_class(ch);
160 if class == 2 {
161 // Punctuation: each character is a separate token
162 tokens.push(&text[start..start + ch.len_utf8()]);
163 } else {
164 // Identifier or whitespace: collect contiguous run of same class
165 let mut end = start + ch.len_utf8();
166 while let Some(&(_, next_ch)) = chars.peek() {
167 if char_class(next_ch) == class {
168 end += next_ch.len_utf8();
169 chars.next();
170 } else {
171 break;
172 }
173 }
174 tokens.push(&text[start..end]);
175 }
176 }
177
178 tokens
179}
180
/// A single edit operation over token index ranges, as produced by
/// [`diff_tokens`].
///
/// All ranges are half-open (`start..end`) indices into the old and new
/// token slices respectively. The payload is plain `usize` indices, so the
/// type is cheap to `Copy` and comparable with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum DiffOp {
    /// Tokens identical in both sequences.
    Equal {
        old_start: usize,
        old_end: usize,
        new_start: usize,
        new_end: usize,
    },
    /// Tokens present only in the old sequence: `Delete(old_start, old_end)`.
    Delete(usize, usize),
    /// Tokens present only in the new sequence: `Insert(new_start, new_end)`.
    Insert(usize, usize),
    /// Old-range tokens replaced by new-range tokens.
    Replace {
        old_start: usize,
        old_end: usize,
        new_start: usize,
        new_end: usize,
    },
}
198
199/// Compute diff operations between two token sequences using `similar`'s Myers diff.
200pub(crate) fn diff_tokens<'a>(old: &[&'a str], new: &[&'a str]) -> Vec<DiffOp> {
201 let diff = TextDiff::from_slices(old, new);
202 diff.ops()
203 .iter()
204 .map(|op| {
205 let tag = op.tag();
206 let old_range = op.old_range();
207 let new_range = op.new_range();
208 match tag {
209 DiffTag::Equal => DiffOp::Equal {
210 old_start: old_range.start,
211 old_end: old_range.end,
212 new_start: new_range.start,
213 new_end: new_range.end,
214 },
215 DiffTag::Delete => DiffOp::Delete(old_range.start, old_range.end),
216 DiffTag::Insert => DiffOp::Insert(new_range.start, new_range.end),
217 DiffTag::Replace => DiffOp::Replace {
218 old_start: old_range.start,
219 old_end: old_range.end,
220 new_start: new_range.start,
221 new_end: new_range.end,
222 },
223 }
224 })
225 .collect()
226}
227
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize() {
        let tokens = tokenize("hello world");
        assert_eq!(tokens, vec!["hello", " ", "world"]);

        let tokens = tokenize(" multiple spaces ");
        assert_eq!(tokens, vec![" ", "multiple", " ", "spaces", " "]);

        let tokens = tokenize("self.name");
        assert_eq!(tokens, vec!["self", ".", "name"]);

        let tokens = tokenize("foo(bar, baz)");
        assert_eq!(tokens, vec!["foo", "(", "bar", ",", " ", "baz", ")"]);

        let tokens = tokenize("hello_world");
        assert_eq!(tokens, vec!["hello_world"]);

        let tokens = tokenize("fn();");
        assert_eq!(tokens, vec!["fn", "(", ")", ";"]);

        let tokens = tokenize("foo_bar123 + baz");
        assert_eq!(tokens, vec!["foo_bar123", " ", "+", " ", "baz"]);

        let tokens = tokenize("print(\"hello\")");
        assert_eq!(tokens, vec!["print", "(", "\"", "hello", "\"", ")"]);
    }

    #[test]
    fn test_compute_word_diff_simple() {
        // Pin the exact rendering: shared prefix passes through, the single
        // changed word is rendered as a deletion followed by an insertion,
        // and the result is newline-terminated.
        let result = compute_word_diff("hello world", "hello there");
        assert_eq!(result, "hello [-world-]{+there+}\n");
    }

    #[test]
    fn test_unified_to_word_diff() {
        let unified = "\
--- a/file.txt
+++ b/file.txt
@@ -1,3 +1,3 @@
 context line
-old text here
+new text here
 more context";

        let result = unified_to_word_diff(unified);
        // Headers and context pass through unchanged.
        assert!(result.contains("--- a/file.txt"));
        assert!(result.contains("+++ b/file.txt"));
        assert!(result.contains("@@ -1,3 +1,3 @@"));
        assert!(result.contains(" context line"));
        assert!(result.contains(" more context"));
        // The changed line pair must actually be rendered as a word diff.
        assert!(result.contains("[-old-]"));
        assert!(result.contains("{+new+}"));
        assert!(result.contains(" text here"));
    }
}