// word_diff.rs

  1//! Word-diff utilities for converting unified diffs to word-diff format.
  2
  3use similar::{DiffTag, TextDiff};
  4
  5/// Convert unified diff to word-diff format.
  6///
  7/// This transforms line-based diffs into word-level diffs where:
  8/// - Deletions are marked with `[-...-]`
  9/// - Insertions are marked with `{+...+}`
 10pub fn unified_to_word_diff(unified_diff: &str) -> String {
 11    let lines: Vec<&str> = unified_diff.lines().collect();
 12    let mut result = String::new();
 13    let mut old_lines: Vec<&str> = Vec::new();
 14    let mut new_lines: Vec<&str> = Vec::new();
 15
 16    let flush_changes =
 17        |old_lines: &mut Vec<&str>, new_lines: &mut Vec<&str>, result: &mut String| {
 18            if old_lines.is_empty() && new_lines.is_empty() {
 19                return;
 20            }
 21
 22            // Strip the leading '-' or '+' from each line
 23            let old_text: String = old_lines
 24                .iter()
 25                .map(|line| if line.len() > 1 { &line[1..] } else { "" })
 26                .collect::<Vec<_>>()
 27                .join("\n");
 28
 29            let new_text: String = new_lines
 30                .iter()
 31                .map(|line| if line.len() > 1 { &line[1..] } else { "" })
 32                .collect::<Vec<_>>()
 33                .join("\n");
 34
 35            if !old_text.is_empty() || !new_text.is_empty() {
 36                let word_diff = compute_word_diff(&old_text, &new_text);
 37                result.push_str(&word_diff);
 38            }
 39
 40            old_lines.clear();
 41            new_lines.clear();
 42        };
 43
 44    for line in lines {
 45        if line.starts_with("---") || line.starts_with("+++") {
 46            flush_changes(&mut old_lines, &mut new_lines, &mut result);
 47            result.push_str(line);
 48            result.push('\n');
 49        } else if line.starts_with("@@") {
 50            flush_changes(&mut old_lines, &mut new_lines, &mut result);
 51            result.push_str(line);
 52            result.push('\n');
 53        } else if line.starts_with('-') {
 54            old_lines.push(line);
 55        } else if line.starts_with('+') {
 56            new_lines.push(line);
 57        } else if line.starts_with(' ') || line.is_empty() {
 58            flush_changes(&mut old_lines, &mut new_lines, &mut result);
 59            result.push_str(line);
 60            result.push('\n');
 61        } else {
 62            // Header lines (diff --git, index, etc.)
 63            flush_changes(&mut old_lines, &mut new_lines, &mut result);
 64            result.push_str(line);
 65            result.push('\n');
 66        }
 67    }
 68
 69    flush_changes(&mut old_lines, &mut new_lines, &mut result);
 70    result
 71}
 72
 73/// Compute word-level diff between two text blocks.
 74///
 75/// Words and whitespace are treated as separate tokens. The output uses:
 76/// - `[-...-]` for deleted content
 77/// - `{+...+}` for inserted content
 78fn compute_word_diff(old_text: &str, new_text: &str) -> String {
 79    // Split into words while preserving whitespace
 80    let old_words = tokenize(old_text);
 81    let new_words = tokenize(new_text);
 82
 83    let ops = diff_tokens(&old_words, &new_words);
 84    let mut result = String::new();
 85
 86    for op in ops {
 87        match op {
 88            DiffOp::Equal(start, end) => {
 89                for token in &old_words[start..end] {
 90                    result.push_str(token);
 91                }
 92            }
 93            DiffOp::Delete(start, end) => {
 94                result.push_str("[-");
 95                for token in &old_words[start..end] {
 96                    result.push_str(token);
 97                }
 98                result.push_str("-]");
 99            }
100            DiffOp::Insert(start, end) => {
101                result.push_str("{+");
102                for token in &new_words[start..end] {
103                    result.push_str(token);
104                }
105                result.push_str("+}");
106            }
107            DiffOp::Replace {
108                old_start,
109                old_end,
110                new_start,
111                new_end,
112            } => {
113                result.push_str("[-");
114                for token in &old_words[old_start..old_end] {
115                    result.push_str(token);
116                }
117                result.push_str("-]");
118                result.push_str("{+");
119                for token in &new_words[new_start..new_end] {
120                    result.push_str(token);
121                }
122                result.push_str("+}");
123            }
124        }
125    }
126
127    if !result.is_empty() && !result.ends_with('\n') {
128        result.push('\n');
129    }
130
131    result
132}
133
/// Classify a character into one of three token classes:
/// - 0: identifier (alphanumeric or `_`)
/// - 1: whitespace
/// - 2: punctuation (everything else, each character becomes its own token)
fn char_class(ch: char) -> u8 {
    match ch {
        c if c.is_alphanumeric() || c == '_' => 0,
        c if c.is_whitespace() => 1,
        _ => 2,
    }
}
147
/// Tokenize text into identifier words, whitespace runs, and individual punctuation characters.
///
/// This splitting aligns with the syntactic atoms of source code so that the
/// LCS-based diff can produce fine-grained, meaningful change regions.
pub(crate) fn tokenize(text: &str) -> Vec<&str> {
    // Token classes: 0 = identifier char, 1 = whitespace, 2 = punctuation.
    // Identifier and whitespace runs coalesce; punctuation never does.
    let classify = |c: char| -> u8 {
        if c.is_alphanumeric() || c == '_' {
            0
        } else if c.is_whitespace() {
            1
        } else {
            2
        }
    };

    let mut tokens = Vec::new();
    // (byte offset, class) of the currently open run, if any.
    let mut run: Option<(usize, u8)> = None;

    for (idx, ch) in text.char_indices() {
        let class = classify(ch);
        match run {
            // Same coalescible class: keep extending the current run.
            Some((_, open)) if open == class && class != 2 => {}
            // Class change (or punctuation, which is always one char per
            // token): close the open run and start a new one here.
            Some((start, _)) => {
                tokens.push(&text[start..idx]);
                run = Some((idx, class));
            }
            None => run = Some((idx, class)),
        }
    }

    // Close the final run, if the text was non-empty.
    if let Some((start, _)) = run {
        tokens.push(&text[start..]);
    }

    tokens
}
178
/// A single edit operation over two token sequences.
///
/// All range fields are half-open `[start, end)` token indices: into the
/// *old* sequence for `Equal` and `Delete`, the *new* sequence for `Insert`,
/// and both sequences for `Replace` (see how `diff_tokens` fills them from
/// `op.old_range()` / `op.new_range()`).
#[derive(Debug)]
pub(crate) enum DiffOp {
    /// Tokens `old[start..end]` are unchanged.
    Equal(usize, usize),
    /// Tokens `old[start..end]` were removed.
    Delete(usize, usize),
    /// Tokens `new[start..end]` were added.
    Insert(usize, usize),
    /// Tokens `old[old_start..old_end]` were replaced by
    /// `new[new_start..new_end]`.
    Replace {
        old_start: usize,
        old_end: usize,
        new_start: usize,
        new_end: usize,
    },
}
191
192/// Compute diff operations between two token sequences using `similar`'s Myers diff.
193pub(crate) fn diff_tokens<'a>(old: &[&'a str], new: &[&'a str]) -> Vec<DiffOp> {
194    let diff = TextDiff::from_slices(old, new);
195    diff.ops()
196        .iter()
197        .map(|op| {
198            let tag = op.tag();
199            let old_range = op.old_range();
200            let new_range = op.new_range();
201            match tag {
202                DiffTag::Equal => DiffOp::Equal(old_range.start, old_range.end),
203                DiffTag::Delete => DiffOp::Delete(old_range.start, old_range.end),
204                DiffTag::Insert => DiffOp::Insert(new_range.start, new_range.end),
205                DiffTag::Replace => DiffOp::Replace {
206                    old_start: old_range.start,
207                    old_end: old_range.end,
208                    new_start: new_range.start,
209                    new_end: new_range.end,
210                },
211            }
212        })
213        .collect()
214}
215
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize() {
        // Table-driven: (input, expected token sequence).
        let cases: &[(&str, &[&str])] = &[
            ("hello world", &["hello", " ", "world"]),
            ("  multiple   spaces  ", &["  ", "multiple", "   ", "spaces", "  "]),
            ("self.name", &["self", ".", "name"]),
            ("foo(bar, baz)", &["foo", "(", "bar", ",", " ", "baz", ")"]),
            ("hello_world", &["hello_world"]),
            ("fn();", &["fn", "(", ")", ";"]),
            ("foo_bar123 + baz", &["foo_bar123", " ", "+", " ", "baz"]),
            ("print(\"hello\")", &["print", "(", "\"", "hello", "\"", ")"]),
        ];

        for (input, expected) in cases {
            assert_eq!(tokenize(input), *expected, "tokenizing {:?}", input);
        }
    }

    #[test]
    fn test_compute_word_diff_simple() {
        let diffed = compute_word_diff("hello world", "hello there");
        assert!(diffed.contains("[-world-]"));
        assert!(diffed.contains("{+there+}"));
    }

    #[test]
    fn test_unified_to_word_diff() {
        let unified = "\
--- a/file.txt
+++ b/file.txt
@@ -1,3 +1,3 @@
 context line
-old text here
+new text here
 more context";

        let result = unified_to_word_diff(unified);
        // Headers and hunk markers must pass through untouched.
        assert!(result.contains("--- a/file.txt"));
        assert!(result.contains("+++ b/file.txt"));
        assert!(result.contains("@@"));
    }
}