1//! Word-diff utilities for converting unified diffs to word-diff format.
2
3use similar::{DiffTag, TextDiff};
4
5/// Convert unified diff to word-diff format.
6///
7/// This transforms line-based diffs into word-level diffs where:
8/// - Deletions are marked with `[-...-]`
9/// - Insertions are marked with `{+...+}`
10pub fn unified_to_word_diff(unified_diff: &str) -> String {
11 let lines: Vec<&str> = unified_diff.lines().collect();
12 let mut result = String::new();
13 let mut old_lines: Vec<&str> = Vec::new();
14 let mut new_lines: Vec<&str> = Vec::new();
15
16 let flush_changes =
17 |old_lines: &mut Vec<&str>, new_lines: &mut Vec<&str>, result: &mut String| {
18 if old_lines.is_empty() && new_lines.is_empty() {
19 return;
20 }
21
22 // Strip the leading '-' or '+' from each line
23 let old_text: String = old_lines
24 .iter()
25 .map(|line| if line.len() > 1 { &line[1..] } else { "" })
26 .collect::<Vec<_>>()
27 .join("\n");
28
29 let new_text: String = new_lines
30 .iter()
31 .map(|line| if line.len() > 1 { &line[1..] } else { "" })
32 .collect::<Vec<_>>()
33 .join("\n");
34
35 if !old_text.is_empty() || !new_text.is_empty() {
36 let word_diff = compute_word_diff(&old_text, &new_text);
37 result.push_str(&word_diff);
38 }
39
40 old_lines.clear();
41 new_lines.clear();
42 };
43
44 for line in lines {
45 if line.starts_with("---") || line.starts_with("+++") {
46 flush_changes(&mut old_lines, &mut new_lines, &mut result);
47 result.push_str(line);
48 result.push('\n');
49 } else if line.starts_with("@@") {
50 flush_changes(&mut old_lines, &mut new_lines, &mut result);
51 result.push_str(line);
52 result.push('\n');
53 } else if line.starts_with('-') {
54 old_lines.push(line);
55 } else if line.starts_with('+') {
56 new_lines.push(line);
57 } else if line.starts_with(' ') || line.is_empty() {
58 flush_changes(&mut old_lines, &mut new_lines, &mut result);
59 result.push_str(line);
60 result.push('\n');
61 } else {
62 // Header lines (diff --git, index, etc.)
63 flush_changes(&mut old_lines, &mut new_lines, &mut result);
64 result.push_str(line);
65 result.push('\n');
66 }
67 }
68
69 flush_changes(&mut old_lines, &mut new_lines, &mut result);
70 result
71}
72
73/// Compute word-level diff between two text blocks.
74///
75/// Words and whitespace are treated as separate tokens. The output uses:
76/// - `[-...-]` for deleted content
77/// - `{+...+}` for inserted content
78fn compute_word_diff(old_text: &str, new_text: &str) -> String {
79 // Split into words while preserving whitespace
80 let old_words = tokenize(old_text);
81 let new_words = tokenize(new_text);
82
83 let ops = diff_tokens(&old_words, &new_words);
84 let mut result = String::new();
85
86 for op in ops {
87 match op {
88 DiffOp::Equal(start, end) => {
89 for token in &old_words[start..end] {
90 result.push_str(token);
91 }
92 }
93 DiffOp::Delete(start, end) => {
94 result.push_str("[-");
95 for token in &old_words[start..end] {
96 result.push_str(token);
97 }
98 result.push_str("-]");
99 }
100 DiffOp::Insert(start, end) => {
101 result.push_str("{+");
102 for token in &new_words[start..end] {
103 result.push_str(token);
104 }
105 result.push_str("+}");
106 }
107 DiffOp::Replace {
108 old_start,
109 old_end,
110 new_start,
111 new_end,
112 } => {
113 result.push_str("[-");
114 for token in &old_words[old_start..old_end] {
115 result.push_str(token);
116 }
117 result.push_str("-]");
118 result.push_str("{+");
119 for token in &new_words[new_start..new_end] {
120 result.push_str(token);
121 }
122 result.push_str("+}");
123 }
124 }
125 }
126
127 if !result.is_empty() && !result.ends_with('\n') {
128 result.push('\n');
129 }
130
131 result
132}
133
/// Classify a character into one of three token classes:
/// - 0: identifier (alphanumeric or `_`)
/// - 1: whitespace
/// - 2: punctuation (everything else; each character becomes its own token)
fn char_class(ch: char) -> u8 {
    match ch {
        c if c.is_alphanumeric() || c == '_' => 0,
        c if c.is_whitespace() => 1,
        _ => 2,
    }
}
147
148/// Tokenize text into identifier words, whitespace runs, and individual punctuation characters.
149///
150/// This splitting aligns with the syntactic atoms of source code so that the
151/// LCS-based diff can produce fine-grained, meaningful change regions.
152pub(crate) fn tokenize(text: &str) -> Vec<&str> {
153 let mut tokens = Vec::new();
154 let mut chars = text.char_indices().peekable();
155
156 while let Some((start, ch)) = chars.next() {
157 let class = char_class(ch);
158 if class == 2 {
159 // Punctuation: each character is a separate token
160 tokens.push(&text[start..start + ch.len_utf8()]);
161 } else {
162 // Identifier or whitespace: collect contiguous run of same class
163 let mut end = start + ch.len_utf8();
164 while let Some(&(_, next_ch)) = chars.peek() {
165 if char_class(next_ch) == class {
166 end += next_ch.len_utf8();
167 chars.next();
168 } else {
169 break;
170 }
171 }
172 tokens.push(&text[start..end]);
173 }
174 }
175
176 tokens
177}
178
/// A single edit operation over two token sequences.
///
/// All values are token indices: `Equal` and `Delete` ranges index the *old*
/// sequence, `Insert` ranges index the *new* sequence, and `Replace` carries
/// a half-open range into each.
#[derive(Debug)]
pub(crate) enum DiffOp {
    /// Old tokens `[start..end)` are unchanged (the matching new tokens are identical).
    Equal(usize, usize),
    /// Old tokens `[start..end)` were removed.
    Delete(usize, usize),
    /// New tokens `[start..end)` were added.
    Insert(usize, usize),
    /// Old tokens `[old_start..old_end)` were replaced by new tokens
    /// `[new_start..new_end)`.
    Replace {
        old_start: usize,
        old_end: usize,
        new_start: usize,
        new_end: usize,
    },
}
191
192/// Compute diff operations between two token sequences using `similar`'s Myers diff.
193pub(crate) fn diff_tokens<'a>(old: &[&'a str], new: &[&'a str]) -> Vec<DiffOp> {
194 let diff = TextDiff::from_slices(old, new);
195 diff.ops()
196 .iter()
197 .map(|op| {
198 let tag = op.tag();
199 let old_range = op.old_range();
200 let new_range = op.new_range();
201 match tag {
202 DiffTag::Equal => DiffOp::Equal(old_range.start, old_range.end),
203 DiffTag::Delete => DiffOp::Delete(old_range.start, old_range.end),
204 DiffTag::Insert => DiffOp::Insert(new_range.start, new_range.end),
205 DiffTag::Replace => DiffOp::Replace {
206 old_start: old_range.start,
207 old_end: old_range.end,
208 new_start: new_range.start,
209 new_end: new_range.end,
210 },
211 }
212 })
213 .collect()
214}
215
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize() {
        // Identifier and whitespace runs each collapse to a single token.
        let tokens = tokenize("hello world");
        assert_eq!(tokens, vec!["hello", " ", "world"]);

        // Leading/trailing whitespace runs are preserved as tokens.
        let tokens = tokenize("  multiple   spaces  ");
        assert_eq!(tokens, vec!["  ", "multiple", "   ", "spaces", "  "]);

        // Punctuation splits adjacent identifiers.
        let tokens = tokenize("self.name");
        assert_eq!(tokens, vec!["self", ".", "name"]);

        let tokens = tokenize("foo(bar, baz)");
        assert_eq!(tokens, vec!["foo", "(", "bar", ",", " ", "baz", ")"]);

        // Underscore counts as an identifier character, not punctuation.
        let tokens = tokenize("hello_world");
        assert_eq!(tokens, vec!["hello_world"]);

        // Consecutive punctuation yields one token per character.
        let tokens = tokenize("fn();");
        assert_eq!(tokens, vec!["fn", "(", ")", ";"]);

        let tokens = tokenize("foo_bar123 + baz");
        assert_eq!(tokens, vec!["foo_bar123", " ", "+", " ", "baz"]);

        // Quotes are punctuation, so string delimiters become their own tokens.
        let tokens = tokenize("print(\"hello\")");
        assert_eq!(tokens, vec!["print", "(", "\"", "hello", "\"", ")"]);
    }

    #[test]
    fn test_compute_word_diff_simple() {
        // Only the differing word is marked; the shared prefix stays plain.
        let result = compute_word_diff("hello world", "hello there");
        assert!(result.contains("[-world-]"));
        assert!(result.contains("{+there+}"));
    }

    #[test]
    fn test_unified_to_word_diff() {
        // File headers and the hunk marker must pass through unchanged.
        let unified = "\
--- a/file.txt
+++ b/file.txt
@@ -1,3 +1,3 @@
 context line
-old text here
+new text here
 more context";

        let result = unified_to_word_diff(unified);
        assert!(result.contains("--- a/file.txt"));
        assert!(result.contains("+++ b/file.txt"));
        assert!(result.contains("@@"));
    }
}