//! Word-diff utilities for converting unified diffs to word-diff format.

/// Convert unified diff to word-diff format.
///
/// This transforms line-based diffs into word-level diffs where:
/// - Deletions are marked with `[-...-]`
/// - Insertions are marked with `{+...+}`
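///
/// # Example
///
/// The markers are the same ones `git diff --word-diff=plain` uses. A hunk such as
///
/// ```text
/// @@ -1 +1 @@
/// -old text here
/// +new text here
/// ```
///
/// is rendered as
///
/// ```text
/// @@ -1 +1 @@
/// [-old-]{+new+} text here
/// ```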
pub fn unified_to_word_diff(unified_diff: &str) -> String {
    let lines: Vec<&str> = unified_diff.lines().collect();
    let mut result = String::new();
    let mut old_lines: Vec<&str> = Vec::new();
    let mut new_lines: Vec<&str> = Vec::new();

    // Emit a word diff for the buffered '-'/'+' lines and reset the buffers.
    let flush_changes =
        |old_lines: &mut Vec<&str>, new_lines: &mut Vec<&str>, result: &mut String| {
            if old_lines.is_empty() && new_lines.is_empty() {
                return;
            }

            // Strip the leading '-' or '+' from each line.
            let old_text: String = old_lines
                .iter()
                .map(|line| if line.len() > 1 { &line[1..] } else { "" })
                .collect::<Vec<_>>()
                .join("\n");

            let new_text: String = new_lines
                .iter()
                .map(|line| if line.len() > 1 { &line[1..] } else { "" })
                .collect::<Vec<_>>()
                .join("\n");

            if !old_text.is_empty() || !new_text.is_empty() {
                let word_diff = compute_word_diff(&old_text, &new_text);
                result.push_str(&word_diff);
            }

            old_lines.clear();
            new_lines.clear();
        };

    for line in lines {
        if line.starts_with("---") || line.starts_with("+++") {
            // File headers pass through unchanged.
            flush_changes(&mut old_lines, &mut new_lines, &mut result);
            result.push_str(line);
            result.push('\n');
        } else if line.starts_with("@@") {
            // Hunk headers pass through unchanged.
            flush_changes(&mut old_lines, &mut new_lines, &mut result);
            result.push_str(line);
            result.push('\n');
        } else if line.starts_with('-') {
            old_lines.push(line);
        } else if line.starts_with('+') {
            new_lines.push(line);
        } else if line.starts_with(' ') || line.is_empty() {
            // Context (and blank) lines pass through unchanged.
            flush_changes(&mut old_lines, &mut new_lines, &mut result);
            result.push_str(line);
            result.push('\n');
        } else {
            // Other header lines (diff --git, index, etc.) pass through unchanged.
            flush_changes(&mut old_lines, &mut new_lines, &mut result);
            result.push_str(line);
            result.push('\n');
        }
    }

    flush_changes(&mut old_lines, &mut new_lines, &mut result);
    result
}

/// Compute word-level diff between two text blocks.
///
/// Words and whitespace are treated as separate tokens. The output uses:
/// - `[-...-]` for deleted content
/// - `{+...+}` for inserted content
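///
/// # Example
///
/// ```text
/// compute_word_diff("hello world", "hello there")
///     => "hello [-world-]{+there+}\n"
/// ```
///
/// A trailing newline is appended when the (non-empty) result does not already
/// end with one.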
fn compute_word_diff(old_text: &str, new_text: &str) -> String {
    // Split into words while preserving whitespace
    let old_words = tokenize(old_text);
    let new_words = tokenize(new_text);

    let ops = diff_tokens(&old_words, &new_words);
    let mut result = String::new();

    for op in ops {
        match op {
            DiffOp::Equal(start, end) => {
                for token in &old_words[start..end] {
                    result.push_str(token);
                }
            }
            DiffOp::Delete(start, end) => {
                result.push_str("[-");
                for token in &old_words[start..end] {
                    result.push_str(token);
                }
                result.push_str("-]");
            }
            DiffOp::Insert(start, end) => {
                result.push_str("{+");
                for token in &new_words[start..end] {
                    result.push_str(token);
                }
                result.push_str("+}");
            }
            DiffOp::Replace {
                old_start,
                old_end,
                new_start,
                new_end,
            } => {
                result.push_str("[-");
                for token in &old_words[old_start..old_end] {
                    result.push_str(token);
                }
                result.push_str("-]");
                result.push_str("{+");
                for token in &new_words[new_start..new_end] {
                    result.push_str(token);
                }
                result.push_str("+}");
            }
        }
    }

    if !result.is_empty() && !result.ends_with('\n') {
        result.push('\n');
    }

    result
}

/// Tokenize text into words and whitespace sequences.
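///
/// # Example
///
/// ```text
/// tokenize("hello  world") => ["hello", "  ", "world"]
/// ```
///
/// Whitespace runs are kept as their own tokens, so rejoining the tokens
/// reproduces the original text exactly.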
fn tokenize(text: &str) -> Vec<&str> {
    let mut tokens = Vec::new();
    let mut chars = text.char_indices().peekable();

    while let Some((start, ch)) = chars.next() {
        if ch.is_whitespace() {
            // Collect contiguous whitespace
            let mut end = start + ch.len_utf8();
            while let Some(&(_, next_ch)) = chars.peek() {
                if next_ch.is_whitespace() {
                    end += next_ch.len_utf8();
                    chars.next();
                } else {
                    break;
                }
            }
            tokens.push(&text[start..end]);
        } else {
            // Collect contiguous non-whitespace
            let mut end = start + ch.len_utf8();
            while let Some(&(_, next_ch)) = chars.peek() {
                if !next_ch.is_whitespace() {
                    end += next_ch.len_utf8();
                    chars.next();
                } else {
                    break;
                }
            }
            tokens.push(&text[start..end]);
        }
    }

    tokens
}

/// A single edit operation over the token sequences.
///
/// All indices are half-open token ranges: `Equal` and `Delete` index into the
/// old sequence, `Insert` indexes into the new sequence, and `Replace` carries
/// one range into each.
#[derive(Debug)]
enum DiffOp {
    /// Tokens `old[start..end]` are unchanged.
    Equal(usize, usize),
    /// Tokens `old[start..end]` were removed.
    Delete(usize, usize),
    /// Tokens `new[start..end]` were added.
    Insert(usize, usize),
    /// Tokens `old[old_start..old_end]` were replaced by `new[new_start..new_end]`.
    Replace {
        old_start: usize,
        old_end: usize,
        new_start: usize,
        new_end: usize,
    },
}

/// Compute diff operations between two token sequences using a simple LCS-based algorithm.
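///
/// # Example
///
/// ```text
/// diff_tokens(&["a", " ", "b"], &["a", " ", "c"])
///     => [Equal(0, 2), Replace { old_start: 2, old_end: 3, new_start: 2, new_end: 3 }]
/// ```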
fn diff_tokens(old: &[&str], new: &[&str]) -> Vec<DiffOp> {
    let m = old.len();
    let n = new.len();

    if m == 0 && n == 0 {
        return vec![];
    }
    if m == 0 {
        return vec![DiffOp::Insert(0, n)];
    }
    if n == 0 {
        return vec![DiffOp::Delete(0, m)];
    }

    // LCS dynamic programming: dp[i][j] is the LCS length of old[..i] and new[..j].
    let mut dp = vec![vec![0usize; n + 1]; m + 1];
    for i in 1..=m {
        for j in 1..=n {
            if old[i - 1] == new[j - 1] {
                dp[i][j] = dp[i - 1][j - 1] + 1;
            } else {
                dp[i][j] = dp[i - 1][j].max(dp[i][j - 1]);
            }
        }
    }

    // Backtrack through the table, collecting operations in reverse order and
    // merging adjacent tokens of the same kind into a single run.
    let mut ops_rev: Vec<DiffOp> = Vec::new();
    let mut i = m;
    let mut j = n;

    while i > 0 || j > 0 {
        if i > 0 && j > 0 && old[i - 1] == new[j - 1] {
            // Matching tokens: extend an Equal run backwards.
            let end = i;
            while i > 0 && j > 0 && old[i - 1] == new[j - 1] {
                i -= 1;
                j -= 1;
            }
            ops_rev.push(DiffOp::Equal(i, end));
        } else if j > 0 && (i == 0 || dp[i][j - 1] >= dp[i - 1][j]) {
            // Tokens only present in `new`: extend an Insert run backwards.
            let end = j;
            while j > 0
                && (i == 0 || dp[i][j - 1] >= dp[i - 1][j])
                && !(i > 0 && old[i - 1] == new[j - 1])
            {
                j -= 1;
            }
            ops_rev.push(DiffOp::Insert(j, end));
        } else {
            // Tokens only present in `old`: extend a Delete run backwards.
            let end = i;
            while i > 0
                && !(j > 0 && old[i - 1] == new[j - 1])
                && !(j > 0 && dp[i][j - 1] >= dp[i - 1][j])
            {
                i -= 1;
            }
            ops_rev.push(DiffOp::Delete(i, end));
        }
    }

    // Reverse into forward order, merging each Delete that is immediately
    // followed by an Insert into a single Replace.
    ops_rev.reverse();
    let mut ops = Vec::new();
    let mut iter = ops_rev.into_iter().peekable();
    while let Some(op) = iter.next() {
        match op {
            DiffOp::Delete(old_start, old_end) => {
                if let Some(&DiffOp::Insert(new_start, new_end)) = iter.peek() {
                    iter.next();
                    ops.push(DiffOp::Replace {
                        old_start,
                        old_end,
                        new_start,
                        new_end,
                    });
                } else {
                    ops.push(DiffOp::Delete(old_start, old_end));
                }
            }
            other => ops.push(other),
        }
    }

    ops
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize() {
        let tokens = tokenize("hello world");
        assert_eq!(tokens, vec!["hello", " ", "world"]);

        let tokens = tokenize(" multiple spaces ");
        assert_eq!(tokens, vec![" ", "multiple", " ", "spaces", " "]);
    }

    #[test]
    fn test_compute_word_diff_simple() {
        let result = compute_word_diff("hello world", "hello there");
        assert!(result.contains("[-world-]"));
        assert!(result.contains("{+there+}"));
    }

    #[test]
    fn test_unified_to_word_diff() {
        let unified = "\
--- a/file.txt
+++ b/file.txt
@@ -1,3 +1,3 @@
 context line
-old text here
+new text here
 more context";

        let result = unified_to_word_diff(unified);
        assert!(result.contains("--- a/file.txt"));
        assert!(result.contains("+++ b/file.txt"));
        assert!(result.contains("@@"));
    }
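
    // Sanity check: identical inputs should produce no word-diff markers.
    // The exact expected string follows from compute_word_diff appending a
    // trailing newline when the text does not already end with one.
    #[test]
    fn test_compute_word_diff_no_change() {
        let result = compute_word_diff("same text", "same text");
        assert!(!result.contains("[-"));
        assert!(!result.contains("{+"));
        assert_eq!(result, "same text\n");
    }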
}