1//! Word-diff utilities for converting unified diffs to word-diff format.
2
3use similar::{DiffTag, TextDiff};
4
5/// Convert unified diff to word-diff format.
6///
7/// This transforms line-based diffs into word-level diffs where:
8/// - Deletions are marked with `[-...-]`
9/// - Insertions are marked with `{+...+}`
10pub fn unified_to_word_diff(unified_diff: &str) -> String {
11 let lines: Vec<&str> = unified_diff.lines().collect();
12 let mut result = String::new();
13 let mut old_lines: Vec<&str> = Vec::new();
14 let mut new_lines: Vec<&str> = Vec::new();
15
16 let flush_changes =
17 |old_lines: &mut Vec<&str>, new_lines: &mut Vec<&str>, result: &mut String| {
18 if old_lines.is_empty() && new_lines.is_empty() {
19 return;
20 }
21
22 // Strip the leading '-' or '+' from each line
23 let old_text: String = old_lines
24 .iter()
25 .map(|line| if line.len() > 1 { &line[1..] } else { "" })
26 .collect::<Vec<_>>()
27 .join("\n");
28
29 let new_text: String = new_lines
30 .iter()
31 .map(|line| if line.len() > 1 { &line[1..] } else { "" })
32 .collect::<Vec<_>>()
33 .join("\n");
34
35 if !old_text.is_empty() || !new_text.is_empty() {
36 let word_diff = compute_word_diff(&old_text, &new_text);
37 result.push_str(&word_diff);
38 }
39
40 old_lines.clear();
41 new_lines.clear();
42 };
43
44 for line in lines {
45 if line.starts_with("---") || line.starts_with("+++") {
46 flush_changes(&mut old_lines, &mut new_lines, &mut result);
47 result.push_str(line);
48 result.push('\n');
49 } else if line.starts_with("@@") {
50 flush_changes(&mut old_lines, &mut new_lines, &mut result);
51 result.push_str(line);
52 result.push('\n');
53 } else if line.starts_with('-') {
54 old_lines.push(line);
55 } else if line.starts_with('+') {
56 new_lines.push(line);
57 } else if line.starts_with(' ') || line.is_empty() {
58 flush_changes(&mut old_lines, &mut new_lines, &mut result);
59 result.push_str(line);
60 result.push('\n');
61 } else {
62 // Header lines (diff --git, index, etc.)
63 flush_changes(&mut old_lines, &mut new_lines, &mut result);
64 result.push_str(line);
65 result.push('\n');
66 }
67 }
68
69 flush_changes(&mut old_lines, &mut new_lines, &mut result);
70 result
71}
72
73/// Compute word-level diff between two text blocks.
74///
75/// Words and whitespace are treated as separate tokens. The output uses:
76/// - `[-...-]` for deleted content
77/// - `{+...+}` for inserted content
78fn compute_word_diff(old_text: &str, new_text: &str) -> String {
79 // Split into words while preserving whitespace
80 let old_words = tokenize(old_text);
81 let new_words = tokenize(new_text);
82
83 let ops = diff_tokens(&old_words, &new_words);
84 let mut result = String::new();
85
86 for op in ops {
87 match op {
88 DiffOp::Equal {
89 old_start, old_end, ..
90 } => {
91 for token in &old_words[old_start..old_end] {
92 result.push_str(token);
93 }
94 }
95 DiffOp::Delete(start, end) => {
96 result.push_str("[-");
97 for token in &old_words[start..end] {
98 result.push_str(token);
99 }
100 result.push_str("-]");
101 }
102 DiffOp::Insert(start, end) => {
103 result.push_str("{+");
104 for token in &new_words[start..end] {
105 result.push_str(token);
106 }
107 result.push_str("+}");
108 }
109 DiffOp::Replace {
110 old_start,
111 old_end,
112 new_start,
113 new_end,
114 } => {
115 result.push_str("[-");
116 for token in &old_words[old_start..old_end] {
117 result.push_str(token);
118 }
119 result.push_str("-]");
120 result.push_str("{+");
121 for token in &new_words[new_start..new_end] {
122 result.push_str(token);
123 }
124 result.push_str("+}");
125 }
126 }
127 }
128
129 if !result.is_empty() && !result.ends_with('\n') {
130 result.push('\n');
131 }
132
133 result
134}
135
/// Classify a character into one of three token classes:
/// - `0`: identifier (alphanumeric or `_`)
/// - `1`: whitespace
/// - `2`: punctuation (everything else; each such character becomes its own token)
fn char_class(ch: char) -> u8 {
    match ch {
        c if c.is_alphanumeric() || c == '_' => 0,
        c if c.is_whitespace() => 1,
        _ => 2,
    }
}
149
150/// Tokenize text into identifier words, whitespace runs, and individual punctuation characters.
151///
152/// This splitting aligns with the syntactic atoms of source code so that the
153/// LCS-based diff can produce fine-grained, meaningful change regions.
154pub(crate) fn tokenize(text: &str) -> Vec<&str> {
155 let mut tokens = Vec::new();
156 let mut chars = text.char_indices().peekable();
157
158 while let Some((start, ch)) = chars.next() {
159 let class = char_class(ch);
160 if class == 2 {
161 // Punctuation: each character is a separate token
162 tokens.push(&text[start..start + ch.len_utf8()]);
163 } else {
164 // Identifier or whitespace: collect contiguous run of same class
165 let mut end = start + ch.len_utf8();
166 while let Some(&(_, next_ch)) = chars.peek() {
167 if char_class(next_ch) == class {
168 end += next_ch.len_utf8();
169 chars.next();
170 } else {
171 break;
172 }
173 }
174 tokens.push(&text[start..end]);
175 }
176 }
177
178 tokens
179}
180
/// A single edit operation over token index ranges, as produced by
/// [`diff_tokens`].
///
/// All ranges are half-open (`start..end`) indices into the old and new
/// token slices respectively. The payload is plain `usize` indices, so the
/// type is cheap to `Copy` and comparable with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum DiffOp {
    /// Tokens identical in both sequences.
    Equal {
        old_start: usize,
        old_end: usize,
        new_start: usize,
        new_end: usize,
    },
    /// Tokens present only in the old sequence: `Delete(old_start, old_end)`.
    Delete(usize, usize),
    /// Tokens present only in the new sequence: `Insert(new_start, new_end)`.
    Insert(usize, usize),
    /// Old-range tokens replaced by new-range tokens.
    Replace {
        old_start: usize,
        old_end: usize,
        new_start: usize,
        new_end: usize,
    },
}
198
199/// Compute diff operations between two token sequences using `similar`'s Myers diff.
200pub(crate) fn diff_tokens<'a>(old: &[&'a str], new: &[&'a str]) -> Vec<DiffOp> {
201 let diff = TextDiff::from_slices(old, new);
202 diff.ops()
203 .iter()
204 .map(|op| {
205 let tag = op.tag();
206 let old_range = op.old_range();
207 let new_range = op.new_range();
208 match tag {
209 DiffTag::Equal => DiffOp::Equal {
210 old_start: old_range.start,
211 old_end: old_range.end,
212 new_start: new_range.start,
213 new_end: new_range.end,
214 },
215 DiffTag::Delete => DiffOp::Delete(old_range.start, old_range.end),
216 DiffTag::Insert => DiffOp::Insert(new_range.start, new_range.end),
217 DiffTag::Replace => DiffOp::Replace {
218 old_start: old_range.start,
219 old_end: old_range.end,
220 new_start: new_range.start,
221 new_end: new_range.end,
222 },
223 }
224 })
225 .collect()
226}
227
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize() {
        let tokens = tokenize("hello world");
        assert_eq!(tokens, vec!["hello", " ", "world"]);

        let tokens = tokenize(" multiple spaces ");
        assert_eq!(tokens, vec![" ", "multiple", " ", "spaces", " "]);

        let tokens = tokenize("self.name");
        assert_eq!(tokens, vec!["self", ".", "name"]);

        let tokens = tokenize("foo(bar, baz)");
        assert_eq!(tokens, vec!["foo", "(", "bar", ",", " ", "baz", ")"]);

        let tokens = tokenize("hello_world");
        assert_eq!(tokens, vec!["hello_world"]);

        let tokens = tokenize("fn();");
        assert_eq!(tokens, vec!["fn", "(", ")", ";"]);

        let tokens = tokenize("foo_bar123 + baz");
        assert_eq!(tokens, vec!["foo_bar123", " ", "+", " ", "baz"]);

        let tokens = tokenize("print(\"hello\")");
        assert_eq!(tokens, vec!["print", "(", "\"", "hello", "\"", ")"]);
    }

    #[test]
    fn test_compute_word_diff_simple() {
        // Pin the exact rendering: shared prefix passes through, the single
        // changed word is rendered as a deletion followed by an insertion,
        // and the result is newline-terminated.
        let result = compute_word_diff("hello world", "hello there");
        assert_eq!(result, "hello [-world-]{+there+}\n");
    }

    #[test]
    fn test_unified_to_word_diff() {
        let unified = "\
--- a/file.txt
+++ b/file.txt
@@ -1,3 +1,3 @@
 context line
-old text here
+new text here
 more context";

        let result = unified_to_word_diff(unified);
        // Headers and context pass through unchanged.
        assert!(result.contains("--- a/file.txt"));
        assert!(result.contains("+++ b/file.txt"));
        assert!(result.contains("@@ -1,3 +1,3 @@"));
        assert!(result.contains(" context line"));
        assert!(result.contains(" more context"));
        // The changed line pair must actually be rendered as a word diff.
        assert!(result.contains("[-old-]"));
        assert!(result.contains("{+new+}"));
        assert!(result.contains(" text here"));
    }
}