use crate::{CharClassifier, CharKind, CharScopeContext, LanguageScope};
use anyhow::{Context, anyhow};
use imara_diff::{
    Algorithm, UnifiedDiffBuilder, diff,
    intern::{InternedInput, Token},
    sources::lines_with_terminator,
};
use std::{iter, ops::Range, sync::Arc};

/// Maximum hunk size, in bytes, for which word-level diffing is attempted.
const MAX_WORD_DIFF_LEN: usize = 512;
/// Maximum hunk size, in lines, for which word-level diffing is attempted.
const MAX_WORD_DIFF_LINE_COUNT: usize = 8;

/// Computes a diff between two strings, returning a unified diff string.
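///
/// A minimal usage sketch (hypothetical inputs):
///
/// ```ignore
/// let patch = unified_diff("one\ntwo\n", "one\nTWO\n");
/// assert!(patch.contains("-two"));
/// assert!(patch.contains("+TWO"));
/// ```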
pub fn unified_diff(old_text: &str, new_text: &str) -> String {
    let input = InternedInput::new(old_text, new_text);
    diff(
        Algorithm::Histogram,
        &input,
        UnifiedDiffBuilder::new(&input),
    )
}

/// Computes a diff between two strings, returning a vector of old and new row
/// ranges.
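///
/// A minimal sketch of the returned row ranges (hypothetical input):
///
/// ```ignore
/// // Replacing the middle of three lines yields one hunk covering
/// // row 1 in both the old and the new text.
/// assert_eq!(line_diff("a\nb\nc\n", "a\nB\nc\n"), vec![(1..2, 1..2)]);
/// ```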
pub fn line_diff(old_text: &str, new_text: &str) -> Vec<(Range<u32>, Range<u32>)> {
    let mut edits = Vec::new();
    let input = InternedInput::new(
        lines_with_terminator(old_text),
        lines_with_terminator(new_text),
    );
    diff_internal(&input, |_, _, old_rows, new_rows| {
        edits.push((old_rows, new_rows));
    });
    edits
}

/// Computes a diff between two strings, returning a vector of edits.
///
/// The edits are represented as tuples of byte ranges and replacement strings.
///
/// Internally, this function first performs a line-based diff, and then performs a second
/// word-based diff within hunks that replace small numbers of lines.
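///
/// A small sketch of that two-level behavior, mirroring the tests below:
///
/// ```ignore
/// // The hunk is small, so "two" -> "TWO" becomes a word-level edit
/// // rather than a whole-line replacement.
/// assert_eq!(text_diff("one two three", "one TWO three"), [(4..7, "TWO".into())]);
/// ```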
pub fn text_diff(old_text: &str, new_text: &str) -> Vec<(Range<usize>, Arc<str>)> {
    text_diff_with_options(old_text, new_text, DiffOptions::default())
}

/// Computes word-level diff ranges between two strings.
///
/// Returns a tuple of `(old_ranges, new_ranges)`, where each vector contains
/// the byte ranges of changed words in the respective text.
/// Whitespace-only changes are excluded from the results.
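///
/// A minimal sketch (hypothetical input):
///
/// ```ignore
/// let (old, new) = word_diff_ranges("one two three", "one TWO three", DiffOptions::default());
/// assert_eq!(old, vec![4..7]);
/// assert_eq!(new, vec![4..7]);
/// ```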
pub fn word_diff_ranges(
    old_text: &str,
    new_text: &str,
    options: DiffOptions,
) -> (Vec<Range<usize>>, Vec<Range<usize>>) {
    let mut input: InternedInput<&str> = InternedInput::default();
    input.update_before(tokenize(old_text, options.language_scope.clone()));
    input.update_after(tokenize(new_text, options.language_scope));

    let mut old_ranges: Vec<Range<usize>> = Vec::new();
    let mut new_ranges: Vec<Range<usize>> = Vec::new();

    diff_internal(&input, |old_byte_range, new_byte_range, _, _| {
        // Merge adjacent or overlapping ranges so that consecutive changed
        // words are reported as a single range.
        for range in split_on_whitespace(old_text, &old_byte_range) {
            if let Some(last) = old_ranges.last_mut()
                && last.end >= range.start
            {
                last.end = range.end;
            } else {
                old_ranges.push(range);
            }
        }

        for range in split_on_whitespace(new_text, &new_byte_range) {
            if let Some(last) = new_ranges.last_mut()
                && last.end >= range.start
            {
                last.end = range.end;
            } else {
                new_ranges.push(range);
            }
        }
    });

    (old_ranges, new_ranges)
}

/// Splits a changed byte range into per-line subranges, trimming surrounding
/// whitespace and excluding whitespace-only changes.
fn split_on_whitespace(text: &str, range: &Range<usize>) -> Vec<Range<usize>> {
    if range.is_empty() {
        return Vec::new();
    }

    let slice = &text[range.clone()];
    let mut ranges = Vec::new();
    let mut offset = 0;

    for line in slice.lines() {
        let line_start = offset;
        let line_end = line_start + line.len();
        offset = line_end + 1;
        let trimmed = line.trim();

        if !trimmed.is_empty() {
            let leading = line.len() - line.trim_start().len();
            let trailing = line.len() - line.trim_end().len();
            let trimmed_start = range.start + line_start + leading;
            let trimmed_end = range.start + line_end - trailing;

            // Locate the full line in the original text, so we can tell
            // whether the change covers the entire line or only part of it.
            let original_line_start = text[..range.start + line_start]
                .rfind('\n')
                .map(|i| i + 1)
                .unwrap_or(0);
            let original_line_end = text[range.start + line_start..]
                .find('\n')
                .map(|i| range.start + line_start + i)
                .unwrap_or(text.len());
            let original_line = &text[original_line_start..original_line_end];
            let original_trimmed_start =
                original_line_start + (original_line.len() - original_line.trim_start().len());
            let original_trimmed_end =
                original_line_end - (original_line.len() - original_line.trim_end().len());

            // Skip changes that span the entire trimmed line; those are
            // whole-line edits rather than word-level ones.
            if trimmed_start > original_trimmed_start || trimmed_end < original_trimmed_end {
                ranges.push(trimmed_start..trimmed_end);
            }
        }
    }

    ranges
}

/// Options controlling how `text_diff_with_options` and `word_diff_ranges`
/// tokenize text and refine hunks.
pub struct DiffOptions {
    /// Language scope whose word characters are used when tokenizing.
    pub language_scope: Option<LanguageScope>,
    /// Maximum hunk size, in bytes, for which word-level diffing is attempted.
    pub max_word_diff_len: usize,
    /// Maximum hunk size, in lines, for which word-level diffing is attempted.
    pub max_word_diff_line_count: usize,
}

impl Default for DiffOptions {
    fn default() -> Self {
        Self {
            language_scope: Default::default(),
            max_word_diff_len: MAX_WORD_DIFF_LEN,
            max_word_diff_line_count: MAX_WORD_DIFF_LINE_COUNT,
        }
    }
}

/// Computes a diff between two strings, using a specific language scope's
/// word characters for word-level diffing.
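///
/// A minimal sketch that disables word-level refinement by zeroing the
/// thresholds (hypothetical input):
///
/// ```ignore
/// let options = DiffOptions {
///     max_word_diff_len: 0,
///     max_word_diff_line_count: 0,
///     ..DiffOptions::default()
/// };
/// // With refinement disabled, the whole changed line is replaced.
/// assert_eq!(
///     text_diff_with_options("one two\n", "one TWO\n", options),
///     [(0..8, "one TWO\n".into())]
/// );
/// ```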
pub fn text_diff_with_options(
    old_text: &str,
    new_text: &str,
    options: DiffOptions,
) -> Vec<(Range<usize>, Arc<str>)> {
    let empty: Arc<str> = Arc::default();
    let mut edits = Vec::new();
    let mut hunk_input = InternedInput::default();
    let input = InternedInput::new(
        lines_with_terminator(old_text),
        lines_with_terminator(new_text),
    );
    diff_internal(
        &input,
        |old_byte_range, new_byte_range, old_rows, new_rows| {
            if should_perform_word_diff_within_hunk(
                &old_rows,
                &old_byte_range,
                &new_rows,
                &new_byte_range,
                &options,
            ) {
                // Refine this hunk with a second, word-level diff, mapping the
                // word ranges back to offsets within the full texts.
                let old_offset = old_byte_range.start;
                let new_offset = new_byte_range.start;
                hunk_input.clear();
                hunk_input.update_before(tokenize(
                    &old_text[old_byte_range],
                    options.language_scope.clone(),
                ));
                hunk_input.update_after(tokenize(
                    &new_text[new_byte_range],
                    options.language_scope.clone(),
                ));
                diff_internal(&hunk_input, |old_byte_range, new_byte_range, _, _| {
                    let old_byte_range =
                        old_offset + old_byte_range.start..old_offset + old_byte_range.end;
                    let new_byte_range =
                        new_offset + new_byte_range.start..new_offset + new_byte_range.end;
                    let replacement_text = if new_byte_range.is_empty() {
                        empty.clone()
                    } else {
                        new_text[new_byte_range].into()
                    };
                    edits.push((old_byte_range, replacement_text));
                });
            } else {
                // The hunk is too large for word-level diffing; emit it as a
                // single line-level replacement.
                let replacement_text = if new_byte_range.is_empty() {
                    empty.clone()
                } else {
                    new_text[new_byte_range].into()
                };
                edits.push((old_byte_range, replacement_text));
            }
        },
    );
    edits
}

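/// Applies a unified diff patch to `base_text` using `diffy`, returning the
/// patched text or an error if the patch is malformed or does not apply.
///
/// A round-trip sketch, mirroring the test below:
///
/// ```ignore
/// let patch = unified_diff("one\ntwo\n", "one\nTWO\n");
/// assert_eq!(apply_diff_patch("one\ntwo\n", &patch).unwrap(), "one\nTWO\n");
/// ```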
pub fn apply_diff_patch(base_text: &str, patch: &str) -> Result<String, anyhow::Error> {
    let patch = diffy::Patch::from_str(patch).context("Failed to parse patch")?;
    let result = diffy::apply(base_text, &patch);
    result.map_err(|err| anyhow!(err))
}

/// Returns whether a hunk is small enough, in both bytes and lines, to be
/// refined with a word-level diff.
fn should_perform_word_diff_within_hunk(
    old_row_range: &Range<u32>,
    old_byte_range: &Range<usize>,
    new_row_range: &Range<u32>,
    new_byte_range: &Range<usize>,
    options: &DiffOptions,
) -> bool {
    !old_byte_range.is_empty()
        && !new_byte_range.is_empty()
        && old_byte_range.len() <= options.max_word_diff_len
        && new_byte_range.len() <= options.max_word_diff_len
        && old_row_range.len() <= options.max_word_diff_line_count
        && new_row_range.len() <= options.max_word_diff_line_count
}

/// Runs a histogram diff over `input`, invoking `on_change` with the byte
/// ranges and token ranges of each change.
fn diff_internal(
    input: &InternedInput<&str>,
    mut on_change: impl FnMut(Range<usize>, Range<usize>, Range<u32>, Range<u32>),
) {
    let mut old_offset = 0;
    let mut new_offset = 0;
    let mut old_token_ix = 0;
    let mut new_token_ix = 0;
    diff(
        Algorithm::Histogram,
        input,
        |old_tokens: Range<u32>, new_tokens: Range<u32>| {
            // Advance the byte offsets past the unchanged tokens that precede
            // this change.
            old_offset += token_len(
                input,
                &input.before[old_token_ix as usize..old_tokens.start as usize],
            );
            new_offset += token_len(
                input,
                &input.after[new_token_ix as usize..new_tokens.start as usize],
            );
            let old_len = token_len(
                input,
                &input.before[old_tokens.start as usize..old_tokens.end as usize],
            );
            let new_len = token_len(
                input,
                &input.after[new_tokens.start as usize..new_tokens.end as usize],
            );
            let old_byte_range = old_offset..old_offset + old_len;
            let new_byte_range = new_offset..new_offset + new_len;
            old_token_ix = old_tokens.end;
            new_token_ix = new_tokens.end;
            old_offset = old_byte_range.end;
            new_offset = new_byte_range.end;
            on_change(old_byte_range, new_byte_range, old_tokens, new_tokens);
        },
    );
}

/// Splits `text` into tokens: runs of word characters, runs of whitespace,
/// and runs of identical punctuation characters.
fn tokenize(text: &str, language_scope: Option<LanguageScope>) -> impl Iterator<Item = &str> {
    let classifier =
        CharClassifier::new(language_scope).scope_context(Some(CharScopeContext::Completion));
    let mut chars = text.char_indices();
    let mut prev = None;
    let mut start_ix = 0;
    iter::from_fn(move || {
        for (ix, c) in chars.by_ref() {
            let mut token = None;
            let kind = classifier.kind(c);
            // A token boundary occurs when the character kind changes, or
            // between two different punctuation characters.
            if let Some((prev_char, prev_kind)) = prev
                && (kind != prev_kind || (kind == CharKind::Punctuation && c != prev_char))
            {
                token = Some(&text[start_ix..ix]);
                start_ix = ix;
            }
            prev = Some((c, kind));
            if token.is_some() {
                return token;
            }
        }
        // Emit the final token, if any.
        if start_ix < text.len() {
            let token = &text[start_ix..];
            start_ix = text.len();
            return Some(token);
        }
        None
    })
}

/// Returns the total byte length of the given tokens.
fn token_len(input: &InternedInput<&str>, tokens: &[Token]) -> usize {
    tokens
        .iter()
        .map(|token| input.interner[*token].len())
        .sum()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize() {
        let text = "";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), Vec::<&str>::new());

        let text = " ";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec![" "]);

        let text = "one";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec!["one"]);

        let text = "one\n";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec!["one", "\n"]);

        let text = "one.two(three)";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec!["one", ".", "two", "(", "three", ")"]
        );

        let text = "one two three()";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec!["one", " ", "two", " ", "three", "(", ")"]
        );

        let text = " one\n two three";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec![" ", "one", "\n ", "two", " ", "three"]
        );
    }

    #[test]
    fn test_text_diff() {
        let old_text = "one two three";
        let new_text = "one TWO three";
        assert_eq!(text_diff(old_text, new_text), [(4..7, "TWO".into())]);

        let old_text = "one\ntwo\nthree\n";
        let new_text = "one\ntwo\nAND\nTHEN\nthree\n";
        assert_eq!(
            text_diff(old_text, new_text),
            [(8..8, "AND\nTHEN\n".into())]
        );

        let old_text = "one two\nthree four five\nsix seven eight nine\nten\n";
        let new_text = "one two\nthree FOUR five\nsix SEVEN eight nine\nten\nELEVEN\n";
        assert_eq!(
            text_diff(old_text, new_text),
            [
                (14..18, "FOUR".into()),
                (28..33, "SEVEN".into()),
                (49..49, "ELEVEN\n".into())
            ]
        );
    }

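    // A minimal check of `word_diff_ranges`: a single changed word within an
    // otherwise-identical line is reported on both sides.
    #[test]
    fn test_word_diff_ranges() {
        let old_text = "one two three";
        let new_text = "one TWO three";
        let (old_ranges, new_ranges) =
            word_diff_ranges(old_text, new_text, DiffOptions::default());
        assert_eq!(old_ranges, vec![4..7]);
        assert_eq!(new_ranges, vec![4..7]);
    }
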
    #[test]
    fn test_apply_diff_patch() {
        let old_text = "one two\nthree four five\nsix seven eight nine\nten\n";
        let new_text = "one two\nthree FOUR five\nsix SEVEN eight nine\nten\nELEVEN\n";
        let patch = unified_diff(old_text, new_text);
        assert_eq!(apply_diff_patch(old_text, &patch).unwrap(), new_text);
    }
}