1use crate::{CharClassifier, CharKind, CharScopeContext, LanguageScope};
2use anyhow::{Context, anyhow};
3use imara_diff::{
4 Algorithm, Sink, diff,
5 intern::{InternedInput, Interner, Token},
6 sources::lines_with_terminator,
7};
8use std::{fmt::Write, iter, ops::Range, sync::Arc};
9
/// Maximum hunk size in bytes for which word-level diffing is attempted.
const MAX_WORD_DIFF_LEN: usize = 512;
/// Maximum hunk size in lines for which word-level diffing is attempted.
const MAX_WORD_DIFF_LINE_COUNT: usize = 8;
12
/// Computes a diff between two strings, returning a unified diff string.
///
/// Equivalent to [`unified_diff_with_offsets`] with both starting lines at 0,
/// so hunk headers are numbered from line 1.
pub fn unified_diff(old_text: &str, new_text: &str) -> String {
    unified_diff_with_offsets(old_text, new_text, 0, 0)
}
17
/// Computes a diff between two strings, returning a unified diff string with
/// hunk headers adjusted to reflect the given starting line numbers (zero-indexed).
///
/// The offsets are added to the 1-based line numbers written in each `@@` hunk
/// header; the diff body itself is unaffected. Returns an empty string when
/// the inputs are identical, since no hunks are emitted.
pub fn unified_diff_with_offsets(
    old_text: &str,
    new_text: &str,
    old_start_line: u32,
    new_start_line: u32,
) -> String {
    let input = InternedInput::new(old_text, new_text);
    diff(
        Algorithm::Histogram,
        &input,
        OffsetUnifiedDiffBuilder::new(&input, old_start_line, new_start_line),
    )
}
33
/// A unified diff builder that applies line number offsets to hunk headers.
struct OffsetUnifiedDiffBuilder<'a> {
    // Interned token sequences for the old and new texts.
    before: &'a [Token],
    after: &'a [Token],
    // Maps interned tokens back to their source `&str` lines.
    interner: &'a Interner<&'a str>,

    // Current token index into `before`.
    pos: u32,
    // Start positions and line counts of the hunk currently being accumulated.
    before_hunk_start: u32,
    after_hunk_start: u32,
    before_hunk_len: u32,
    after_hunk_len: u32,

    // Added to the 1-based line numbers written in `@@` hunk headers.
    old_line_offset: u32,
    new_line_offset: u32,

    // Body of the in-progress hunk; flushed into `dst` under its header.
    buffer: String,
    // Completed unified diff output.
    dst: String,
}
52
impl<'a> OffsetUnifiedDiffBuilder<'a> {
    fn new(input: &'a InternedInput<&'a str>, old_line_offset: u32, new_line_offset: u32) -> Self {
        Self {
            before_hunk_start: 0,
            after_hunk_start: 0,
            before_hunk_len: 0,
            after_hunk_len: 0,
            old_line_offset,
            new_line_offset,
            buffer: String::with_capacity(8),
            dst: String::new(),
            interner: &input.interner,
            before: &input.before,
            after: &input.after,
            pos: 0,
        }
    }

    /// Appends each token (one line of text) to the hunk buffer, preceded by
    /// `prefix` (' ' for context, '-' for removed, '+' for added).
    fn print_tokens(&mut self, tokens: &[Token], prefix: char) {
        for &token in tokens {
            writeln!(&mut self.buffer, "{prefix}{}", self.interner[token]).unwrap();
        }
    }

    /// Writes the accumulated hunk (header plus buffered body) to `dst` and
    /// resets the hunk state. Does nothing when no hunk is in progress.
    fn flush(&mut self) {
        if self.before_hunk_len == 0 && self.after_hunk_len == 0 {
            return;
        }

        // Append up to 3 lines of trailing context after the last change.
        let end = (self.pos + 3).min(self.before.len() as u32);
        self.update_pos(end, end);

        // Hunk starts are converted to 1-based line numbers, then shifted by
        // the caller-supplied offsets.
        writeln!(
            &mut self.dst,
            "@@ -{},{} +{},{} @@",
            self.before_hunk_start + 1 + self.old_line_offset,
            self.before_hunk_len,
            self.after_hunk_start + 1 + self.new_line_offset,
            self.after_hunk_len,
        )
        .unwrap();
        write!(&mut self.dst, "{}", &self.buffer).unwrap();
        self.buffer.clear();
        self.before_hunk_len = 0;
        self.after_hunk_len = 0;
    }

    /// Prints unchanged context lines from `pos` up to `print_to`, moves `pos`
    /// to `move_to`, and counts the printed lines toward both hunk lengths.
    fn update_pos(&mut self, print_to: u32, move_to: u32) {
        self.print_tokens(&self.before[self.pos as usize..print_to as usize], ' ');
        let len = print_to - self.pos;
        self.pos = move_to;
        self.before_hunk_len += len;
        self.after_hunk_len += len;
    }
}
108
impl Sink for OffsetUnifiedDiffBuilder<'_> {
    type Out = String;

    fn process_change(&mut self, before: Range<u32>, after: Range<u32>) {
        // If this change is more than 6 lines past the current position, the
        // two 3-line context windows cannot touch; finish the current hunk.
        if before.start - self.pos > 6 {
            self.flush();
        }
        // Starting a fresh hunk: back up to include up to 3 leading context
        // lines before the change.
        if self.before_hunk_len == 0 && self.after_hunk_len == 0 {
            self.pos = before.start.saturating_sub(3);
            self.before_hunk_start = self.pos;
            self.after_hunk_start = after.start.saturating_sub(3);
        }
        // Print intervening context, then the removed and added lines.
        self.update_pos(before.start, before.end);
        self.before_hunk_len += before.end - before.start;
        self.after_hunk_len += after.end - after.start;
        self.print_tokens(
            &self.before[before.start as usize..before.end as usize],
            '-',
        );
        self.print_tokens(&self.after[after.start as usize..after.end as usize], '+');
    }

    fn finish(mut self) -> Self::Out {
        self.flush();
        self.dst
    }
}
136
137/// Computes a diff between two strings, returning a vector of old and new row
138/// ranges.
139pub fn line_diff(old_text: &str, new_text: &str) -> Vec<(Range<u32>, Range<u32>)> {
140 let mut edits = Vec::new();
141 let input = InternedInput::new(
142 lines_with_terminator(old_text),
143 lines_with_terminator(new_text),
144 );
145 diff_internal(&input, |_, _, old_rows, new_rows| {
146 edits.push((old_rows, new_rows));
147 });
148 edits
149}
150
/// Computes a diff between two strings, returning a vector of edits.
///
/// The edits are represented as tuples of byte ranges and replacement strings.
///
/// Internally, this function first performs a line-based diff, and then performs a second
/// word-based diff within hunks that replace small numbers of lines.
///
/// Uses [`DiffOptions::default`]; see [`text_diff_with_options`] to customize
/// the language scope or word-diff size limits.
pub fn text_diff(old_text: &str, new_text: &str) -> Vec<(Range<usize>, Arc<str>)> {
    text_diff_with_options(old_text, new_text, DiffOptions::default())
}
160
161/// Computes word-level diff ranges between two strings.
162///
163/// Returns a tuple of (old_ranges, new_ranges) where each vector contains
164/// the byte ranges of changed words in the respective text.
165pub fn word_diff_ranges(
166 old_text: &str,
167 new_text: &str,
168 options: DiffOptions,
169) -> (Vec<Range<usize>>, Vec<Range<usize>>) {
170 let mut input: InternedInput<&str> = InternedInput::default();
171 input.update_before(tokenize(old_text, options.language_scope.clone()));
172 input.update_after(tokenize(new_text, options.language_scope));
173
174 let mut old_ranges: Vec<Range<usize>> = Vec::new();
175 let mut new_ranges: Vec<Range<usize>> = Vec::new();
176
177 diff_internal(&input, |old_byte_range, new_byte_range, _, _| {
178 if !old_byte_range.is_empty() {
179 if let Some(last) = old_ranges.last_mut()
180 && last.end >= old_byte_range.start
181 {
182 last.end = old_byte_range.end;
183 } else {
184 old_ranges.push(old_byte_range);
185 }
186 }
187
188 if !new_byte_range.is_empty() {
189 if let Some(last) = new_ranges.last_mut()
190 && last.end >= new_byte_range.start
191 {
192 last.end = new_byte_range.end;
193 } else {
194 new_ranges.push(new_byte_range);
195 }
196 }
197 });
198
199 (old_ranges, new_ranges)
200}
201
/// Options controlling [`text_diff_with_options`] and [`word_diff_ranges`].
pub struct DiffOptions {
    /// Language whose character classification is used when tokenizing text
    /// for word-level diffs; `None` uses the default classifier.
    pub language_scope: Option<LanguageScope>,
    /// Maximum hunk size in bytes for which a word-level diff is attempted.
    pub max_word_diff_len: usize,
    /// Maximum hunk size in lines for which a word-level diff is attempted.
    pub max_word_diff_line_count: usize,
}
207
208impl Default for DiffOptions {
209 fn default() -> Self {
210 Self {
211 language_scope: Default::default(),
212 max_word_diff_len: MAX_WORD_DIFF_LEN,
213 max_word_diff_line_count: MAX_WORD_DIFF_LINE_COUNT,
214 }
215 }
216}
217
/// Computes a diff between two strings, using a specific language scope's
/// word characters for word-level diffing.
///
/// First performs a line-based diff; hunks small enough per `options` are
/// then refined with a word-level diff. Returns `(old_byte_range,
/// replacement_text)` edits in document order.
pub fn text_diff_with_options(
    old_text: &str,
    new_text: &str,
    options: DiffOptions,
) -> Vec<(Range<usize>, Arc<str>)> {
    // Shared empty replacement so pure deletions don't allocate per edit.
    let empty: Arc<str> = Arc::default();
    let mut edits = Vec::new();
    // Reused across hunks; `clear` avoids rebuilding the intern tables.
    let mut hunk_input = InternedInput::default();
    let input = InternedInput::new(
        lines_with_terminator(old_text),
        lines_with_terminator(new_text),
    );
    diff_internal(
        &input,
        |old_byte_range, new_byte_range, old_rows, new_rows| {
            if should_perform_word_diff_within_hunk(
                &old_rows,
                &old_byte_range,
                &new_rows,
                &new_byte_range,
                &options,
            ) {
                // Word-diff within the hunk, rebasing the resulting ranges
                // from hunk-relative offsets back to whole-text offsets.
                let old_offset = old_byte_range.start;
                let new_offset = new_byte_range.start;
                hunk_input.clear();
                hunk_input.update_before(tokenize(
                    &old_text[old_byte_range],
                    options.language_scope.clone(),
                ));
                hunk_input.update_after(tokenize(
                    &new_text[new_byte_range],
                    options.language_scope.clone(),
                ));
                diff_internal(&hunk_input, |old_byte_range, new_byte_range, _, _| {
                    let old_byte_range =
                        old_offset + old_byte_range.start..old_offset + old_byte_range.end;
                    let new_byte_range =
                        new_offset + new_byte_range.start..new_offset + new_byte_range.end;
                    let replacement_text = if new_byte_range.is_empty() {
                        empty.clone()
                    } else {
                        new_text[new_byte_range].into()
                    };
                    edits.push((old_byte_range, replacement_text));
                });
            } else {
                // Hunk too large (or one side empty): emit it wholesale.
                let replacement_text = if new_byte_range.is_empty() {
                    empty.clone()
                } else {
                    new_text[new_byte_range].into()
                };
                edits.push((old_byte_range, replacement_text));
            }
        },
    );
    edits
}
277
278pub fn apply_diff_patch(base_text: &str, patch: &str) -> Result<String, anyhow::Error> {
279 let patch = diffy::Patch::from_str(patch).context("Failed to parse patch")?;
280 let result = diffy::apply(base_text, &patch);
281 result.map_err(|err| anyhow!(err))
282}
283
284pub fn apply_reversed_diff_patch(base_text: &str, patch: &str) -> Result<String, anyhow::Error> {
285 let patch = diffy::Patch::from_str(patch).context("Failed to parse patch")?;
286 let reversed = patch.reverse();
287 diffy::apply(base_text, &reversed).map_err(|err| anyhow!(err))
288}
289
290fn should_perform_word_diff_within_hunk(
291 old_row_range: &Range<u32>,
292 old_byte_range: &Range<usize>,
293 new_row_range: &Range<u32>,
294 new_byte_range: &Range<usize>,
295 options: &DiffOptions,
296) -> bool {
297 !old_byte_range.is_empty()
298 && !new_byte_range.is_empty()
299 && old_byte_range.len() <= options.max_word_diff_len
300 && new_byte_range.len() <= options.max_word_diff_len
301 && old_row_range.len() <= options.max_word_diff_line_count
302 && new_row_range.len() <= options.max_word_diff_line_count
303}
304
/// Runs a histogram diff over `input`, reporting each change as both byte
/// ranges and token ranges.
///
/// `on_change` receives `(old_byte_range, new_byte_range, old_tokens,
/// new_tokens)`. Byte offsets are reconstructed incrementally by summing the
/// lengths of the tokens between consecutive changes.
fn diff_internal(
    input: &InternedInput<&str>,
    mut on_change: impl FnMut(Range<usize>, Range<usize>, Range<u32>, Range<u32>),
) {
    // Byte offsets and token indices at the end of the previous change.
    let mut old_offset = 0;
    let mut new_offset = 0;
    let mut old_token_ix = 0;
    let mut new_token_ix = 0;
    diff(
        Algorithm::Histogram,
        input,
        |old_tokens: Range<u32>, new_tokens: Range<u32>| {
            // Advance byte offsets across the unchanged tokens since the last
            // reported change.
            old_offset += token_len(
                input,
                &input.before[old_token_ix as usize..old_tokens.start as usize],
            );
            new_offset += token_len(
                input,
                &input.after[new_token_ix as usize..new_tokens.start as usize],
            );
            // Byte lengths of the changed token spans themselves.
            let old_len = token_len(
                input,
                &input.before[old_tokens.start as usize..old_tokens.end as usize],
            );
            let new_len = token_len(
                input,
                &input.after[new_tokens.start as usize..new_tokens.end as usize],
            );
            let old_byte_range = old_offset..old_offset + old_len;
            let new_byte_range = new_offset..new_offset + new_len;
            // Record where this change ended so the next iteration can resume.
            old_token_ix = old_tokens.end;
            new_token_ix = new_tokens.end;
            old_offset = old_byte_range.end;
            new_offset = new_byte_range.end;
            on_change(old_byte_range, new_byte_range, old_tokens, new_tokens);
        },
    );
}
343
/// Splits `text` into runs of like-kind characters (word, whitespace,
/// punctuation) using the character classifier for `language_scope`.
///
/// A new token starts whenever the character kind changes, and also between
/// two *different* punctuation characters (runs of the same punctuation
/// character stay together). The concatenation of all tokens equals `text`.
fn tokenize(text: &str, language_scope: Option<LanguageScope>) -> impl Iterator<Item = &str> {
    let classifier =
        CharClassifier::new(language_scope).scope_context(Some(CharScopeContext::Completion));
    let mut chars = text.char_indices();
    // Previous character and its kind, used to detect token boundaries.
    let mut prev = None;
    // Byte offset where the current token began.
    let mut start_ix = 0;
    iter::from_fn(move || {
        for (ix, c) in chars.by_ref() {
            let mut token = None;
            let kind = classifier.kind(c);
            // Boundary: kind changed, or two differing punctuation characters.
            if let Some((prev_char, prev_kind)) = prev
                && (kind != prev_kind || (kind == CharKind::Punctuation && c != prev_char))
            {
                token = Some(&text[start_ix..ix]);
                start_ix = ix;
            }
            prev = Some((c, kind));
            if token.is_some() {
                return token;
            }
        }
        // Emit the trailing token once the character iterator is exhausted.
        if start_ix < text.len() {
            let token = &text[start_ix..];
            start_ix = text.len();
            return Some(token);
        }
        None
    })
}
373
374fn token_len(input: &InternedInput<&str>, tokens: &[Token]) -> usize {
375 tokens
376 .iter()
377 .map(|token| input.interner[*token].len())
378 .sum()
379}
380
#[cfg(test)]
mod tests {
    use super::*;

    // Exercises tokenization boundaries: empty input, pure whitespace, words,
    // word/whitespace/punctuation transitions, and leading indentation.
    #[test]
    fn test_tokenize() {
        let text = "";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), Vec::<&str>::new());

        let text = " ";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec![" "]);

        let text = "one";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec!["one"]);

        let text = "one\n";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec!["one", "\n"]);

        let text = "one.two(three)";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec!["one", ".", "two", "(", "three", ")"]
        );

        let text = "one two three()";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec!["one", " ", "two", " ", "three", "(", ")"]
        );

        let text = " one\n two three";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec![" ", "one", "\n ", "two", " ", "three"]
        );
    }

    // Word-level replacement within a line, pure insertion between lines, and
    // multiple hunks mixing word edits with a trailing insertion.
    #[test]
    fn test_text_diff() {
        let old_text = "one two three";
        let new_text = "one TWO three";
        assert_eq!(text_diff(old_text, new_text), [(4..7, "TWO".into()),]);

        let old_text = "one\ntwo\nthree\n";
        let new_text = "one\ntwo\nAND\nTHEN\nthree\n";
        assert_eq!(
            text_diff(old_text, new_text),
            [(8..8, "AND\nTHEN\n".into()),]
        );

        let old_text = "one two\nthree four five\nsix seven eight nine\nten\n";
        let new_text = "one two\nthree FOUR five\nsix SEVEN eight nine\nten\nELEVEN\n";
        assert_eq!(
            text_diff(old_text, new_text),
            [
                (14..18, "FOUR".into()),
                (28..33, "SEVEN".into()),
                (49..49, "ELEVEN\n".into())
            ]
        );
    }

    // Round-trip: a generated unified diff applies cleanly to the old text.
    #[test]
    fn test_apply_diff_patch() {
        let old_text = "one two\nthree four five\nsix seven eight nine\nten\n";
        let new_text = "one two\nthree FOUR five\nsix SEVEN eight nine\nten\nELEVEN\n";
        let patch = unified_diff(old_text, new_text);
        assert_eq!(apply_diff_patch(old_text, &patch).unwrap(), new_text);
    }

    // Round-trip: the reversed patch restores the old text from the new one.
    #[test]
    fn test_apply_reversed_diff_patch() {
        let old_text = "one two\nthree four five\nsix seven eight nine\nten\n";
        let new_text = "one two\nthree FOUR five\nsix SEVEN eight nine\nten\nELEVEN\n";
        let patch = unified_diff(old_text, new_text);
        assert_eq!(
            apply_reversed_diff_patch(new_text, &patch).unwrap(),
            old_text
        );
    }

    // Hunk headers shift by the given zero-indexed start lines while the diff
    // body stays unchanged.
    #[test]
    fn test_unified_diff_with_offsets() {
        let old_text = "foo\nbar\nbaz\n";
        let new_text = "foo\nBAR\nbaz\n";

        let expected_diff_body = " foo\n-bar\n+BAR\n baz\n";

        let diff_no_offset = unified_diff(old_text, new_text);
        assert_eq!(
            diff_no_offset,
            format!("@@ -1,3 +1,3 @@\n{}", expected_diff_body)
        );

        let diff_with_offset = unified_diff_with_offsets(old_text, new_text, 9, 11);
        assert_eq!(
            diff_with_offset,
            format!("@@ -10,3 +12,3 @@\n{}", expected_diff_body)
        );

        let diff_with_offset = unified_diff_with_offsets(old_text, new_text, 99, 104);
        assert_eq!(
            diff_with_offset,
            format!("@@ -100,3 +105,3 @@\n{}", expected_diff_body)
        );
    }
}