1use crate::{CharClassifier, CharKind, CharScopeContext, LanguageScope};
2use anyhow::{Context, anyhow};
3use imara_diff::{
4 Algorithm, Sink, diff,
5 intern::{InternedInput, Interner, Token},
6 sources::lines_with_terminator,
7};
8use std::{fmt::Write, iter, ops::Range, sync::Arc};
9
/// Maximum hunk size in bytes (per side) for which a word-level diff is
/// attempted; larger hunks are emitted as plain line-level edits.
const MAX_WORD_DIFF_LEN: usize = 512;
/// Maximum hunk size in lines (per side) for which a word-level diff is
/// attempted.
const MAX_WORD_DIFF_LINE_COUNT: usize = 8;
12
/// Computes a diff between two strings, returning a unified diff string.
///
/// Hunk headers are numbered from the start of each text (1-based) with the
/// default of 3 context lines; see [`unified_diff_with_offsets`] to shift
/// the line numbers.
pub fn unified_diff(old_text: &str, new_text: &str) -> String {
    unified_diff_with_offsets(old_text, new_text, 0, 0)
}
17
/// Computes a diff between two strings, returning a unified diff string with
/// hunk headers adjusted to reflect the given starting line numbers (zero-indexed).
///
/// Uses 3 lines of context around each change; use
/// [`unified_diff_with_context`] to customize the context width.
pub fn unified_diff_with_offsets(
    old_text: &str,
    new_text: &str,
    old_start_line: u32,
    new_start_line: u32,
) -> String {
    unified_diff_with_context(old_text, new_text, old_start_line, new_start_line, 3)
}
28
/// Computes a diff between two strings, returning a unified diff string with
/// hunk headers adjusted to reflect the given starting line numbers (zero-indexed),
/// and a configurable number of context lines around changes.
pub fn unified_diff_with_context(
    old_text: &str,
    new_text: &str,
    old_start_line: u32,
    new_start_line: u32,
    context_lines: u32,
) -> String {
    // Intern both texts, then run a histogram diff whose sink assembles the
    // unified-diff output, applying the offsets to each hunk header.
    let input = InternedInput::new(old_text, new_text);
    diff(
        Algorithm::Histogram,
        &input,
        OffsetUnifiedDiffBuilder::new(&input, old_start_line, new_start_line, context_lines),
    )
}
46
/// A unified diff builder that applies line number offsets to hunk headers.
struct OffsetUnifiedDiffBuilder<'a> {
    // Interned token sequences for the old and new texts.
    before: &'a [Token],
    after: &'a [Token],
    // Maps tokens back to their source text.
    interner: &'a Interner<&'a str>,

    // Index into `before` up to which lines have been consumed.
    pos: u32,
    // Start position and accumulated length of the hunk currently being
    // built, for the old ("-") and new ("+") sides respectively.
    before_hunk_start: u32,
    after_hunk_start: u32,
    before_hunk_len: u32,
    after_hunk_len: u32,

    // Zero-indexed offsets added to the 1-based line numbers written in each
    // `@@ -a,b +c,d @@` header.
    old_line_offset: u32,
    new_line_offset: u32,
    // Number of unchanged lines to include around each change.
    context_lines: u32,

    // Body of the hunk currently being built.
    buffer: String,
    // Accumulated diff output.
    dst: String,
}
66
impl<'a> OffsetUnifiedDiffBuilder<'a> {
    /// Creates a builder over `input`'s interned tokens, with the given
    /// hunk-header line offsets and number of context lines.
    fn new(
        input: &'a InternedInput<&'a str>,
        old_line_offset: u32,
        new_line_offset: u32,
        context_lines: u32,
    ) -> Self {
        Self {
            before_hunk_start: 0,
            after_hunk_start: 0,
            before_hunk_len: 0,
            after_hunk_len: 0,
            old_line_offset,
            new_line_offset,
            context_lines,
            buffer: String::with_capacity(8),
            dst: String::new(),
            interner: &input.interner,
            before: &input.before,
            after: &input.after,
            pos: 0,
        }
    }

    /// Appends each token's text as a line to the hunk buffer, prefixed with
    /// `prefix` (' ' for context, '-' for removals, '+' for additions).
    fn print_tokens(&mut self, tokens: &[Token], prefix: char) {
        for &token in tokens {
            writeln!(&mut self.buffer, "{prefix}{}", self.interner[token]).unwrap();
        }
    }

    /// Emits the pending hunk to `dst`: trailing context lines, then the
    /// `@@ -a,b +c,d @@` header (with the configured offsets applied), then
    /// the buffered body. Does nothing when no change has been recorded.
    fn flush(&mut self) {
        if self.before_hunk_len == 0 && self.after_hunk_len == 0 {
            return;
        }

        // Include up to `context_lines` of trailing context after the last
        // change, clamped to the end of the old text.
        let end = (self.pos + self.context_lines).min(self.before.len() as u32);
        self.update_pos(end, end);

        writeln!(
            &mut self.dst,
            "@@ -{},{} +{},{} @@",
            self.before_hunk_start + 1 + self.old_line_offset,
            self.before_hunk_len,
            self.after_hunk_start + 1 + self.new_line_offset,
            self.after_hunk_len,
        )
        .unwrap();
        write!(&mut self.dst, "{}", &self.buffer).unwrap();
        self.buffer.clear();
        self.before_hunk_len = 0;
        self.after_hunk_len = 0;
    }

    /// Prints the unchanged lines from `pos` up to `print_to` as context,
    /// moves `pos` to `move_to`, and counts the printed lines toward both
    /// hunk lengths.
    fn update_pos(&mut self, print_to: u32, move_to: u32) {
        self.print_tokens(&self.before[self.pos as usize..print_to as usize], ' ');
        let len = print_to - self.pos;
        self.pos = move_to;
        self.before_hunk_len += len;
        self.after_hunk_len += len;
    }
}
128
impl Sink for OffsetUnifiedDiffBuilder<'_> {
    type Out = String;

    /// Receives each change from `imara_diff` as token (line) ranges into
    /// the old (`before`) and new (`after`) texts.
    fn process_change(&mut self, before: Range<u32>, after: Range<u32>) {
        // If the gap of unchanged lines since the previous change exceeds
        // two context windows, the hunks can't be merged; emit the pending
        // one and start fresh.
        if before.start - self.pos > self.context_lines * 2 {
            self.flush();
        }
        // Starting a new hunk: back up to include leading context.
        if self.before_hunk_len == 0 && self.after_hunk_len == 0 {
            self.pos = before.start.saturating_sub(self.context_lines);
            self.before_hunk_start = self.pos;
            self.after_hunk_start = after.start.saturating_sub(self.context_lines);
        }
        // Emit intervening context, then the removed and added lines.
        self.update_pos(before.start, before.end);
        self.before_hunk_len += before.end - before.start;
        self.after_hunk_len += after.end - after.start;
        self.print_tokens(
            &self.before[before.start as usize..before.end as usize],
            '-',
        );
        self.print_tokens(&self.after[after.start as usize..after.end as usize], '+');
    }

    /// Flushes any pending hunk and returns the accumulated diff text.
    fn finish(mut self) -> Self::Out {
        self.flush();
        self.dst
    }
}
156
157/// Computes a diff between two strings, returning a vector of old and new row
158/// ranges.
159pub fn line_diff(old_text: &str, new_text: &str) -> Vec<(Range<u32>, Range<u32>)> {
160 let mut edits = Vec::new();
161 let input = InternedInput::new(
162 lines_with_terminator(old_text),
163 lines_with_terminator(new_text),
164 );
165 diff_internal(&input, &mut |_, _, old_rows, new_rows| {
166 edits.push((old_rows, new_rows));
167 });
168 edits
169}
170
/// Computes a diff between two strings, returning a vector of edits.
///
/// The edits are represented as tuples of byte ranges and replacement strings.
///
/// Internally, this function first performs a line-based diff, and then performs a second
/// word-based diff within hunks that replace small numbers of lines.
///
/// Equivalent to [`text_diff_with_options`] with [`DiffOptions::default`].
pub fn text_diff(old_text: &str, new_text: &str) -> Vec<(Range<usize>, Arc<str>)> {
    text_diff_with_options(old_text, new_text, DiffOptions::default())
}
180
181/// Computes word-level diff ranges between two strings.
182///
183/// Returns a tuple of (old_ranges, new_ranges) where each vector contains
184/// the byte ranges of changed words in the respective text.
185pub fn word_diff_ranges(
186 old_text: &str,
187 new_text: &str,
188 options: DiffOptions,
189) -> (Vec<Range<usize>>, Vec<Range<usize>>) {
190 let mut input: InternedInput<&str> = InternedInput::default();
191 input.update_before(tokenize(old_text, options.language_scope.clone()));
192 input.update_after(tokenize(new_text, options.language_scope));
193
194 let mut old_ranges: Vec<Range<usize>> = Vec::new();
195 let mut new_ranges: Vec<Range<usize>> = Vec::new();
196
197 diff_internal(&input, &mut |old_byte_range, new_byte_range, _, _| {
198 if !old_byte_range.is_empty() {
199 if let Some(last) = old_ranges.last_mut()
200 && last.end >= old_byte_range.start
201 {
202 last.end = old_byte_range.end;
203 } else {
204 old_ranges.push(old_byte_range);
205 }
206 }
207
208 if !new_byte_range.is_empty() {
209 if let Some(last) = new_ranges.last_mut()
210 && last.end >= new_byte_range.start
211 {
212 last.end = new_byte_range.end;
213 } else {
214 new_ranges.push(new_byte_range);
215 }
216 }
217 });
218
219 (old_ranges, new_ranges)
220}
221
222/// Computes character-level diff between two strings.
223///
224/// Usually, you should use `text_diff`, which performs a word-wise diff.
225pub fn char_diff<'a>(old_text: &'a str, new_text: &'a str) -> Vec<(Range<usize>, &'a str)> {
226 let mut input: InternedInput<&str> = InternedInput::default();
227 input.update_before(tokenize_chars(old_text));
228 input.update_after(tokenize_chars(new_text));
229 let mut edits: Vec<(Range<usize>, &str)> = Vec::new();
230 diff_internal(&input, &mut |old_byte_range, new_byte_range, _, _| {
231 let replacement = if new_byte_range.is_empty() {
232 ""
233 } else {
234 &new_text[new_byte_range]
235 };
236 edits.push((old_byte_range, replacement));
237 });
238 edits
239}
240
#[derive(Clone)]
pub struct DiffOptions {
    /// Language scope passed to the tokenizer's `CharClassifier` for
    /// word-level diffing; `None` uses the classifier's defaults.
    pub language_scope: Option<LanguageScope>,
    /// Maximum hunk size in bytes (per side) for word-level diffing.
    pub max_word_diff_len: usize,
    /// Maximum hunk size in lines (per side) for word-level diffing.
    pub max_word_diff_line_count: usize,
}
247
248impl Default for DiffOptions {
249 fn default() -> Self {
250 Self {
251 language_scope: Default::default(),
252 max_word_diff_len: MAX_WORD_DIFF_LEN,
253 max_word_diff_line_count: MAX_WORD_DIFF_LINE_COUNT,
254 }
255 }
256}
257
/// Computes a diff between two strings, using a specific language scope's
/// word characters for word-level diffing.
///
/// Performs a line-level diff first; hunks small enough per
/// `should_perform_word_diff_within_hunk` are refined with a second,
/// word-level diff whose byte ranges are translated back into whole-text
/// coordinates. Returns `(old_byte_range, replacement_text)` edits.
pub fn text_diff_with_options(
    old_text: &str,
    new_text: &str,
    options: DiffOptions,
) -> Vec<(Range<usize>, Arc<str>)> {
    // Single shared allocation used as the replacement for all deletions.
    let empty: Arc<str> = Arc::default();
    let mut edits = Vec::new();
    // Reused (and cleared) for each hunk's word-level diff.
    let mut hunk_input = InternedInput::default();
    let input = InternedInput::new(
        lines_with_terminator(old_text),
        lines_with_terminator(new_text),
    );
    diff_internal(&input, &mut |old_byte_range,
                                new_byte_range,
                                old_rows,
                                new_rows| {
        if should_perform_word_diff_within_hunk(
            &old_rows,
            &old_byte_range,
            &new_rows,
            &new_byte_range,
            &options,
        ) {
            // Word-diff just this hunk's text; the offsets translate the
            // hunk-relative byte ranges back to whole-text coordinates.
            let old_offset = old_byte_range.start;
            let new_offset = new_byte_range.start;
            hunk_input.clear();
            hunk_input.update_before(tokenize(
                &old_text[old_byte_range],
                options.language_scope.clone(),
            ));
            hunk_input.update_after(tokenize(
                &new_text[new_byte_range],
                options.language_scope.clone(),
            ));
            diff_internal(&hunk_input, &mut |old_byte_range, new_byte_range, _, _| {
                let old_byte_range =
                    old_offset + old_byte_range.start..old_offset + old_byte_range.end;
                let new_byte_range =
                    new_offset + new_byte_range.start..new_offset + new_byte_range.end;
                let replacement_text = if new_byte_range.is_empty() {
                    empty.clone()
                } else {
                    new_text[new_byte_range].into()
                };
                edits.push((old_byte_range, replacement_text));
            });
        } else {
            // Hunk too large (or one side empty): emit it as a single
            // line-level edit.
            let replacement_text = if new_byte_range.is_empty() {
                empty.clone()
            } else {
                new_text[new_byte_range].into()
            };
            edits.push((old_byte_range, replacement_text));
        }
    });
    edits
}
317
318pub fn apply_diff_patch(base_text: &str, patch: &str) -> Result<String, anyhow::Error> {
319 let patch = diffy::Patch::from_str(patch).context("Failed to parse patch")?;
320 let result = diffy::apply(base_text, &patch);
321 result.map_err(|err| anyhow!(err))
322}
323
324pub fn apply_reversed_diff_patch(base_text: &str, patch: &str) -> Result<String, anyhow::Error> {
325 let patch = diffy::Patch::from_str(patch).context("Failed to parse patch")?;
326 let reversed = patch.reverse();
327 diffy::apply(base_text, &reversed).map_err(|err| anyhow!(err))
328}
329
330fn should_perform_word_diff_within_hunk(
331 old_row_range: &Range<u32>,
332 old_byte_range: &Range<usize>,
333 new_row_range: &Range<u32>,
334 new_byte_range: &Range<usize>,
335 options: &DiffOptions,
336) -> bool {
337 !old_byte_range.is_empty()
338 && !new_byte_range.is_empty()
339 && old_byte_range.len() <= options.max_word_diff_len
340 && new_byte_range.len() <= options.max_word_diff_len
341 && old_row_range.len() <= options.max_word_diff_line_count
342 && new_row_range.len() <= options.max_word_diff_line_count
343}
344
/// Runs a histogram diff over `input`, invoking `on_change` for each change
/// with `(old_byte_range, new_byte_range, old_token_range, new_token_range)`.
///
/// `imara_diff` reports changes only as token-index ranges; the byte ranges
/// are reconstructed here by accumulating interned-token lengths.
fn diff_internal(
    input: &InternedInput<&str>,
    on_change: &mut dyn FnMut(Range<usize>, Range<usize>, Range<u32>, Range<u32>),
) {
    // Byte offset and token index consumed so far in each text.
    let mut old_offset = 0;
    let mut new_offset = 0;
    let mut old_token_ix = 0;
    let mut new_token_ix = 0;
    diff(
        Algorithm::Histogram,
        input,
        |old_tokens: Range<u32>, new_tokens: Range<u32>| {
            // Advance the byte offsets across the unchanged tokens that
            // precede this change.
            old_offset += token_len(
                input,
                &input.before[old_token_ix as usize..old_tokens.start as usize],
            );
            new_offset += token_len(
                input,
                &input.after[new_token_ix as usize..new_tokens.start as usize],
            );
            // Byte length of the changed region on each side.
            let old_len = token_len(
                input,
                &input.before[old_tokens.start as usize..old_tokens.end as usize],
            );
            let new_len = token_len(
                input,
                &input.after[new_tokens.start as usize..new_tokens.end as usize],
            );
            let old_byte_range = old_offset..old_offset + old_len;
            let new_byte_range = new_offset..new_offset + new_len;
            // Record where this change ended before handing off the ranges.
            old_token_ix = old_tokens.end;
            new_token_ix = new_tokens.end;
            old_offset = old_byte_range.end;
            new_offset = new_byte_range.end;
            on_change(old_byte_range, new_byte_range, old_tokens, new_tokens);
        },
    );
}
383
/// Splits `text` into one token per Unicode scalar value, where each token is
/// the `&str` slice covering that character's bytes.
fn tokenize_chars(text: &str) -> impl Iterator<Item = &str> {
    text.char_indices()
        .map(move |(start, ch)| &text[start..start + ch.len_utf8()])
}
391
/// Splits `text` into tokens for word-level diffing.
///
/// Consecutive characters are grouped while their `CharKind` is unchanged,
/// except that punctuation only groups with repeats of the same character
/// (e.g. `"()"` yields `"("` then `")"`). Classification uses the
/// completion scope context.
fn tokenize(text: &str, language_scope: Option<LanguageScope>) -> impl Iterator<Item = &str> {
    let classifier =
        CharClassifier::new(language_scope).scope_context(Some(CharScopeContext::Completion));
    let mut chars = text.char_indices();
    // Previously seen character and its kind, if any.
    let mut prev = None;
    // Byte index where the current token began.
    let mut start_ix = 0;
    iter::from_fn(move || {
        for (ix, c) in chars.by_ref() {
            let mut token = None;
            let kind = classifier.kind(c);
            // A token ends when the kind changes, or between two different
            // punctuation characters.
            if let Some((prev_char, prev_kind)) = prev
                && (kind != prev_kind || (kind == CharKind::Punctuation && c != prev_char))
            {
                token = Some(&text[start_ix..ix]);
                start_ix = ix;
            }
            prev = Some((c, kind));
            if token.is_some() {
                return token;
            }
        }
        // Emit the final token once the character stream is exhausted.
        if start_ix < text.len() {
            let token = &text[start_ix..];
            start_ix = text.len();
            return Some(token);
        }
        None
    })
}
421
422fn token_len(input: &InternedInput<&str>, tokens: &[Token]) -> usize {
423 tokens
424 .iter()
425 .map(|token| input.interner[*token].len())
426 .sum()
427}
428
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE(review): the non-ASCII string literals in this module appear
    // mojibake-damaged in this view; verify them against the original file's
    // encoding before relying on the exact bytes.

    // Tokenization keeps whitespace runs together, splits on kind changes,
    // and splits between distinct punctuation characters.
    #[test]
    fn test_tokenize() {
        let text = "";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), Vec::<&str>::new());

        let text = " ";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec![" "]);

        let text = "one";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec!["one"]);

        let text = "one\n";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec!["one", "\n"]);

        let text = "one.two(three)";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec!["one", ".", "two", "(", "three", ")"]
        );

        let text = "one two three()";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec!["one", " ", "two", " ", "three", "(", ")"]
        );

        let text = " one\n two three";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec![" ", "one", "\n ", "two", " ", "three"]
        );
    }

    // Small hunks are refined to word-granular byte-range edits; pure
    // insertions produce empty old ranges.
    #[test]
    fn test_text_diff() {
        let old_text = "one two three";
        let new_text = "one TWO three";
        assert_eq!(text_diff(old_text, new_text), [(4..7, "TWO".into()),]);

        let old_text = "one\ntwo\nthree\n";
        let new_text = "one\ntwo\nAND\nTHEN\nthree\n";
        assert_eq!(
            text_diff(old_text, new_text),
            [(8..8, "AND\nTHEN\n".into()),]
        );

        let old_text = "one two\nthree four five\nsix seven eight nine\nten\n";
        let new_text = "one two\nthree FOUR five\nsix SEVEN eight nine\nten\nELEVEN\n";
        assert_eq!(
            text_diff(old_text, new_text),
            [
                (14..18, "FOUR".into()),
                (28..33, "SEVEN".into()),
                (49..49, "ELEVEN\n".into())
            ]
        );
    }

    // Round-trip: a diff produced by `unified_diff` applies cleanly via diffy.
    #[test]
    fn test_apply_diff_patch() {
        let old_text = "one two\nthree four five\nsix seven eight nine\nten\n";
        let new_text = "one two\nthree FOUR five\nsix SEVEN eight nine\nten\nELEVEN\n";
        let patch = unified_diff(old_text, new_text);
        assert_eq!(apply_diff_patch(old_text, &patch).unwrap(), new_text);
    }

    // Reversed application undoes the patch: new text + patch -> old text.
    #[test]
    fn test_apply_reversed_diff_patch() {
        let old_text = "one two\nthree four five\nsix seven eight nine\nten\n";
        let new_text = "one two\nthree FOUR five\nsix SEVEN eight nine\nten\nELEVEN\n";
        let patch = unified_diff(old_text, new_text);
        assert_eq!(
            apply_reversed_diff_patch(new_text, &patch).unwrap(),
            old_text
        );
    }

    // Character-level edits use byte offsets, so multi-byte characters
    // advance the ranges by their UTF-8 width.
    #[test]
    fn test_char_diff() {
        assert_eq!(char_diff("", ""), vec![]);
        assert_eq!(char_diff("", "abc"), vec![(0..0, "abc")]);
        assert_eq!(char_diff("abc", ""), vec![(0..3, "")]);
        assert_eq!(char_diff("ac", "abc"), vec![(1..1, "b")]); // "b" inserted
        assert_eq!(char_diff("abc", "ac"), vec![(1..2, "")]); // "b" deleted
        assert_eq!(char_diff("abc", "adc"), vec![(1..2, "d")]); // "b" replaced with "d"
        assert_eq!(char_diff("ζ₯", "ζ₯ζ¬θͺ"), vec![(3..3, "ζ¬θͺ")]); // "ζ¬θͺ" inserted
        assert_eq!(char_diff("ζ₯ζ¬θͺ", "ζ₯"), vec![(3..9, "")]); // "ζ¬θͺ" deleted
        assert_eq!(char_diff("π", "πππ"), vec![(4..4, "ππ")]); // "ππ" inserted
        assert_eq!(
            char_diff("testζ₯ζ¬", "testζ₯ζ¬θͺγ§γ"),
            vec![(10..10, "θͺγ§γ")]
        );
    }

    // Offsets shift only the hunk-header line numbers; the body is unchanged.
    #[test]
    fn test_unified_diff_with_offsets() {
        let old_text = "foo\nbar\nbaz\n";
        let new_text = "foo\nBAR\nbaz\n";

        let expected_diff_body = " foo\n-bar\n+BAR\n baz\n";

        let diff_no_offset = unified_diff(old_text, new_text);
        assert_eq!(
            diff_no_offset,
            format!("@@ -1,3 +1,3 @@\n{}", expected_diff_body)
        );

        let diff_with_offset = unified_diff_with_offsets(old_text, new_text, 9, 11);
        assert_eq!(
            diff_with_offset,
            format!("@@ -10,3 +12,3 @@\n{}", expected_diff_body)
        );

        let diff_with_offset = unified_diff_with_offsets(old_text, new_text, 99, 104);
        assert_eq!(
            diff_with_offset,
            format!("@@ -100,3 +105,3 @@\n{}", expected_diff_body)
        );
    }

    // The context width controls how many unchanged lines surround a change.
    #[test]
    fn test_unified_diff_with_context() {
        // Test that full context includes all lines from the start
        let old_text = "line1\nline2\nline3\nline4\nline5\nCHANGE_ME\nline7\nline8\n";
        let new_text = "line1\nline2\nline3\nline4\nline5\nCHANGED\nline7\nline8\n";

        // With default 3 lines of context, the diff starts at line 3
        let diff_default = unified_diff_with_offsets(old_text, new_text, 0, 0);
        assert_eq!(
            diff_default,
            "@@ -3,6 +3,6 @@\n line3\n line4\n line5\n-CHANGE_ME\n+CHANGED\n line7\n line8\n"
        );

        // With full context (8 lines), the diff starts at line 1
        let diff_full_context = unified_diff_with_context(old_text, new_text, 0, 0, 8);
        assert_eq!(
            diff_full_context,
            "@@ -1,8 +1,8 @@\n line1\n line2\n line3\n line4\n line5\n-CHANGE_ME\n+CHANGED\n line7\n line8\n"
        );

        // With 0 context, only the changed line is shown
        let diff_no_context = unified_diff_with_context(old_text, new_text, 0, 0, 0);
        assert_eq!(diff_no_context, "@@ -6,1 +6,1 @@\n-CHANGE_ME\n+CHANGED\n");
    }
}