1use crate::{CharClassifier, CharKind, CharScopeContext, LanguageScope};
2use anyhow::{Context, anyhow};
3use imara_diff::{
4 Algorithm, UnifiedDiffBuilder, diff,
5 intern::{InternedInput, Token},
6 sources::lines_with_terminator,
7};
8use std::{iter, ops::Range, sync::Arc};
9
/// Maximum hunk size in bytes (checked on each side) still eligible for
/// word-level refinement in `text_diff_with_options`.
const MAX_WORD_DIFF_LEN: usize = 512;
/// Maximum hunk size in lines (checked on each side) still eligible for
/// word-level refinement in `text_diff_with_options`.
const MAX_WORD_DIFF_LINE_COUNT: usize = 8;
12
13/// Computes a diff between two strings, returning a unified diff string.
14pub fn unified_diff(old_text: &str, new_text: &str) -> String {
15 let input = InternedInput::new(old_text, new_text);
16 diff(
17 Algorithm::Histogram,
18 &input,
19 UnifiedDiffBuilder::new(&input),
20 )
21}
22
23/// Computes a diff between two strings, returning a vector of old and new row
24/// ranges.
25pub fn line_diff(old_text: &str, new_text: &str) -> Vec<(Range<u32>, Range<u32>)> {
26 let mut edits = Vec::new();
27 let input = InternedInput::new(
28 lines_with_terminator(old_text),
29 lines_with_terminator(new_text),
30 );
31 diff_internal(&input, |_, _, old_rows, new_rows| {
32 edits.push((old_rows, new_rows));
33 });
34 edits
35}
36
37/// Computes a diff between two strings, returning a vector of edits.
38///
39/// The edits are represented as tuples of byte ranges and replacement strings.
40///
41/// Internally, this function first performs a line-based diff, and then performs a second
42/// word-based diff within hunks that replace small numbers of lines.
43pub fn text_diff(old_text: &str, new_text: &str) -> Vec<(Range<usize>, Arc<str>)> {
44 text_diff_with_options(old_text, new_text, DiffOptions::default())
45}
46
47/// Computes word-level diff ranges between two strings.
48///
49/// Returns a tuple of (old_ranges, new_ranges) where each vector contains
50/// the byte ranges of changed words in the respective text.
51pub fn word_diff_ranges(
52 old_text: &str,
53 new_text: &str,
54 options: DiffOptions,
55) -> (Vec<Range<usize>>, Vec<Range<usize>>) {
56 let mut input: InternedInput<&str> = InternedInput::default();
57 input.update_before(tokenize(old_text, options.language_scope.clone()));
58 input.update_after(tokenize(new_text, options.language_scope));
59
60 let mut old_ranges: Vec<Range<usize>> = Vec::new();
61 let mut new_ranges: Vec<Range<usize>> = Vec::new();
62
63 diff_internal(&input, |old_byte_range, new_byte_range, _, _| {
64 if !old_byte_range.is_empty() {
65 if let Some(last) = old_ranges.last_mut()
66 && last.end >= old_byte_range.start
67 {
68 last.end = old_byte_range.end;
69 } else {
70 old_ranges.push(old_byte_range);
71 }
72 }
73
74 if !new_byte_range.is_empty() {
75 if let Some(last) = new_ranges.last_mut()
76 && last.end >= new_byte_range.start
77 {
78 last.end = new_byte_range.end;
79 } else {
80 new_ranges.push(new_byte_range);
81 }
82 }
83 });
84
85 (old_ranges, new_ranges)
86}
87
88pub struct DiffOptions {
89 pub language_scope: Option<LanguageScope>,
90 pub max_word_diff_len: usize,
91 pub max_word_diff_line_count: usize,
92}
93
94impl Default for DiffOptions {
95 fn default() -> Self {
96 Self {
97 language_scope: Default::default(),
98 max_word_diff_len: MAX_WORD_DIFF_LEN,
99 max_word_diff_line_count: MAX_WORD_DIFF_LINE_COUNT,
100 }
101 }
102}
103
/// Computes a diff between two strings, using a specific language scope's
/// word characters for word-level diffing.
///
/// Returns edits as `(old_byte_range, replacement_text)` pairs, in ascending
/// order of position in `old_text`.
pub fn text_diff_with_options(
    old_text: &str,
    new_text: &str,
    options: DiffOptions,
) -> Vec<(Range<usize>, Arc<str>)> {
    // Shared empty replacement so deletions don't allocate a string each.
    let empty: Arc<str> = Arc::default();
    let mut edits = Vec::new();
    // Reused for each hunk's word-level diff to avoid re-allocating the
    // token interner.
    let mut hunk_input = InternedInput::default();
    let input = InternedInput::new(
        lines_with_terminator(old_text),
        lines_with_terminator(new_text),
    );
    diff_internal(
        &input,
        |old_byte_range, new_byte_range, old_rows, new_rows| {
            if should_perform_word_diff_within_hunk(
                &old_rows,
                &old_byte_range,
                &new_rows,
                &new_byte_range,
                &options,
            ) {
                // Small replacement hunk: run a second, word-based diff over
                // just this hunk so edits cover only the changed words.
                let old_offset = old_byte_range.start;
                let new_offset = new_byte_range.start;
                hunk_input.clear();
                hunk_input.update_before(tokenize(
                    &old_text[old_byte_range],
                    options.language_scope.clone(),
                ));
                hunk_input.update_after(tokenize(
                    &new_text[new_byte_range],
                    options.language_scope.clone(),
                ));
                diff_internal(&hunk_input, |old_byte_range, new_byte_range, _, _| {
                    // Inner ranges are hunk-relative; rebase them onto the
                    // full texts.
                    let old_byte_range =
                        old_offset + old_byte_range.start..old_offset + old_byte_range.end;
                    let new_byte_range =
                        new_offset + new_byte_range.start..new_offset + new_byte_range.end;
                    let replacement_text = if new_byte_range.is_empty() {
                        empty.clone()
                    } else {
                        new_text[new_byte_range].into()
                    };
                    edits.push((old_byte_range, replacement_text));
                });
            } else {
                // Large (or insert/delete-only) hunk: emit it as a single
                // whole-line edit.
                let replacement_text = if new_byte_range.is_empty() {
                    empty.clone()
                } else {
                    new_text[new_byte_range].into()
                };
                edits.push((old_byte_range, replacement_text));
            }
        },
    );
    edits
}
163
164pub fn apply_diff_patch(base_text: &str, patch: &str) -> Result<String, anyhow::Error> {
165 let patch = diffy::Patch::from_str(patch).context("Failed to parse patch")?;
166 let result = diffy::apply(base_text, &patch);
167 result.map_err(|err| anyhow!(err))
168}
169
170fn should_perform_word_diff_within_hunk(
171 old_row_range: &Range<u32>,
172 old_byte_range: &Range<usize>,
173 new_row_range: &Range<u32>,
174 new_byte_range: &Range<usize>,
175 options: &DiffOptions,
176) -> bool {
177 !old_byte_range.is_empty()
178 && !new_byte_range.is_empty()
179 && old_byte_range.len() <= options.max_word_diff_len
180 && new_byte_range.len() <= options.max_word_diff_len
181 && old_row_range.len() <= options.max_word_diff_line_count
182 && new_row_range.len() <= options.max_word_diff_line_count
183}
184
/// Runs a histogram diff over `input`, invoking `on_change` once per hunk
/// with `(old_byte_range, new_byte_range, old_token_range, new_token_range)`.
///
/// Byte ranges are reconstructed from token lengths: hunks arrive in
/// ascending token order, so running byte offsets are kept for both sides
/// and advanced past the unchanged gap preceding each hunk.
fn diff_internal(
    input: &InternedInput<&str>,
    mut on_change: impl FnMut(Range<usize>, Range<usize>, Range<u32>, Range<u32>),
) {
    // Byte offset reached so far on each side.
    let mut old_offset = 0;
    let mut new_offset = 0;
    // Token index reached so far on each side (end of the previous hunk).
    let mut old_token_ix = 0;
    let mut new_token_ix = 0;
    diff(
        Algorithm::Histogram,
        input,
        |old_tokens: Range<u32>, new_tokens: Range<u32>| {
            // Skip the unchanged run between the previous hunk and this one.
            old_offset += token_len(
                input,
                &input.before[old_token_ix as usize..old_tokens.start as usize],
            );
            new_offset += token_len(
                input,
                &input.after[new_token_ix as usize..new_tokens.start as usize],
            );
            // Byte length of the changed tokens on each side.
            let old_len = token_len(
                input,
                &input.before[old_tokens.start as usize..old_tokens.end as usize],
            );
            let new_len = token_len(
                input,
                &input.after[new_tokens.start as usize..new_tokens.end as usize],
            );
            let old_byte_range = old_offset..old_offset + old_len;
            let new_byte_range = new_offset..new_offset + new_len;
            // Advance the cursors to the end of this hunk before yielding.
            old_token_ix = old_tokens.end;
            new_token_ix = new_tokens.end;
            old_offset = old_byte_range.end;
            new_offset = new_byte_range.end;
            on_change(old_byte_range, new_byte_range, old_tokens, new_tokens);
        },
    );
}
223
224fn tokenize(text: &str, language_scope: Option<LanguageScope>) -> impl Iterator<Item = &str> {
225 let classifier =
226 CharClassifier::new(language_scope).scope_context(Some(CharScopeContext::Completion));
227 let mut chars = text.char_indices();
228 let mut prev = None;
229 let mut start_ix = 0;
230 iter::from_fn(move || {
231 for (ix, c) in chars.by_ref() {
232 let mut token = None;
233 let kind = classifier.kind(c);
234 if let Some((prev_char, prev_kind)) = prev
235 && (kind != prev_kind || (kind == CharKind::Punctuation && c != prev_char))
236 {
237 token = Some(&text[start_ix..ix]);
238 start_ix = ix;
239 }
240 prev = Some((c, kind));
241 if token.is_some() {
242 return token;
243 }
244 }
245 if start_ix < text.len() {
246 let token = &text[start_ix..];
247 start_ix = text.len();
248 return Some(token);
249 }
250 None
251 })
252}
253
254fn token_len(input: &InternedInput<&str>, tokens: &[Token]) -> usize {
255 tokens
256 .iter()
257 .map(|token| input.interner[*token].len())
258 .sum()
259}
260
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize() {
        // Empty input produces no tokens.
        let text = "";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), Vec::<&str>::new());

        let text = " ";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec![" "]);

        let text = "one";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec!["one"]);

        let text = "one\n";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec!["one", "\n"]);

        // Differing adjacent punctuation splits into single-char tokens.
        let text = "one.two(three)";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec!["one", ".", "two", "(", "three", ")"]
        );

        let text = "one two three()";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec!["one", " ", "two", " ", "three", "(", ")"]
        );

        // A whitespace run spanning a newline stays in one token ("\n ").
        let text = " one\n two three";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec![" ", "one", "\n ", "two", " ", "three"]
        );
    }

    #[test]
    fn test_text_diff() {
        // Word-level replacement inside a single line.
        let old_text = "one two three";
        let new_text = "one TWO three";
        assert_eq!(text_diff(old_text, new_text), [(4..7, "TWO".into()),]);

        // Pure insertion: empty old range, replacement holds the new lines.
        let old_text = "one\ntwo\nthree\n";
        let new_text = "one\ntwo\nAND\nTHEN\nthree\n";
        assert_eq!(
            text_diff(old_text, new_text),
            [(8..8, "AND\nTHEN\n".into()),]
        );

        // Multiple hunks: two word replacements plus a trailing insertion.
        let old_text = "one two\nthree four five\nsix seven eight nine\nten\n";
        let new_text = "one two\nthree FOUR five\nsix SEVEN eight nine\nten\nELEVEN\n";
        assert_eq!(
            text_diff(old_text, new_text),
            [
                (14..18, "FOUR".into()),
                (28..33, "SEVEN".into()),
                (49..49, "ELEVEN\n".into())
            ]
        );
    }

    #[test]
    fn test_apply_diff_patch() {
        // Round trip: a generated unified diff applies back cleanly.
        let old_text = "one two\nthree four five\nsix seven eight nine\nten\n";
        let new_text = "one two\nthree FOUR five\nsix SEVEN eight nine\nten\nELEVEN\n";
        let patch = unified_diff(old_text, new_text);
        assert_eq!(apply_diff_patch(old_text, &patch).unwrap(), new_text);
    }
}