use crate::{CharClassifier, CharKind, LanguageScope};
use imara_diff::{
    Algorithm, UnifiedDiffBuilder, diff,
    intern::{InternedInput, Token},
    sources::lines_with_terminator,
};
use std::{iter, ops::Range, sync::Arc};

const MAX_WORD_DIFF_LEN: usize = 512;
const MAX_WORD_DIFF_LINE_COUNT: usize = 8;

/// Computes a diff between two strings, returning a unified diff string.
pub fn unified_diff(old_text: &str, new_text: &str) -> String {
    let input = InternedInput::new(old_text, new_text);
    diff(
        Algorithm::Histogram,
        &input,
        UnifiedDiffBuilder::new(&input),
    )
}
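
// Usage sketch (not part of the original module): the exact hunk formatting
// comes from `imara_diff::UnifiedDiffBuilder`, so the shape shown here is an
// assumption rather than a guaranteed format.
//
//     let patch = unified_diff("one\ntwo\nthree\n", "one\nTWO\nthree\n");
//     // Expected: a single "@@ ... @@" hunk containing "-two" and "+TWO" lines.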

/// Computes a diff between two strings, returning a vector of old and new row
/// ranges.
pub fn line_diff(old_text: &str, new_text: &str) -> Vec<(Range<u32>, Range<u32>)> {
    let mut edits = Vec::new();
    let input = InternedInput::new(
        lines_with_terminator(old_text),
        lines_with_terminator(new_text),
    );
    diff_internal(&input, |_, _, old_rows, new_rows| {
        edits.push((old_rows, new_rows));
    });
    edits
}
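
// Illustrative sketch (an assumption, not taken from the original tests): the
// returned row ranges are zero-based and end-exclusive, so changing only the
// middle line of a three-line file should report the second row on both sides:
//
//     let hunks = line_diff("one\ntwo\nthree\n", "one\nTWO\nthree\n");
//     // Expected: hunks == vec![(1..2, 1..2)]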

/// Computes a diff between two strings, returning a vector of edits.
///
/// The edits are represented as tuples of byte ranges and replacement strings.
///
/// Internally, this function first performs a line-based diff, and then performs a second
/// word-based diff within hunks that replace small numbers of lines.
pub fn text_diff(old_text: &str, new_text: &str) -> Vec<(Range<usize>, Arc<str>)> {
    text_diff_with_options(old_text, new_text, DiffOptions::default())
}

/// Options controlling how [`text_diff_with_options`] refines line-based hunks
/// with a word-level diff.
pub struct DiffOptions {
    /// The language scope whose word characters are used when tokenizing hunks.
    pub language_scope: Option<LanguageScope>,
    /// The maximum hunk size, in bytes, for which a word-level diff is attempted.
    pub max_word_diff_len: usize,
    /// The maximum number of lines in a hunk for which a word-level diff is attempted.
    pub max_word_diff_line_count: usize,
}

impl Default for DiffOptions {
    fn default() -> Self {
        Self {
            language_scope: Default::default(),
            max_word_diff_len: MAX_WORD_DIFF_LEN,
            max_word_diff_line_count: MAX_WORD_DIFF_LINE_COUNT,
        }
    }
}
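
// Construction sketch (hypothetical values, shown only to illustrate combining
// custom limits with the defaults above via struct-update syntax):
//
//     let options = DiffOptions {
//         max_word_diff_len: 256,
//         max_word_diff_line_count: 4,
//         ..DiffOptions::default()
//     };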

/// Computes a diff between two strings, like [`text_diff`], but using the given
/// [`DiffOptions`] to control word-level diffing, including the language scope
/// whose word characters are used for tokenization.
pub fn text_diff_with_options(
    old_text: &str,
    new_text: &str,
    options: DiffOptions,
) -> Vec<(Range<usize>, Arc<str>)> {
    let empty: Arc<str> = Arc::default();
    let mut edits = Vec::new();
    let mut hunk_input = InternedInput::default();
    let input = InternedInput::new(
        lines_with_terminator(old_text),
        lines_with_terminator(new_text),
    );
    diff_internal(
        &input,
        |old_byte_range, new_byte_range, old_rows, new_rows| {
            if should_perform_word_diff_within_hunk(
                &old_rows,
                &old_byte_range,
                &new_rows,
                &new_byte_range,
                &options,
            ) {
                // Refine small replacement hunks with a second, word-level diff,
                // mapping hunk-relative byte ranges back to absolute offsets.
                let old_offset = old_byte_range.start;
                let new_offset = new_byte_range.start;
                hunk_input.clear();
                hunk_input.update_before(tokenize(
                    &old_text[old_byte_range.clone()],
                    options.language_scope.clone(),
                ));
                hunk_input.update_after(tokenize(
                    &new_text[new_byte_range.clone()],
                    options.language_scope.clone(),
                ));
                diff_internal(&hunk_input, |old_byte_range, new_byte_range, _, _| {
                    let old_byte_range =
                        old_offset + old_byte_range.start..old_offset + old_byte_range.end;
                    let new_byte_range =
                        new_offset + new_byte_range.start..new_offset + new_byte_range.end;
                    let replacement_text = if new_byte_range.is_empty() {
                        empty.clone()
                    } else {
                        new_text[new_byte_range.clone()].into()
                    };
                    edits.push((old_byte_range, replacement_text));
                });
            } else {
                // Hunks that are too large, or that purely insert or delete lines,
                // are emitted as whole-line edits.
                let replacement_text = if new_byte_range.is_empty() {
                    empty.clone()
                } else {
                    new_text[new_byte_range.clone()].into()
                };
                edits.push((old_byte_range.clone(), replacement_text));
            }
        },
    );
    edits
}
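
// Usage sketch, mirroring the default-options expectations in `test_text_diff`
// below; a `language_scope` of `None` falls back to the default classifier:
//
//     let edits =
//         text_diff_with_options("one two three", "one TWO three", DiffOptions::default());
//     // Expected: edits == vec![(4..7, Arc::from("TWO"))]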

/// Returns whether a replacement hunk is small enough, in both bytes and lines,
/// to be refined with a word-level diff.
fn should_perform_word_diff_within_hunk(
    old_row_range: &Range<u32>,
    old_byte_range: &Range<usize>,
    new_row_range: &Range<u32>,
    new_byte_range: &Range<usize>,
    options: &DiffOptions,
) -> bool {
    !old_byte_range.is_empty()
        && !new_byte_range.is_empty()
        && old_byte_range.len() <= options.max_word_diff_len
        && new_byte_range.len() <= options.max_word_diff_len
        && old_row_range.len() <= options.max_word_diff_line_count
        && new_row_range.len() <= options.max_word_diff_line_count
}

/// Performs a histogram diff over `input`, invoking `on_change` once per change
/// with the old and new byte ranges followed by the old and new token ranges.
fn diff_internal(
    input: &InternedInput<&str>,
    mut on_change: impl FnMut(Range<usize>, Range<usize>, Range<u32>, Range<u32>),
) {
    let mut old_offset = 0;
    let mut new_offset = 0;
    let mut old_token_ix = 0;
    let mut new_token_ix = 0;
    diff(
        Algorithm::Histogram,
        input,
        |old_tokens: Range<u32>, new_tokens: Range<u32>| {
            // Advance the byte offsets across the unchanged tokens that precede
            // this change, then measure the changed tokens themselves.
            old_offset += token_len(
                input,
                &input.before[old_token_ix as usize..old_tokens.start as usize],
            );
            new_offset += token_len(
                input,
                &input.after[new_token_ix as usize..new_tokens.start as usize],
            );
            let old_len = token_len(
                input,
                &input.before[old_tokens.start as usize..old_tokens.end as usize],
            );
            let new_len = token_len(
                input,
                &input.after[new_tokens.start as usize..new_tokens.end as usize],
            );
            let old_byte_range = old_offset..old_offset + old_len;
            let new_byte_range = new_offset..new_offset + new_len;
            old_token_ix = old_tokens.end;
            new_token_ix = new_tokens.end;
            old_offset = old_byte_range.end;
            new_offset = new_byte_range.end;
            on_change(old_byte_range, new_byte_range, old_tokens, new_tokens);
        },
    );
}
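
// Worked example (illustrative, derived from the offset bookkeeping above):
// diffing the line-tokenized inputs "a\nb\nc\n" and "a\nB\nc\n" should invoke
// `on_change` exactly once, for the middle line:
//
//     let input = InternedInput::new(
//         lines_with_terminator("a\nb\nc\n"),
//         lines_with_terminator("a\nB\nc\n"),
//     );
//     diff_internal(&input, |old_bytes, new_bytes, old_rows, new_rows| {
//         // Expected: old_bytes == 2..4, new_bytes == 2..4,
//         // old_rows == 1..2, new_rows == 1..2.
//     });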

/// Splits `text` into tokens for word-level diffing: runs of characters that
/// share the same `CharKind`, splitting additionally between differing
/// punctuation characters.
fn tokenize(text: &str, language_scope: Option<LanguageScope>) -> impl Iterator<Item = &str> {
    let classifier = CharClassifier::new(language_scope).for_completion(true);
    let mut chars = text.char_indices();
    let mut prev = None;
    let mut start_ix = 0;
    iter::from_fn(move || {
        while let Some((ix, c)) = chars.next() {
            let mut token = None;
            let kind = classifier.kind(c);
            if let Some((prev_char, prev_kind)) = prev {
                if kind != prev_kind || (kind == CharKind::Punctuation && c != prev_char) {
                    token = Some(&text[start_ix..ix]);
                    start_ix = ix;
                }
            }
            prev = Some((c, kind));
            if token.is_some() {
                return token;
            }
        }
        // Emit the final token once the character iterator is exhausted.
        if start_ix < text.len() {
            let token = &text[start_ix..];
            start_ix = text.len();
            return Some(token);
        }
        None
    })
}

/// Returns the total length, in bytes, of the given interned tokens.
fn token_len(input: &InternedInput<&str>, tokens: &[Token]) -> usize {
    tokens
        .iter()
        .map(|token| input.interner[*token].len())
        .sum()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize() {
        let text = "";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), Vec::<&str>::new());

        let text = " ";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec![" "]);

        let text = "one";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec!["one"]);

        let text = "one\n";
        assert_eq!(tokenize(text, None).collect::<Vec<_>>(), vec!["one", "\n"]);

        let text = "one.two(three)";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec!["one", ".", "two", "(", "three", ")"]
        );

        let text = "one two three()";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec!["one", " ", "two", " ", "three", "(", ")"]
        );

        let text = " one\n two three";
        assert_eq!(
            tokenize(text, None).collect::<Vec<_>>(),
            vec![" ", "one", "\n ", "two", " ", "three"]
        );
    }

    #[test]
    fn test_text_diff() {
        let old_text = "one two three";
        let new_text = "one TWO three";
        assert_eq!(text_diff(old_text, new_text), [(4..7, "TWO".into()),]);

        let old_text = "one\ntwo\nthree\n";
        let new_text = "one\ntwo\nAND\nTHEN\nthree\n";
        assert_eq!(
            text_diff(old_text, new_text),
            [(8..8, "AND\nTHEN\n".into()),]
        );

        let old_text = "one two\nthree four five\nsix seven eight nine\nten\n";
        let new_text = "one two\nthree FOUR five\nsix SEVEN eight nine\nten\nELEVEN\n";
        assert_eq!(
            text_diff(old_text, new_text),
            [
                (14..18, "FOUR".into()),
                (28..33, "SEVEN".into()),
                (49..49, "ELEVEN\n".into())
            ]
        );
    }
}