1use derive_more::{Add, AddAssign};
2use smallvec::SmallVec;
3use std::{cmp, mem, ops::Range};
4
/// Closing tag that terminates an `<old_text>` section.
const OLD_TEXT_END_TAG: &str = "</old_text>";
/// Closing tag that terminates a `<new_text>` section.
const NEW_TEXT_END_TAG: &str = "</new_text>";
/// Byte length of an end tag. Both end tags must have this exact length so
/// that a match on either one can be described by the same range width.
const END_TAG_LEN: usize = OLD_TEXT_END_TAG.len();
// Compile-time guard for the invariant above. `assert!` (rather than
// `debug_assert!`) keeps the check active in every build profile; it is
// evaluated during constant evaluation, so it has no runtime cost.
const _: () = assert!(OLD_TEXT_END_TAG.len() == NEW_TEXT_END_TAG.len());
9
/// An event emitted by the edit parser while it consumes streamed text.
#[derive(Debug)]
pub enum EditParserEvent {
    /// The full contents of an `<old_text>` section, emitted once its end tag
    /// has been seen.
    OldText(String),
    /// A piece of the contents of a `<new_text>` section.
    NewTextChunk {
        /// The text carried by this event.
        chunk: String,
        /// `true` when this chunk was produced by reaching the section's end
        /// tag, `false` for intermediate chunks.
        done: bool,
    },
}
15
16#[derive(Clone, Debug, Default, PartialEq, Eq, Add, AddAssign)]
17pub struct EditParserMetrics {
18 pub tags: usize,
19 pub mismatched_tags: usize,
20}
21
22#[derive(Debug)]
23pub struct EditParser {
24 state: EditParserState,
25 buffer: String,
26 metrics: EditParserMetrics,
27}
28
/// Position of the parser within the `<old_text>`/`<new_text>` grammar.
#[derive(Debug, PartialEq)]
enum EditParserState {
    /// Waiting for an `<old_text>` start tag; text before it is discarded.
    Pending,
    /// Inside an old-text section, waiting for an end tag.
    WithinOldText,
    /// Old text has been emitted; waiting for a `<new_text>` start tag.
    AfterOldText,
    /// Inside a new-text section, streaming its contents.
    WithinNewText {
        /// `true` until the first non-empty buffer has been inspected, so a
        /// single newline directly after the start tag can be dropped.
        start: bool,
    },
}
36
37impl EditParser {
38 pub fn new() -> Self {
39 EditParser {
40 state: EditParserState::Pending,
41 buffer: String::new(),
42 metrics: EditParserMetrics::default(),
43 }
44 }
45
46 pub fn push(&mut self, chunk: &str) -> SmallVec<[EditParserEvent; 1]> {
47 self.buffer.push_str(chunk);
48
49 let mut edit_events = SmallVec::new();
50 loop {
51 match &mut self.state {
52 EditParserState::Pending => {
53 if let Some(start) = self.buffer.find("<old_text>") {
54 self.buffer.drain(..start + "<old_text>".len());
55 self.state = EditParserState::WithinOldText;
56 } else {
57 break;
58 }
59 }
60 EditParserState::WithinOldText => {
61 if let Some(tag_range) = self.find_end_tag() {
62 let mut start = 0;
63 if self.buffer.starts_with('\n') {
64 start = 1;
65 }
66 let mut old_text = self.buffer[start..tag_range.start].to_string();
67 if old_text.ends_with('\n') {
68 old_text.pop();
69 }
70
71 self.metrics.tags += 1;
72 if &self.buffer[tag_range.clone()] != OLD_TEXT_END_TAG {
73 self.metrics.mismatched_tags += 1;
74 }
75
76 self.buffer.drain(..tag_range.end);
77 self.state = EditParserState::AfterOldText;
78 edit_events.push(EditParserEvent::OldText(old_text));
79 } else {
80 break;
81 }
82 }
83 EditParserState::AfterOldText => {
84 if let Some(start) = self.buffer.find("<new_text>") {
85 self.buffer.drain(..start + "<new_text>".len());
86 self.state = EditParserState::WithinNewText { start: true };
87 } else {
88 break;
89 }
90 }
91 EditParserState::WithinNewText { start } => {
92 if !self.buffer.is_empty() {
93 if *start && self.buffer.starts_with('\n') {
94 self.buffer.remove(0);
95 }
96 *start = false;
97 }
98
99 if let Some(tag_range) = self.find_end_tag() {
100 let mut chunk = self.buffer[..tag_range.start].to_string();
101 if chunk.ends_with('\n') {
102 chunk.pop();
103 }
104
105 self.metrics.tags += 1;
106 if &self.buffer[tag_range.clone()] != NEW_TEXT_END_TAG {
107 self.metrics.mismatched_tags += 1;
108 }
109
110 self.buffer.drain(..tag_range.end);
111 self.state = EditParserState::Pending;
112 edit_events.push(EditParserEvent::NewTextChunk { chunk, done: true });
113 } else {
114 let mut end_prefixes = (1..END_TAG_LEN)
115 .flat_map(|i| [&NEW_TEXT_END_TAG[..i], &OLD_TEXT_END_TAG[..i]])
116 .chain(["\n"]);
117 if end_prefixes.all(|prefix| !self.buffer.ends_with(&prefix)) {
118 edit_events.push(EditParserEvent::NewTextChunk {
119 chunk: mem::take(&mut self.buffer),
120 done: false,
121 });
122 }
123 break;
124 }
125 }
126 }
127 }
128 edit_events
129 }
130
131 fn find_end_tag(&self) -> Option<Range<usize>> {
132 let old_text_end_tag_ix = self.buffer.find(OLD_TEXT_END_TAG);
133 let new_text_end_tag_ix = self.buffer.find(NEW_TEXT_END_TAG);
134 let start_ix = if let Some((old_text_ix, new_text_ix)) =
135 old_text_end_tag_ix.zip(new_text_end_tag_ix)
136 {
137 cmp::min(old_text_ix, new_text_ix)
138 } else {
139 old_text_end_tag_ix.or(new_text_end_tag_ix)?
140 };
141 Some(start_ix..start_ix + END_TAG_LEN)
142 }
143
144 pub fn finish(self) -> EditParserMetrics {
145 self.metrics
146 }
147}
148
#[cfg(test)]
mod tests {
    use super::*;
    use indoc::indoc;
    use rand::prelude::*;
    use std::cmp;

    // Each test feeds its input to the parser split at random positions, so
    // the expected result must hold regardless of where chunks are cut.

    #[gpui::test(iterations = 1000)]
    fn test_single_edit(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            "<old_text>original</old_text><new_text>updated</new_text>",
            &mut parser,
            &mut rng,
        );
        assert_eq!(edits, vec![edit("original", "updated")]);
        assert_eq!(parser.finish(), metrics(2, 0));
    }

    #[gpui::test(iterations = 1000)]
    fn test_multiple_edits(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            indoc! {"
                <old_text>
                first old
                </old_text><new_text>first new</new_text>
                <old_text>second old</old_text><new_text>
                second new
                </new_text>
            "},
            &mut parser,
            &mut rng,
        );
        assert_eq!(
            edits,
            vec![
                edit("first old", "first new"),
                edit("second old", "second new"),
            ]
        );
        assert_eq!(parser.finish(), metrics(4, 0));
    }

    #[gpui::test(iterations = 1000)]
    fn test_edits_with_extra_text(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            indoc! {"
                ignore this <old_text>
                content</old_text>extra stuff<new_text>updated content</new_text>trailing data
                more text <old_text>second item
                </old_text>middle text<new_text>modified second item</new_text>end
                <old_text>third case</old_text><new_text>improved third case</new_text> with trailing text
            "},
            &mut parser,
            &mut rng,
        );
        assert_eq!(
            edits,
            vec![
                edit("content", "updated content"),
                edit("second item", "modified second item"),
                edit("third case", "improved third case"),
            ]
        );
        assert_eq!(parser.finish(), metrics(6, 0));
    }

    #[gpui::test(iterations = 1000)]
    fn test_nested_tags(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            "<old_text>code with <tag>nested</tag> elements</old_text><new_text>new <code>content</code></new_text>",
            &mut parser,
            &mut rng,
        );
        assert_eq!(
            edits,
            vec![edit(
                "code with <tag>nested</tag> elements",
                "new <code>content</code>"
            )]
        );
        assert_eq!(parser.finish(), metrics(2, 0));
    }

    #[gpui::test(iterations = 1000)]
    fn test_empty_old_and_new_text(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            "<old_text></old_text><new_text></new_text>",
            &mut parser,
            &mut rng,
        );
        assert_eq!(edits, vec![edit("", "")]);
        assert_eq!(parser.finish(), metrics(2, 0));
    }

    #[gpui::test(iterations = 100)]
    fn test_multiline_content(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            "<old_text>line1\nline2\nline3</old_text><new_text>line1\nmodified line2\nline3</new_text>",
            &mut parser,
            &mut rng,
        );
        assert_eq!(
            edits,
            vec![edit("line1\nline2\nline3", "line1\nmodified line2\nline3")]
        );
        assert_eq!(parser.finish(), metrics(2, 0));
    }

    #[gpui::test(iterations = 1000)]
    fn test_mismatched_tags(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            // Reduced from an actual Sonnet 3.7 output
            indoc! {"
                <old_text>
                a
                b
                c
                </new_text>
                <new_text>
                a
                B
                c
                </old_text>
                <old_text>
                d
                e
                f
                </new_text>
                <new_text>
                D
                e
                F
                </old_text>
            "},
            &mut parser,
            &mut rng,
        );
        assert_eq!(
            edits,
            vec![edit("a\nb\nc", "a\nB\nc"), edit("d\ne\nf", "D\ne\nF")]
        );
        assert_eq!(parser.finish(), metrics(4, 4));
    }

    /// One fully parsed old-text/new-text pair, reassembled from parser events.
    #[derive(Default, Debug, PartialEq, Eq)]
    struct Edit {
        old_text: String,
        new_text: String,
    }

    /// Builds the expected [`Edit`] for an assertion.
    fn edit(old_text: &str, new_text: &str) -> Edit {
        Edit {
            old_text: old_text.to_string(),
            new_text: new_text.to_string(),
        }
    }

    /// Builds the expected [`EditParserMetrics`] for an assertion.
    fn metrics(tags: usize, mismatched_tags: usize) -> EditParserMetrics {
        EditParserMetrics {
            tags,
            mismatched_tags,
        }
    }

    /// Splits `input` at randomly chosen byte offsets, pushes the pieces into
    /// `parser` in order, and reassembles the emitted events into edits.
    ///
    /// The offsets are raw byte indices, so `input` is expected to be
    /// non-empty and ASCII-only, as all inputs in this module are.
    fn parse_random_chunks(input: &str, parser: &mut EditParser, rng: &mut StdRng) -> Vec<Edit> {
        let chunk_count = rng.gen_range(1..=cmp::min(input.len(), 50));
        let mut split_points = (0..input.len()).choose_multiple(rng, chunk_count);
        split_points.sort();
        // Always finish with a chunk that reaches the end of the input.
        split_points.push(input.len());

        let mut completed = Vec::new();
        let mut current = Edit::default();
        let mut chunk_start = 0;
        for chunk_end in split_points {
            for event in parser.push(&input[chunk_start..chunk_end]) {
                match event {
                    EditParserEvent::OldText(old_text) => current.old_text = old_text,
                    EditParserEvent::NewTextChunk { chunk, done } => {
                        current.new_text.push_str(&chunk);
                        if done {
                            completed.push(std::mem::take(&mut current));
                        }
                    }
                }
            }
            chunk_start = chunk_end;
        }
        completed
    }
}