1use derive_more::{Add, AddAssign};
2use schemars::JsonSchema;
3use serde::{Deserialize, Serialize};
4use smallvec::SmallVec;
5use std::{mem, ops::Range};
6
7const OLD_TEXT_END_TAG: &str = "</old_text>";
8const NEW_TEXT_END_TAG: &str = "</new_text>";
9const EDITS_END_TAG: &str = "</edits>";
10const END_TAGS: [&str; 3] = [OLD_TEXT_END_TAG, NEW_TEXT_END_TAG, EDITS_END_TAG];
11
12#[derive(Debug)]
13pub enum EditParserEvent {
14 OldTextChunk { chunk: String, done: bool },
15 NewTextChunk { chunk: String, done: bool },
16}
17
18#[derive(
19 Clone, Debug, Default, PartialEq, Eq, Add, AddAssign, Serialize, Deserialize, JsonSchema,
20)]
21pub struct EditParserMetrics {
22 pub tags: usize,
23 pub mismatched_tags: usize,
24}
25
26#[derive(Debug)]
27pub struct EditParser {
28 state: EditParserState,
29 buffer: String,
30 metrics: EditParserMetrics,
31}
32
33#[derive(Debug, PartialEq)]
34enum EditParserState {
35 Pending,
36 WithinOldText { start: bool },
37 AfterOldText,
38 WithinNewText { start: bool },
39}
40
41impl EditParser {
42 pub fn new() -> Self {
43 EditParser {
44 state: EditParserState::Pending,
45 buffer: String::new(),
46 metrics: EditParserMetrics::default(),
47 }
48 }
49
50 pub fn push(&mut self, chunk: &str) -> SmallVec<[EditParserEvent; 1]> {
51 self.buffer.push_str(chunk);
52
53 let mut edit_events = SmallVec::new();
54 loop {
55 match &mut self.state {
56 EditParserState::Pending => {
57 if let Some(start) = self.buffer.find("<old_text>") {
58 self.buffer.drain(..start + "<old_text>".len());
59 self.state = EditParserState::WithinOldText { start: true };
60 } else {
61 break;
62 }
63 }
64 EditParserState::WithinOldText { start } => {
65 if !self.buffer.is_empty() {
66 if *start && self.buffer.starts_with('\n') {
67 self.buffer.remove(0);
68 }
69 *start = false;
70 }
71
72 if let Some(tag_range) = self.find_end_tag() {
73 let mut chunk = self.buffer[..tag_range.start].to_string();
74 if chunk.ends_with('\n') {
75 chunk.pop();
76 }
77
78 self.metrics.tags += 1;
79 if &self.buffer[tag_range.clone()] != OLD_TEXT_END_TAG {
80 self.metrics.mismatched_tags += 1;
81 }
82
83 self.buffer.drain(..tag_range.end);
84 self.state = EditParserState::AfterOldText;
85 edit_events.push(EditParserEvent::OldTextChunk { chunk, done: true });
86 } else {
87 if !self.ends_with_tag_prefix() {
88 edit_events.push(EditParserEvent::OldTextChunk {
89 chunk: mem::take(&mut self.buffer),
90 done: false,
91 });
92 }
93 break;
94 }
95 }
96 EditParserState::AfterOldText => {
97 if let Some(start) = self.buffer.find("<new_text>") {
98 self.buffer.drain(..start + "<new_text>".len());
99 self.state = EditParserState::WithinNewText { start: true };
100 } else {
101 break;
102 }
103 }
104 EditParserState::WithinNewText { start } => {
105 if !self.buffer.is_empty() {
106 if *start && self.buffer.starts_with('\n') {
107 self.buffer.remove(0);
108 }
109 *start = false;
110 }
111
112 if let Some(tag_range) = self.find_end_tag() {
113 let mut chunk = self.buffer[..tag_range.start].to_string();
114 if chunk.ends_with('\n') {
115 chunk.pop();
116 }
117
118 self.metrics.tags += 1;
119 if &self.buffer[tag_range.clone()] != NEW_TEXT_END_TAG {
120 self.metrics.mismatched_tags += 1;
121 }
122
123 self.buffer.drain(..tag_range.end);
124 self.state = EditParserState::Pending;
125 edit_events.push(EditParserEvent::NewTextChunk { chunk, done: true });
126 } else {
127 if !self.ends_with_tag_prefix() {
128 edit_events.push(EditParserEvent::NewTextChunk {
129 chunk: mem::take(&mut self.buffer),
130 done: false,
131 });
132 }
133 break;
134 }
135 }
136 }
137 }
138 edit_events
139 }
140
141 fn find_end_tag(&self) -> Option<Range<usize>> {
142 let (tag, start_ix) = END_TAGS
143 .iter()
144 .flat_map(|tag| Some((tag, self.buffer.find(tag)?)))
145 .min_by_key(|(_, ix)| *ix)?;
146 Some(start_ix..start_ix + tag.len())
147 }
148
149 fn ends_with_tag_prefix(&self) -> bool {
150 let mut end_prefixes = END_TAGS
151 .iter()
152 .flat_map(|tag| (1..tag.len()).map(move |i| &tag[..i]))
153 .chain(["\n"]);
154 end_prefixes.any(|prefix| self.buffer.ends_with(&prefix))
155 }
156
157 pub fn finish(self) -> EditParserMetrics {
158 self.metrics
159 }
160}
161
162#[cfg(test)]
163mod tests {
164 use super::*;
165 use indoc::indoc;
166 use rand::prelude::*;
167 use std::cmp;
168
169 #[gpui::test(iterations = 1000)]
170 fn test_single_edit(mut rng: StdRng) {
171 let mut parser = EditParser::new();
172 assert_eq!(
173 parse_random_chunks(
174 "<old_text>original</old_text><new_text>updated</new_text>",
175 &mut parser,
176 &mut rng
177 ),
178 vec![Edit {
179 old_text: "original".to_string(),
180 new_text: "updated".to_string(),
181 }]
182 );
183 assert_eq!(
184 parser.finish(),
185 EditParserMetrics {
186 tags: 2,
187 mismatched_tags: 0
188 }
189 );
190 }
191
192 #[gpui::test(iterations = 1000)]
193 fn test_multiple_edits(mut rng: StdRng) {
194 let mut parser = EditParser::new();
195 assert_eq!(
196 parse_random_chunks(
197 indoc! {"
198 <old_text>
199 first old
200 </old_text><new_text>first new</new_text>
201 <old_text>second old</old_text><new_text>
202 second new
203 </new_text>
204 "},
205 &mut parser,
206 &mut rng
207 ),
208 vec![
209 Edit {
210 old_text: "first old".to_string(),
211 new_text: "first new".to_string(),
212 },
213 Edit {
214 old_text: "second old".to_string(),
215 new_text: "second new".to_string(),
216 },
217 ]
218 );
219 assert_eq!(
220 parser.finish(),
221 EditParserMetrics {
222 tags: 4,
223 mismatched_tags: 0
224 }
225 );
226 }
227
228 #[gpui::test(iterations = 1000)]
229 fn test_edits_with_extra_text(mut rng: StdRng) {
230 let mut parser = EditParser::new();
231 assert_eq!(
232 parse_random_chunks(
233 indoc! {"
234 ignore this <old_text>
235 content</old_text>extra stuff<new_text>updated content</new_text>trailing data
236 more text <old_text>second item
237 </old_text>middle text<new_text>modified second item</new_text>end
238 <old_text>third case</old_text><new_text>improved third case</new_text> with trailing text
239 "},
240 &mut parser,
241 &mut rng
242 ),
243 vec![
244 Edit {
245 old_text: "content".to_string(),
246 new_text: "updated content".to_string(),
247 },
248 Edit {
249 old_text: "second item".to_string(),
250 new_text: "modified second item".to_string(),
251 },
252 Edit {
253 old_text: "third case".to_string(),
254 new_text: "improved third case".to_string(),
255 },
256 ]
257 );
258 assert_eq!(
259 parser.finish(),
260 EditParserMetrics {
261 tags: 6,
262 mismatched_tags: 0
263 }
264 );
265 }
266
267 #[gpui::test(iterations = 1000)]
268 fn test_nested_tags(mut rng: StdRng) {
269 let mut parser = EditParser::new();
270 assert_eq!(
271 parse_random_chunks(
272 "<old_text>code with <tag>nested</tag> elements</old_text><new_text>new <code>content</code></new_text>",
273 &mut parser,
274 &mut rng
275 ),
276 vec![Edit {
277 old_text: "code with <tag>nested</tag> elements".to_string(),
278 new_text: "new <code>content</code>".to_string(),
279 }]
280 );
281 assert_eq!(
282 parser.finish(),
283 EditParserMetrics {
284 tags: 2,
285 mismatched_tags: 0
286 }
287 );
288 }
289
290 #[gpui::test(iterations = 1000)]
291 fn test_empty_old_and_new_text(mut rng: StdRng) {
292 let mut parser = EditParser::new();
293 assert_eq!(
294 parse_random_chunks(
295 "<old_text></old_text><new_text></new_text>",
296 &mut parser,
297 &mut rng
298 ),
299 vec![Edit {
300 old_text: "".to_string(),
301 new_text: "".to_string(),
302 }]
303 );
304 assert_eq!(
305 parser.finish(),
306 EditParserMetrics {
307 tags: 2,
308 mismatched_tags: 0
309 }
310 );
311 }
312
313 #[gpui::test(iterations = 100)]
314 fn test_multiline_content(mut rng: StdRng) {
315 let mut parser = EditParser::new();
316 assert_eq!(
317 parse_random_chunks(
318 "<old_text>line1\nline2\nline3</old_text><new_text>line1\nmodified line2\nline3</new_text>",
319 &mut parser,
320 &mut rng
321 ),
322 vec![Edit {
323 old_text: "line1\nline2\nline3".to_string(),
324 new_text: "line1\nmodified line2\nline3".to_string(),
325 }]
326 );
327 assert_eq!(
328 parser.finish(),
329 EditParserMetrics {
330 tags: 2,
331 mismatched_tags: 0
332 }
333 );
334 }
335
336 #[gpui::test(iterations = 1000)]
337 fn test_mismatched_tags(mut rng: StdRng) {
338 let mut parser = EditParser::new();
339 assert_eq!(
340 parse_random_chunks(
341 // Reduced from an actual Sonnet 3.7 output
342 indoc! {"
343 <old_text>
344 a
345 b
346 c
347 </new_text>
348 <new_text>
349 a
350 B
351 c
352 </old_text>
353 <old_text>
354 d
355 e
356 f
357 </new_text>
358 <new_text>
359 D
360 e
361 F
362 </old_text>
363 "},
364 &mut parser,
365 &mut rng
366 ),
367 vec![
368 Edit {
369 old_text: "a\nb\nc".to_string(),
370 new_text: "a\nB\nc".to_string(),
371 },
372 Edit {
373 old_text: "d\ne\nf".to_string(),
374 new_text: "D\ne\nF".to_string(),
375 }
376 ]
377 );
378 assert_eq!(
379 parser.finish(),
380 EditParserMetrics {
381 tags: 4,
382 mismatched_tags: 4
383 }
384 );
385
386 let mut parser = EditParser::new();
387 assert_eq!(
388 parse_random_chunks(
389 // Reduced from an actual Opus 4 output
390 indoc! {"
391 <edits>
392 <old_text>
393 Lorem
394 </old_text>
395 <new_text>
396 LOREM
397 </edits>
398 "},
399 &mut parser,
400 &mut rng
401 ),
402 vec![Edit {
403 old_text: "Lorem".to_string(),
404 new_text: "LOREM".to_string(),
405 },]
406 );
407 assert_eq!(
408 parser.finish(),
409 EditParserMetrics {
410 tags: 2,
411 mismatched_tags: 1
412 }
413 );
414 }
415
416 #[derive(Default, Debug, PartialEq, Eq)]
417 struct Edit {
418 old_text: String,
419 new_text: String,
420 }
421
422 fn parse_random_chunks(input: &str, parser: &mut EditParser, rng: &mut StdRng) -> Vec<Edit> {
423 let chunk_count = rng.gen_range(1..=cmp::min(input.len(), 50));
424 let mut chunk_indices = (0..input.len()).choose_multiple(rng, chunk_count);
425 chunk_indices.sort();
426 chunk_indices.push(input.len());
427
428 let mut old_text = Some(String::new());
429 let mut new_text = None;
430 let mut pending_edit = Edit::default();
431 let mut edits = Vec::new();
432 let mut last_ix = 0;
433 for chunk_ix in chunk_indices {
434 for event in parser.push(&input[last_ix..chunk_ix]) {
435 match event {
436 EditParserEvent::OldTextChunk { chunk, done } => {
437 old_text.as_mut().unwrap().push_str(&chunk);
438 if done {
439 pending_edit.old_text = old_text.take().unwrap();
440 new_text = Some(String::new());
441 }
442 }
443 EditParserEvent::NewTextChunk { chunk, done } => {
444 new_text.as_mut().unwrap().push_str(&chunk);
445 if done {
446 pending_edit.new_text = new_text.take().unwrap();
447 edits.push(pending_edit);
448 pending_edit = Edit::default();
449 old_text = Some(String::new());
450 }
451 }
452 }
453 }
454 last_ix = chunk_ix;
455 }
456
457 edits
458 }
459}