1use derive_more::{Add, AddAssign};
2use regex::Regex;
3use schemars::JsonSchema;
4use serde::{Deserialize, Serialize};
5use smallvec::SmallVec;
6use std::{mem, ops::Range};
7
8const OLD_TEXT_END_TAG: &str = "</old_text>";
9const NEW_TEXT_END_TAG: &str = "</new_text>";
10const EDITS_END_TAG: &str = "</edits>";
11const END_TAGS: [&str; 3] = [OLD_TEXT_END_TAG, NEW_TEXT_END_TAG, EDITS_END_TAG];
12
13#[derive(Debug)]
14pub enum EditParserEvent {
15 OldTextChunk {
16 chunk: String,
17 done: bool,
18 line_hint: Option<u32>,
19 },
20 NewTextChunk {
21 chunk: String,
22 done: bool,
23 },
24}
25
26#[derive(
27 Clone, Debug, Default, PartialEq, Eq, Add, AddAssign, Serialize, Deserialize, JsonSchema,
28)]
29pub struct EditParserMetrics {
30 pub tags: usize,
31 pub mismatched_tags: usize,
32}
33
34#[derive(Debug)]
35pub struct EditParser {
36 state: EditParserState,
37 buffer: String,
38 metrics: EditParserMetrics,
39}
40
41#[derive(Debug, PartialEq)]
42enum EditParserState {
43 Pending,
44 WithinOldText { start: bool, line_hint: Option<u32> },
45 AfterOldText,
46 WithinNewText { start: bool },
47}
48
49impl EditParser {
50 pub fn new() -> Self {
51 EditParser {
52 state: EditParserState::Pending,
53 buffer: String::new(),
54 metrics: EditParserMetrics::default(),
55 }
56 }
57
58 pub fn push(&mut self, chunk: &str) -> SmallVec<[EditParserEvent; 1]> {
59 self.buffer.push_str(chunk);
60
61 let mut edit_events = SmallVec::new();
62 loop {
63 match &mut self.state {
64 EditParserState::Pending => {
65 if let Some(start) = self.buffer.find("<old_text") {
66 if let Some(tag_end) = self.buffer[start..].find('>') {
67 let tag_end = start + tag_end + 1;
68 let tag = &self.buffer[start..tag_end];
69 let line_hint = self.parse_line_hint(tag);
70 self.buffer.drain(..tag_end);
71 self.state = EditParserState::WithinOldText {
72 start: true,
73 line_hint,
74 };
75 } else {
76 break;
77 }
78 } else {
79 break;
80 }
81 }
82 EditParserState::WithinOldText { start, line_hint } => {
83 if !self.buffer.is_empty() {
84 if *start && self.buffer.starts_with('\n') {
85 self.buffer.remove(0);
86 }
87 *start = false;
88 }
89
90 let line_hint = *line_hint;
91 if let Some(tag_range) = self.find_end_tag() {
92 let mut chunk = self.buffer[..tag_range.start].to_string();
93 if chunk.ends_with('\n') {
94 chunk.pop();
95 }
96
97 self.metrics.tags += 1;
98 if &self.buffer[tag_range.clone()] != OLD_TEXT_END_TAG {
99 self.metrics.mismatched_tags += 1;
100 }
101
102 self.buffer.drain(..tag_range.end);
103 self.state = EditParserState::AfterOldText;
104 edit_events.push(EditParserEvent::OldTextChunk {
105 chunk,
106 done: true,
107 line_hint,
108 });
109 } else {
110 if !self.ends_with_tag_prefix() {
111 edit_events.push(EditParserEvent::OldTextChunk {
112 chunk: mem::take(&mut self.buffer),
113 done: false,
114 line_hint,
115 });
116 }
117 break;
118 }
119 }
120 EditParserState::AfterOldText => {
121 if let Some(start) = self.buffer.find("<new_text>") {
122 self.buffer.drain(..start + "<new_text>".len());
123 self.state = EditParserState::WithinNewText { start: true };
124 } else {
125 break;
126 }
127 }
128 EditParserState::WithinNewText { start } => {
129 if !self.buffer.is_empty() {
130 if *start && self.buffer.starts_with('\n') {
131 self.buffer.remove(0);
132 }
133 *start = false;
134 }
135
136 if let Some(tag_range) = self.find_end_tag() {
137 let mut chunk = self.buffer[..tag_range.start].to_string();
138 if chunk.ends_with('\n') {
139 chunk.pop();
140 }
141
142 self.metrics.tags += 1;
143 if &self.buffer[tag_range.clone()] != NEW_TEXT_END_TAG {
144 self.metrics.mismatched_tags += 1;
145 }
146
147 self.buffer.drain(..tag_range.end);
148 self.state = EditParserState::Pending;
149 edit_events.push(EditParserEvent::NewTextChunk { chunk, done: true });
150 } else {
151 if !self.ends_with_tag_prefix() {
152 edit_events.push(EditParserEvent::NewTextChunk {
153 chunk: mem::take(&mut self.buffer),
154 done: false,
155 });
156 }
157 break;
158 }
159 }
160 }
161 }
162 edit_events
163 }
164
165 fn find_end_tag(&self) -> Option<Range<usize>> {
166 let (tag, start_ix) = END_TAGS
167 .iter()
168 .flat_map(|tag| Some((tag, self.buffer.find(tag)?)))
169 .min_by_key(|(_, ix)| *ix)?;
170 Some(start_ix..start_ix + tag.len())
171 }
172
173 fn ends_with_tag_prefix(&self) -> bool {
174 let mut end_prefixes = END_TAGS
175 .iter()
176 .flat_map(|tag| (1..tag.len()).map(move |i| &tag[..i]))
177 .chain(["\n"]);
178 end_prefixes.any(|prefix| self.buffer.ends_with(&prefix))
179 }
180
181 fn parse_line_hint(&self, tag: &str) -> Option<u32> {
182 static LINE_HINT_REGEX: std::sync::LazyLock<Regex> =
183 std::sync::LazyLock::new(|| Regex::new(r#"line=(?:"?)(\d+)"#).unwrap());
184
185 LINE_HINT_REGEX
186 .captures(tag)
187 .and_then(|caps| caps.get(1))
188 .and_then(|m| m.as_str().parse::<u32>().ok())
189 }
190
191 pub fn finish(self) -> EditParserMetrics {
192 self.metrics
193 }
194}
195
196#[cfg(test)]
197mod tests {
198 use super::*;
199 use indoc::indoc;
200 use rand::prelude::*;
201 use std::cmp;
202
203 #[gpui::test(iterations = 1000)]
204 fn test_single_edit(mut rng: StdRng) {
205 let mut parser = EditParser::new();
206 assert_eq!(
207 parse_random_chunks(
208 "<old_text>original</old_text><new_text>updated</new_text>",
209 &mut parser,
210 &mut rng
211 ),
212 vec![Edit {
213 old_text: "original".to_string(),
214 new_text: "updated".to_string(),
215 line_hint: None,
216 }]
217 );
218 assert_eq!(
219 parser.finish(),
220 EditParserMetrics {
221 tags: 2,
222 mismatched_tags: 0
223 }
224 );
225 }
226
227 #[gpui::test(iterations = 1000)]
228 fn test_multiple_edits(mut rng: StdRng) {
229 let mut parser = EditParser::new();
230 assert_eq!(
231 parse_random_chunks(
232 indoc! {"
233 <old_text>
234 first old
235 </old_text><new_text>first new</new_text>
236 <old_text>second old</old_text><new_text>
237 second new
238 </new_text>
239 "},
240 &mut parser,
241 &mut rng
242 ),
243 vec![
244 Edit {
245 old_text: "first old".to_string(),
246 new_text: "first new".to_string(),
247 line_hint: None,
248 },
249 Edit {
250 old_text: "second old".to_string(),
251 new_text: "second new".to_string(),
252 line_hint: None,
253 },
254 ]
255 );
256 assert_eq!(
257 parser.finish(),
258 EditParserMetrics {
259 tags: 4,
260 mismatched_tags: 0
261 }
262 );
263 }
264
265 #[gpui::test(iterations = 1000)]
266 fn test_edits_with_extra_text(mut rng: StdRng) {
267 let mut parser = EditParser::new();
268 assert_eq!(
269 parse_random_chunks(
270 indoc! {"
271 ignore this <old_text>
272 content</old_text>extra stuff<new_text>updated content</new_text>trailing data
273 more text <old_text>second item
274 </old_text>middle text<new_text>modified second item</new_text>end
275 <old_text>third case</old_text><new_text>improved third case</new_text> with trailing text
276 "},
277 &mut parser,
278 &mut rng
279 ),
280 vec![
281 Edit {
282 old_text: "content".to_string(),
283 new_text: "updated content".to_string(),
284 line_hint: None,
285 },
286 Edit {
287 old_text: "second item".to_string(),
288 new_text: "modified second item".to_string(),
289 line_hint: None,
290 },
291 Edit {
292 old_text: "third case".to_string(),
293 new_text: "improved third case".to_string(),
294 line_hint: None,
295 },
296 ]
297 );
298 assert_eq!(
299 parser.finish(),
300 EditParserMetrics {
301 tags: 6,
302 mismatched_tags: 0
303 }
304 );
305 }
306
307 #[gpui::test(iterations = 1000)]
308 fn test_nested_tags(mut rng: StdRng) {
309 let mut parser = EditParser::new();
310 assert_eq!(
311 parse_random_chunks(
312 "<old_text>code with <tag>nested</tag> elements</old_text><new_text>new <code>content</code></new_text>",
313 &mut parser,
314 &mut rng
315 ),
316 vec![Edit {
317 old_text: "code with <tag>nested</tag> elements".to_string(),
318 new_text: "new <code>content</code>".to_string(),
319 line_hint: None,
320 }]
321 );
322 assert_eq!(
323 parser.finish(),
324 EditParserMetrics {
325 tags: 2,
326 mismatched_tags: 0
327 }
328 );
329 }
330
331 #[gpui::test(iterations = 1000)]
332 fn test_empty_old_and_new_text(mut rng: StdRng) {
333 let mut parser = EditParser::new();
334 assert_eq!(
335 parse_random_chunks(
336 "<old_text></old_text><new_text></new_text>",
337 &mut parser,
338 &mut rng
339 ),
340 vec![Edit {
341 old_text: "".to_string(),
342 new_text: "".to_string(),
343 line_hint: None,
344 }]
345 );
346 assert_eq!(
347 parser.finish(),
348 EditParserMetrics {
349 tags: 2,
350 mismatched_tags: 0
351 }
352 );
353 }
354
355 #[gpui::test(iterations = 100)]
356 fn test_multiline_content(mut rng: StdRng) {
357 let mut parser = EditParser::new();
358 assert_eq!(
359 parse_random_chunks(
360 "<old_text>line1\nline2\nline3</old_text><new_text>line1\nmodified line2\nline3</new_text>",
361 &mut parser,
362 &mut rng
363 ),
364 vec![Edit {
365 old_text: "line1\nline2\nline3".to_string(),
366 new_text: "line1\nmodified line2\nline3".to_string(),
367 line_hint: None,
368 }]
369 );
370 assert_eq!(
371 parser.finish(),
372 EditParserMetrics {
373 tags: 2,
374 mismatched_tags: 0
375 }
376 );
377 }
378
379 #[gpui::test(iterations = 1000)]
380 fn test_mismatched_tags(mut rng: StdRng) {
381 let mut parser = EditParser::new();
382 assert_eq!(
383 parse_random_chunks(
384 // Reduced from an actual Sonnet 3.7 output
385 indoc! {"
386 <old_text>
387 a
388 b
389 c
390 </new_text>
391 <new_text>
392 a
393 B
394 c
395 </old_text>
396 <old_text>
397 d
398 e
399 f
400 </new_text>
401 <new_text>
402 D
403 e
404 F
405 </old_text>
406 "},
407 &mut parser,
408 &mut rng
409 ),
410 vec![
411 Edit {
412 old_text: "a\nb\nc".to_string(),
413 new_text: "a\nB\nc".to_string(),
414 line_hint: None,
415 },
416 Edit {
417 old_text: "d\ne\nf".to_string(),
418 new_text: "D\ne\nF".to_string(),
419 line_hint: None,
420 }
421 ]
422 );
423 assert_eq!(
424 parser.finish(),
425 EditParserMetrics {
426 tags: 4,
427 mismatched_tags: 4
428 }
429 );
430
431 let mut parser = EditParser::new();
432 assert_eq!(
433 parse_random_chunks(
434 // Reduced from an actual Opus 4 output
435 indoc! {"
436 <edits>
437 <old_text>
438 Lorem
439 </old_text>
440 <new_text>
441 LOREM
442 </edits>
443 "},
444 &mut parser,
445 &mut rng
446 ),
447 vec![Edit {
448 old_text: "Lorem".to_string(),
449 new_text: "LOREM".to_string(),
450 line_hint: None,
451 },]
452 );
453 assert_eq!(
454 parser.finish(),
455 EditParserMetrics {
456 tags: 2,
457 mismatched_tags: 1
458 }
459 );
460 }
461
462 #[gpui::test(iterations = 100)]
463 fn test_line_hints(mut rng: StdRng) {
464 // Line hint is a single quoted line number
465 let mut parser = EditParser::new();
466
467 let edits = parse_random_chunks(
468 r#"
469 <old_text line="23">original code</old_text>
470 <new_text>updated code</new_text>"#,
471 &mut parser,
472 &mut rng,
473 );
474
475 assert_eq!(edits.len(), 1);
476 assert_eq!(edits[0].old_text, "original code");
477 assert_eq!(edits[0].line_hint, Some(23));
478 assert_eq!(edits[0].new_text, "updated code");
479
480 // Line hint is a single unquoted line number
481 let mut parser = EditParser::new();
482
483 let edits = parse_random_chunks(
484 r#"
485 <old_text line=45>original code</old_text>
486 <new_text>updated code</new_text>"#,
487 &mut parser,
488 &mut rng,
489 );
490
491 assert_eq!(edits.len(), 1);
492 assert_eq!(edits[0].old_text, "original code");
493 assert_eq!(edits[0].line_hint, Some(45));
494 assert_eq!(edits[0].new_text, "updated code");
495
496 // Line hint is a range
497 let mut parser = EditParser::new();
498
499 let edits = parse_random_chunks(
500 r#"
501 <old_text line="23:50">original code</old_text>
502 <new_text>updated code</new_text>"#,
503 &mut parser,
504 &mut rng,
505 );
506
507 assert_eq!(edits.len(), 1);
508 assert_eq!(edits[0].old_text, "original code");
509 assert_eq!(edits[0].line_hint, Some(23));
510 assert_eq!(edits[0].new_text, "updated code");
511
512 // No line hint
513 let mut parser = EditParser::new();
514 let edits = parse_random_chunks(
515 r#"
516 <old_text>old</old_text>
517 <new_text>new</new_text>"#,
518 &mut parser,
519 &mut rng,
520 );
521
522 assert_eq!(edits.len(), 1);
523 assert_eq!(edits[0].old_text, "old");
524 assert_eq!(edits[0].line_hint, None);
525 assert_eq!(edits[0].new_text, "new");
526 }
527
528 #[derive(Default, Debug, PartialEq, Eq)]
529 struct Edit {
530 old_text: String,
531 new_text: String,
532 line_hint: Option<u32>,
533 }
534
535 fn parse_random_chunks(input: &str, parser: &mut EditParser, rng: &mut StdRng) -> Vec<Edit> {
536 let chunk_count = rng.gen_range(1..=cmp::min(input.len(), 50));
537 let mut chunk_indices = (0..input.len()).choose_multiple(rng, chunk_count);
538 chunk_indices.sort();
539 chunk_indices.push(input.len());
540
541 let mut old_text = Some(String::new());
542 let mut new_text = None;
543 let mut pending_edit = Edit::default();
544 let mut edits = Vec::new();
545 let mut last_ix = 0;
546 for chunk_ix in chunk_indices {
547 for event in parser.push(&input[last_ix..chunk_ix]) {
548 match event {
549 EditParserEvent::OldTextChunk {
550 chunk,
551 done,
552 line_hint,
553 } => {
554 old_text.as_mut().unwrap().push_str(&chunk);
555 if done {
556 pending_edit.old_text = old_text.take().unwrap();
557 pending_edit.line_hint = line_hint;
558 new_text = Some(String::new());
559 }
560 }
561 EditParserEvent::NewTextChunk { chunk, done } => {
562 new_text.as_mut().unwrap().push_str(&chunk);
563 if done {
564 pending_edit.new_text = new_text.take().unwrap();
565 edits.push(pending_edit);
566 pending_edit = Edit::default();
567 old_text = Some(String::new());
568 }
569 }
570 }
571 }
572 last_ix = chunk_ix;
573 }
574
575 edits
576 }
577}