edit_parser.rs

  1use derive_more::{Add, AddAssign};
  2use regex::Regex;
  3use schemars::JsonSchema;
  4use serde::{Deserialize, Serialize};
  5use smallvec::SmallVec;
  6use std::{mem, ops::Range};
  7
  8const OLD_TEXT_END_TAG: &str = "</old_text>";
  9const NEW_TEXT_END_TAG: &str = "</new_text>";
 10const EDITS_END_TAG: &str = "</edits>";
 11const END_TAGS: [&str; 3] = [OLD_TEXT_END_TAG, NEW_TEXT_END_TAG, EDITS_END_TAG];
 12
 13#[derive(Debug)]
 14pub enum EditParserEvent {
 15    OldTextChunk {
 16        chunk: String,
 17        done: bool,
 18        line_hint: Option<u32>,
 19    },
 20    NewTextChunk {
 21        chunk: String,
 22        done: bool,
 23    },
 24}
 25
 26#[derive(
 27    Clone, Debug, Default, PartialEq, Eq, Add, AddAssign, Serialize, Deserialize, JsonSchema,
 28)]
 29pub struct EditParserMetrics {
 30    pub tags: usize,
 31    pub mismatched_tags: usize,
 32}
 33
 34#[derive(Debug)]
 35pub struct EditParser {
 36    state: EditParserState,
 37    buffer: String,
 38    metrics: EditParserMetrics,
 39}
 40
 41#[derive(Debug, PartialEq)]
 42enum EditParserState {
 43    Pending,
 44    WithinOldText { start: bool, line_hint: Option<u32> },
 45    AfterOldText,
 46    WithinNewText { start: bool },
 47}
 48
 49impl EditParser {
 50    pub fn new() -> Self {
 51        EditParser {
 52            state: EditParserState::Pending,
 53            buffer: String::new(),
 54            metrics: EditParserMetrics::default(),
 55        }
 56    }
 57
 58    pub fn push(&mut self, chunk: &str) -> SmallVec<[EditParserEvent; 1]> {
 59        self.buffer.push_str(chunk);
 60
 61        let mut edit_events = SmallVec::new();
 62        loop {
 63            match &mut self.state {
 64                EditParserState::Pending => {
 65                    if let Some(start) = self.buffer.find("<old_text") {
 66                        if let Some(tag_end) = self.buffer[start..].find('>') {
 67                            let tag_end = start + tag_end + 1;
 68                            let tag = &self.buffer[start..tag_end];
 69                            let line_hint = self.parse_line_hint(tag);
 70                            self.buffer.drain(..tag_end);
 71                            self.state = EditParserState::WithinOldText {
 72                                start: true,
 73                                line_hint,
 74                            };
 75                        } else {
 76                            break;
 77                        }
 78                    } else {
 79                        break;
 80                    }
 81                }
 82                EditParserState::WithinOldText { start, line_hint } => {
 83                    if !self.buffer.is_empty() {
 84                        if *start && self.buffer.starts_with('\n') {
 85                            self.buffer.remove(0);
 86                        }
 87                        *start = false;
 88                    }
 89
 90                    let line_hint = *line_hint;
 91                    if let Some(tag_range) = self.find_end_tag() {
 92                        let mut chunk = self.buffer[..tag_range.start].to_string();
 93                        if chunk.ends_with('\n') {
 94                            chunk.pop();
 95                        }
 96
 97                        self.metrics.tags += 1;
 98                        if &self.buffer[tag_range.clone()] != OLD_TEXT_END_TAG {
 99                            self.metrics.mismatched_tags += 1;
100                        }
101
102                        self.buffer.drain(..tag_range.end);
103                        self.state = EditParserState::AfterOldText;
104                        edit_events.push(EditParserEvent::OldTextChunk {
105                            chunk,
106                            done: true,
107                            line_hint,
108                        });
109                    } else {
110                        if !self.ends_with_tag_prefix() {
111                            edit_events.push(EditParserEvent::OldTextChunk {
112                                chunk: mem::take(&mut self.buffer),
113                                done: false,
114                                line_hint,
115                            });
116                        }
117                        break;
118                    }
119                }
120                EditParserState::AfterOldText => {
121                    if let Some(start) = self.buffer.find("<new_text>") {
122                        self.buffer.drain(..start + "<new_text>".len());
123                        self.state = EditParserState::WithinNewText { start: true };
124                    } else {
125                        break;
126                    }
127                }
128                EditParserState::WithinNewText { start } => {
129                    if !self.buffer.is_empty() {
130                        if *start && self.buffer.starts_with('\n') {
131                            self.buffer.remove(0);
132                        }
133                        *start = false;
134                    }
135
136                    if let Some(tag_range) = self.find_end_tag() {
137                        let mut chunk = self.buffer[..tag_range.start].to_string();
138                        if chunk.ends_with('\n') {
139                            chunk.pop();
140                        }
141
142                        self.metrics.tags += 1;
143                        if &self.buffer[tag_range.clone()] != NEW_TEXT_END_TAG {
144                            self.metrics.mismatched_tags += 1;
145                        }
146
147                        self.buffer.drain(..tag_range.end);
148                        self.state = EditParserState::Pending;
149                        edit_events.push(EditParserEvent::NewTextChunk { chunk, done: true });
150                    } else {
151                        if !self.ends_with_tag_prefix() {
152                            edit_events.push(EditParserEvent::NewTextChunk {
153                                chunk: mem::take(&mut self.buffer),
154                                done: false,
155                            });
156                        }
157                        break;
158                    }
159                }
160            }
161        }
162        edit_events
163    }
164
165    fn find_end_tag(&self) -> Option<Range<usize>> {
166        let (tag, start_ix) = END_TAGS
167            .iter()
168            .flat_map(|tag| Some((tag, self.buffer.find(tag)?)))
169            .min_by_key(|(_, ix)| *ix)?;
170        Some(start_ix..start_ix + tag.len())
171    }
172
173    fn ends_with_tag_prefix(&self) -> bool {
174        let mut end_prefixes = END_TAGS
175            .iter()
176            .flat_map(|tag| (1..tag.len()).map(move |i| &tag[..i]))
177            .chain(["\n"]);
178        end_prefixes.any(|prefix| self.buffer.ends_with(&prefix))
179    }
180
181    fn parse_line_hint(&self, tag: &str) -> Option<u32> {
182        static LINE_HINT_REGEX: std::sync::LazyLock<Regex> =
183            std::sync::LazyLock::new(|| Regex::new(r#"line=(?:"?)(\d+)"#).unwrap());
184
185        LINE_HINT_REGEX
186            .captures(tag)
187            .and_then(|caps| caps.get(1))
188            .and_then(|m| m.as_str().parse::<u32>().ok())
189    }
190
191    pub fn finish(self) -> EditParserMetrics {
192        self.metrics
193    }
194}
195
196#[cfg(test)]
197mod tests {
198    use super::*;
199    use indoc::indoc;
200    use rand::prelude::*;
201    use std::cmp;
202
203    #[gpui::test(iterations = 1000)]
204    fn test_single_edit(mut rng: StdRng) {
205        let mut parser = EditParser::new();
206        assert_eq!(
207            parse_random_chunks(
208                "<old_text>original</old_text><new_text>updated</new_text>",
209                &mut parser,
210                &mut rng
211            ),
212            vec![Edit {
213                old_text: "original".to_string(),
214                new_text: "updated".to_string(),
215                line_hint: None,
216            }]
217        );
218        assert_eq!(
219            parser.finish(),
220            EditParserMetrics {
221                tags: 2,
222                mismatched_tags: 0
223            }
224        );
225    }
226
227    #[gpui::test(iterations = 1000)]
228    fn test_multiple_edits(mut rng: StdRng) {
229        let mut parser = EditParser::new();
230        assert_eq!(
231            parse_random_chunks(
232                indoc! {"
233                    <old_text>
234                    first old
235                    </old_text><new_text>first new</new_text>
236                    <old_text>second old</old_text><new_text>
237                    second new
238                    </new_text>
239                "},
240                &mut parser,
241                &mut rng
242            ),
243            vec![
244                Edit {
245                    old_text: "first old".to_string(),
246                    new_text: "first new".to_string(),
247                    line_hint: None,
248                },
249                Edit {
250                    old_text: "second old".to_string(),
251                    new_text: "second new".to_string(),
252                    line_hint: None,
253                },
254            ]
255        );
256        assert_eq!(
257            parser.finish(),
258            EditParserMetrics {
259                tags: 4,
260                mismatched_tags: 0
261            }
262        );
263    }
264
265    #[gpui::test(iterations = 1000)]
266    fn test_edits_with_extra_text(mut rng: StdRng) {
267        let mut parser = EditParser::new();
268        assert_eq!(
269            parse_random_chunks(
270                indoc! {"
271                    ignore this <old_text>
272                    content</old_text>extra stuff<new_text>updated content</new_text>trailing data
273                    more text <old_text>second item
274                    </old_text>middle text<new_text>modified second item</new_text>end
275                    <old_text>third case</old_text><new_text>improved third case</new_text> with trailing text
276                "},
277                &mut parser,
278                &mut rng
279            ),
280            vec![
281                Edit {
282                    old_text: "content".to_string(),
283                    new_text: "updated content".to_string(),
284                    line_hint: None,
285                },
286                Edit {
287                    old_text: "second item".to_string(),
288                    new_text: "modified second item".to_string(),
289                    line_hint: None,
290                },
291                Edit {
292                    old_text: "third case".to_string(),
293                    new_text: "improved third case".to_string(),
294                    line_hint: None,
295                },
296            ]
297        );
298        assert_eq!(
299            parser.finish(),
300            EditParserMetrics {
301                tags: 6,
302                mismatched_tags: 0
303            }
304        );
305    }
306
307    #[gpui::test(iterations = 1000)]
308    fn test_nested_tags(mut rng: StdRng) {
309        let mut parser = EditParser::new();
310        assert_eq!(
311            parse_random_chunks(
312                "<old_text>code with <tag>nested</tag> elements</old_text><new_text>new <code>content</code></new_text>",
313                &mut parser,
314                &mut rng
315            ),
316            vec![Edit {
317                old_text: "code with <tag>nested</tag> elements".to_string(),
318                new_text: "new <code>content</code>".to_string(),
319                line_hint: None,
320            }]
321        );
322        assert_eq!(
323            parser.finish(),
324            EditParserMetrics {
325                tags: 2,
326                mismatched_tags: 0
327            }
328        );
329    }
330
331    #[gpui::test(iterations = 1000)]
332    fn test_empty_old_and_new_text(mut rng: StdRng) {
333        let mut parser = EditParser::new();
334        assert_eq!(
335            parse_random_chunks(
336                "<old_text></old_text><new_text></new_text>",
337                &mut parser,
338                &mut rng
339            ),
340            vec![Edit {
341                old_text: "".to_string(),
342                new_text: "".to_string(),
343                line_hint: None,
344            }]
345        );
346        assert_eq!(
347            parser.finish(),
348            EditParserMetrics {
349                tags: 2,
350                mismatched_tags: 0
351            }
352        );
353    }
354
355    #[gpui::test(iterations = 100)]
356    fn test_multiline_content(mut rng: StdRng) {
357        let mut parser = EditParser::new();
358        assert_eq!(
359            parse_random_chunks(
360                "<old_text>line1\nline2\nline3</old_text><new_text>line1\nmodified line2\nline3</new_text>",
361                &mut parser,
362                &mut rng
363            ),
364            vec![Edit {
365                old_text: "line1\nline2\nline3".to_string(),
366                new_text: "line1\nmodified line2\nline3".to_string(),
367                line_hint: None,
368            }]
369        );
370        assert_eq!(
371            parser.finish(),
372            EditParserMetrics {
373                tags: 2,
374                mismatched_tags: 0
375            }
376        );
377    }
378
379    #[gpui::test(iterations = 1000)]
380    fn test_mismatched_tags(mut rng: StdRng) {
381        let mut parser = EditParser::new();
382        assert_eq!(
383            parse_random_chunks(
384                // Reduced from an actual Sonnet 3.7 output
385                indoc! {"
386                    <old_text>
387                    a
388                    b
389                    c
390                    </new_text>
391                    <new_text>
392                    a
393                    B
394                    c
395                    </old_text>
396                    <old_text>
397                    d
398                    e
399                    f
400                    </new_text>
401                    <new_text>
402                    D
403                    e
404                    F
405                    </old_text>
406                "},
407                &mut parser,
408                &mut rng
409            ),
410            vec![
411                Edit {
412                    old_text: "a\nb\nc".to_string(),
413                    new_text: "a\nB\nc".to_string(),
414                    line_hint: None,
415                },
416                Edit {
417                    old_text: "d\ne\nf".to_string(),
418                    new_text: "D\ne\nF".to_string(),
419                    line_hint: None,
420                }
421            ]
422        );
423        assert_eq!(
424            parser.finish(),
425            EditParserMetrics {
426                tags: 4,
427                mismatched_tags: 4
428            }
429        );
430
431        let mut parser = EditParser::new();
432        assert_eq!(
433            parse_random_chunks(
434                // Reduced from an actual Opus 4 output
435                indoc! {"
436                    <edits>
437                    <old_text>
438                    Lorem
439                    </old_text>
440                    <new_text>
441                    LOREM
442                    </edits>
443                "},
444                &mut parser,
445                &mut rng
446            ),
447            vec![Edit {
448                old_text: "Lorem".to_string(),
449                new_text: "LOREM".to_string(),
450                line_hint: None,
451            },]
452        );
453        assert_eq!(
454            parser.finish(),
455            EditParserMetrics {
456                tags: 2,
457                mismatched_tags: 1
458            }
459        );
460    }
461
462    #[gpui::test(iterations = 100)]
463    fn test_line_hints(mut rng: StdRng) {
464        // Line hint is a single quoted line number
465        let mut parser = EditParser::new();
466
467        let edits = parse_random_chunks(
468            r#"
469                    <old_text line="23">original code</old_text>
470                    <new_text>updated code</new_text>"#,
471            &mut parser,
472            &mut rng,
473        );
474
475        assert_eq!(edits.len(), 1);
476        assert_eq!(edits[0].old_text, "original code");
477        assert_eq!(edits[0].line_hint, Some(23));
478        assert_eq!(edits[0].new_text, "updated code");
479
480        // Line hint is a single unquoted line number
481        let mut parser = EditParser::new();
482
483        let edits = parse_random_chunks(
484            r#"
485                    <old_text line=45>original code</old_text>
486                    <new_text>updated code</new_text>"#,
487            &mut parser,
488            &mut rng,
489        );
490
491        assert_eq!(edits.len(), 1);
492        assert_eq!(edits[0].old_text, "original code");
493        assert_eq!(edits[0].line_hint, Some(45));
494        assert_eq!(edits[0].new_text, "updated code");
495
496        // Line hint is a range
497        let mut parser = EditParser::new();
498
499        let edits = parse_random_chunks(
500            r#"
501            <old_text line="23:50">original code</old_text>
502            <new_text>updated code</new_text>"#,
503            &mut parser,
504            &mut rng,
505        );
506
507        assert_eq!(edits.len(), 1);
508        assert_eq!(edits[0].old_text, "original code");
509        assert_eq!(edits[0].line_hint, Some(23));
510        assert_eq!(edits[0].new_text, "updated code");
511
512        // No line hint
513        let mut parser = EditParser::new();
514        let edits = parse_random_chunks(
515            r#"
516            <old_text>old</old_text>
517            <new_text>new</new_text>"#,
518            &mut parser,
519            &mut rng,
520        );
521
522        assert_eq!(edits.len(), 1);
523        assert_eq!(edits[0].old_text, "old");
524        assert_eq!(edits[0].line_hint, None);
525        assert_eq!(edits[0].new_text, "new");
526    }
527
528    #[derive(Default, Debug, PartialEq, Eq)]
529    struct Edit {
530        old_text: String,
531        new_text: String,
532        line_hint: Option<u32>,
533    }
534
535    fn parse_random_chunks(input: &str, parser: &mut EditParser, rng: &mut StdRng) -> Vec<Edit> {
536        let chunk_count = rng.gen_range(1..=cmp::min(input.len(), 50));
537        let mut chunk_indices = (0..input.len()).choose_multiple(rng, chunk_count);
538        chunk_indices.sort();
539        chunk_indices.push(input.len());
540
541        let mut old_text = Some(String::new());
542        let mut new_text = None;
543        let mut pending_edit = Edit::default();
544        let mut edits = Vec::new();
545        let mut last_ix = 0;
546        for chunk_ix in chunk_indices {
547            for event in parser.push(&input[last_ix..chunk_ix]) {
548                match event {
549                    EditParserEvent::OldTextChunk {
550                        chunk,
551                        done,
552                        line_hint,
553                    } => {
554                        old_text.as_mut().unwrap().push_str(&chunk);
555                        if done {
556                            pending_edit.old_text = old_text.take().unwrap();
557                            pending_edit.line_hint = line_hint;
558                            new_text = Some(String::new());
559                        }
560                    }
561                    EditParserEvent::NewTextChunk { chunk, done } => {
562                        new_text.as_mut().unwrap().push_str(&chunk);
563                        if done {
564                            pending_edit.new_text = new_text.take().unwrap();
565                            edits.push(pending_edit);
566                            pending_edit = Edit::default();
567                            old_text = Some(String::new());
568                        }
569                    }
570                }
571            }
572            last_ix = chunk_ix;
573        }
574
575        edits
576    }
577}