edit_parser.rs

  1use derive_more::{Add, AddAssign};
  2use schemars::JsonSchema;
  3use serde::{Deserialize, Serialize};
  4use smallvec::SmallVec;
  5use std::{mem, ops::Range};
  6
  7const OLD_TEXT_END_TAG: &str = "</old_text>";
  8const NEW_TEXT_END_TAG: &str = "</new_text>";
  9const EDITS_END_TAG: &str = "</edits>";
 10const END_TAGS: [&str; 3] = [OLD_TEXT_END_TAG, NEW_TEXT_END_TAG, EDITS_END_TAG];
 11
 12#[derive(Debug)]
 13pub enum EditParserEvent {
 14    OldTextChunk { chunk: String, done: bool },
 15    NewTextChunk { chunk: String, done: bool },
 16}
 17
 18#[derive(
 19    Clone, Debug, Default, PartialEq, Eq, Add, AddAssign, Serialize, Deserialize, JsonSchema,
 20)]
 21pub struct EditParserMetrics {
 22    pub tags: usize,
 23    pub mismatched_tags: usize,
 24}
 25
 26#[derive(Debug)]
 27pub struct EditParser {
 28    state: EditParserState,
 29    buffer: String,
 30    metrics: EditParserMetrics,
 31}
 32
 33#[derive(Debug, PartialEq)]
 34enum EditParserState {
 35    Pending,
 36    WithinOldText { start: bool },
 37    AfterOldText,
 38    WithinNewText { start: bool },
 39}
 40
 41impl EditParser {
 42    pub fn new() -> Self {
 43        EditParser {
 44            state: EditParserState::Pending,
 45            buffer: String::new(),
 46            metrics: EditParserMetrics::default(),
 47        }
 48    }
 49
 50    pub fn push(&mut self, chunk: &str) -> SmallVec<[EditParserEvent; 1]> {
 51        self.buffer.push_str(chunk);
 52
 53        let mut edit_events = SmallVec::new();
 54        loop {
 55            match &mut self.state {
 56                EditParserState::Pending => {
 57                    if let Some(start) = self.buffer.find("<old_text>") {
 58                        self.buffer.drain(..start + "<old_text>".len());
 59                        self.state = EditParserState::WithinOldText { start: true };
 60                    } else {
 61                        break;
 62                    }
 63                }
 64                EditParserState::WithinOldText { start } => {
 65                    if !self.buffer.is_empty() {
 66                        if *start && self.buffer.starts_with('\n') {
 67                            self.buffer.remove(0);
 68                        }
 69                        *start = false;
 70                    }
 71
 72                    if let Some(tag_range) = self.find_end_tag() {
 73                        let mut chunk = self.buffer[..tag_range.start].to_string();
 74                        if chunk.ends_with('\n') {
 75                            chunk.pop();
 76                        }
 77
 78                        self.metrics.tags += 1;
 79                        if &self.buffer[tag_range.clone()] != OLD_TEXT_END_TAG {
 80                            self.metrics.mismatched_tags += 1;
 81                        }
 82
 83                        self.buffer.drain(..tag_range.end);
 84                        self.state = EditParserState::AfterOldText;
 85                        edit_events.push(EditParserEvent::OldTextChunk { chunk, done: true });
 86                    } else {
 87                        if !self.ends_with_tag_prefix() {
 88                            edit_events.push(EditParserEvent::OldTextChunk {
 89                                chunk: mem::take(&mut self.buffer),
 90                                done: false,
 91                            });
 92                        }
 93                        break;
 94                    }
 95                }
 96                EditParserState::AfterOldText => {
 97                    if let Some(start) = self.buffer.find("<new_text>") {
 98                        self.buffer.drain(..start + "<new_text>".len());
 99                        self.state = EditParserState::WithinNewText { start: true };
100                    } else {
101                        break;
102                    }
103                }
104                EditParserState::WithinNewText { start } => {
105                    if !self.buffer.is_empty() {
106                        if *start && self.buffer.starts_with('\n') {
107                            self.buffer.remove(0);
108                        }
109                        *start = false;
110                    }
111
112                    if let Some(tag_range) = self.find_end_tag() {
113                        let mut chunk = self.buffer[..tag_range.start].to_string();
114                        if chunk.ends_with('\n') {
115                            chunk.pop();
116                        }
117
118                        self.metrics.tags += 1;
119                        if &self.buffer[tag_range.clone()] != NEW_TEXT_END_TAG {
120                            self.metrics.mismatched_tags += 1;
121                        }
122
123                        self.buffer.drain(..tag_range.end);
124                        self.state = EditParserState::Pending;
125                        edit_events.push(EditParserEvent::NewTextChunk { chunk, done: true });
126                    } else {
127                        if !self.ends_with_tag_prefix() {
128                            edit_events.push(EditParserEvent::NewTextChunk {
129                                chunk: mem::take(&mut self.buffer),
130                                done: false,
131                            });
132                        }
133                        break;
134                    }
135                }
136            }
137        }
138        edit_events
139    }
140
141    fn find_end_tag(&self) -> Option<Range<usize>> {
142        let (tag, start_ix) = END_TAGS
143            .iter()
144            .flat_map(|tag| Some((tag, self.buffer.find(tag)?)))
145            .min_by_key(|(_, ix)| *ix)?;
146        Some(start_ix..start_ix + tag.len())
147    }
148
149    fn ends_with_tag_prefix(&self) -> bool {
150        let mut end_prefixes = END_TAGS
151            .iter()
152            .flat_map(|tag| (1..tag.len()).map(move |i| &tag[..i]))
153            .chain(["\n"]);
154        end_prefixes.any(|prefix| self.buffer.ends_with(&prefix))
155    }
156
157    pub fn finish(self) -> EditParserMetrics {
158        self.metrics
159    }
160}
161
162#[cfg(test)]
163mod tests {
164    use super::*;
165    use indoc::indoc;
166    use rand::prelude::*;
167    use std::cmp;
168
169    #[gpui::test(iterations = 1000)]
170    fn test_single_edit(mut rng: StdRng) {
171        let mut parser = EditParser::new();
172        assert_eq!(
173            parse_random_chunks(
174                "<old_text>original</old_text><new_text>updated</new_text>",
175                &mut parser,
176                &mut rng
177            ),
178            vec![Edit {
179                old_text: "original".to_string(),
180                new_text: "updated".to_string(),
181            }]
182        );
183        assert_eq!(
184            parser.finish(),
185            EditParserMetrics {
186                tags: 2,
187                mismatched_tags: 0
188            }
189        );
190    }
191
192    #[gpui::test(iterations = 1000)]
193    fn test_multiple_edits(mut rng: StdRng) {
194        let mut parser = EditParser::new();
195        assert_eq!(
196            parse_random_chunks(
197                indoc! {"
198                    <old_text>
199                    first old
200                    </old_text><new_text>first new</new_text>
201                    <old_text>second old</old_text><new_text>
202                    second new
203                    </new_text>
204                "},
205                &mut parser,
206                &mut rng
207            ),
208            vec![
209                Edit {
210                    old_text: "first old".to_string(),
211                    new_text: "first new".to_string(),
212                },
213                Edit {
214                    old_text: "second old".to_string(),
215                    new_text: "second new".to_string(),
216                },
217            ]
218        );
219        assert_eq!(
220            parser.finish(),
221            EditParserMetrics {
222                tags: 4,
223                mismatched_tags: 0
224            }
225        );
226    }
227
228    #[gpui::test(iterations = 1000)]
229    fn test_edits_with_extra_text(mut rng: StdRng) {
230        let mut parser = EditParser::new();
231        assert_eq!(
232            parse_random_chunks(
233                indoc! {"
234                    ignore this <old_text>
235                    content</old_text>extra stuff<new_text>updated content</new_text>trailing data
236                    more text <old_text>second item
237                    </old_text>middle text<new_text>modified second item</new_text>end
238                    <old_text>third case</old_text><new_text>improved third case</new_text> with trailing text
239                "},
240                &mut parser,
241                &mut rng
242            ),
243            vec![
244                Edit {
245                    old_text: "content".to_string(),
246                    new_text: "updated content".to_string(),
247                },
248                Edit {
249                    old_text: "second item".to_string(),
250                    new_text: "modified second item".to_string(),
251                },
252                Edit {
253                    old_text: "third case".to_string(),
254                    new_text: "improved third case".to_string(),
255                },
256            ]
257        );
258        assert_eq!(
259            parser.finish(),
260            EditParserMetrics {
261                tags: 6,
262                mismatched_tags: 0
263            }
264        );
265    }
266
267    #[gpui::test(iterations = 1000)]
268    fn test_nested_tags(mut rng: StdRng) {
269        let mut parser = EditParser::new();
270        assert_eq!(
271            parse_random_chunks(
272                "<old_text>code with <tag>nested</tag> elements</old_text><new_text>new <code>content</code></new_text>",
273                &mut parser,
274                &mut rng
275            ),
276            vec![Edit {
277                old_text: "code with <tag>nested</tag> elements".to_string(),
278                new_text: "new <code>content</code>".to_string(),
279            }]
280        );
281        assert_eq!(
282            parser.finish(),
283            EditParserMetrics {
284                tags: 2,
285                mismatched_tags: 0
286            }
287        );
288    }
289
290    #[gpui::test(iterations = 1000)]
291    fn test_empty_old_and_new_text(mut rng: StdRng) {
292        let mut parser = EditParser::new();
293        assert_eq!(
294            parse_random_chunks(
295                "<old_text></old_text><new_text></new_text>",
296                &mut parser,
297                &mut rng
298            ),
299            vec![Edit {
300                old_text: "".to_string(),
301                new_text: "".to_string(),
302            }]
303        );
304        assert_eq!(
305            parser.finish(),
306            EditParserMetrics {
307                tags: 2,
308                mismatched_tags: 0
309            }
310        );
311    }
312
313    #[gpui::test(iterations = 100)]
314    fn test_multiline_content(mut rng: StdRng) {
315        let mut parser = EditParser::new();
316        assert_eq!(
317            parse_random_chunks(
318                "<old_text>line1\nline2\nline3</old_text><new_text>line1\nmodified line2\nline3</new_text>",
319                &mut parser,
320                &mut rng
321            ),
322            vec![Edit {
323                old_text: "line1\nline2\nline3".to_string(),
324                new_text: "line1\nmodified line2\nline3".to_string(),
325            }]
326        );
327        assert_eq!(
328            parser.finish(),
329            EditParserMetrics {
330                tags: 2,
331                mismatched_tags: 0
332            }
333        );
334    }
335
336    #[gpui::test(iterations = 1000)]
337    fn test_mismatched_tags(mut rng: StdRng) {
338        let mut parser = EditParser::new();
339        assert_eq!(
340            parse_random_chunks(
341                // Reduced from an actual Sonnet 3.7 output
342                indoc! {"
343                    <old_text>
344                    a
345                    b
346                    c
347                    </new_text>
348                    <new_text>
349                    a
350                    B
351                    c
352                    </old_text>
353                    <old_text>
354                    d
355                    e
356                    f
357                    </new_text>
358                    <new_text>
359                    D
360                    e
361                    F
362                    </old_text>
363                "},
364                &mut parser,
365                &mut rng
366            ),
367            vec![
368                Edit {
369                    old_text: "a\nb\nc".to_string(),
370                    new_text: "a\nB\nc".to_string(),
371                },
372                Edit {
373                    old_text: "d\ne\nf".to_string(),
374                    new_text: "D\ne\nF".to_string(),
375                }
376            ]
377        );
378        assert_eq!(
379            parser.finish(),
380            EditParserMetrics {
381                tags: 4,
382                mismatched_tags: 4
383            }
384        );
385
386        let mut parser = EditParser::new();
387        assert_eq!(
388            parse_random_chunks(
389                // Reduced from an actual Opus 4 output
390                indoc! {"
391                    <edits>
392                    <old_text>
393                    Lorem
394                    </old_text>
395                    <new_text>
396                    LOREM
397                    </edits>
398                "},
399                &mut parser,
400                &mut rng
401            ),
402            vec![Edit {
403                old_text: "Lorem".to_string(),
404                new_text: "LOREM".to_string(),
405            },]
406        );
407        assert_eq!(
408            parser.finish(),
409            EditParserMetrics {
410                tags: 2,
411                mismatched_tags: 1
412            }
413        );
414    }
415
416    #[derive(Default, Debug, PartialEq, Eq)]
417    struct Edit {
418        old_text: String,
419        new_text: String,
420    }
421
422    fn parse_random_chunks(input: &str, parser: &mut EditParser, rng: &mut StdRng) -> Vec<Edit> {
423        let chunk_count = rng.gen_range(1..=cmp::min(input.len(), 50));
424        let mut chunk_indices = (0..input.len()).choose_multiple(rng, chunk_count);
425        chunk_indices.sort();
426        chunk_indices.push(input.len());
427
428        let mut old_text = Some(String::new());
429        let mut new_text = None;
430        let mut pending_edit = Edit::default();
431        let mut edits = Vec::new();
432        let mut last_ix = 0;
433        for chunk_ix in chunk_indices {
434            for event in parser.push(&input[last_ix..chunk_ix]) {
435                match event {
436                    EditParserEvent::OldTextChunk { chunk, done } => {
437                        old_text.as_mut().unwrap().push_str(&chunk);
438                        if done {
439                            pending_edit.old_text = old_text.take().unwrap();
440                            new_text = Some(String::new());
441                        }
442                    }
443                    EditParserEvent::NewTextChunk { chunk, done } => {
444                        new_text.as_mut().unwrap().push_str(&chunk);
445                        if done {
446                            pending_edit.new_text = new_text.take().unwrap();
447                            edits.push(pending_edit);
448                            pending_edit = Edit::default();
449                            old_text = Some(String::new());
450                        }
451                    }
452                }
453            }
454            last_ix = chunk_ix;
455        }
456
457        edits
458    }
459}