edit_parser.rs

  1use derive_more::{Add, AddAssign};
  2use schemars::JsonSchema;
  3use serde::{Deserialize, Serialize};
  4use smallvec::SmallVec;
  5use std::{mem, ops::Range};
  6
  7const OLD_TEXT_END_TAG: &str = "</old_text>";
  8const NEW_TEXT_END_TAG: &str = "</new_text>";
  9const EDITS_END_TAG: &str = "</edits>";
 10const END_TAGS: [&str; 3] = [OLD_TEXT_END_TAG, NEW_TEXT_END_TAG, EDITS_END_TAG];
 11
 12#[derive(Debug)]
 13pub enum EditParserEvent {
 14    OldText(String),
 15    NewTextChunk { chunk: String, done: bool },
 16}
 17
 18#[derive(
 19    Clone, Debug, Default, PartialEq, Eq, Add, AddAssign, Serialize, Deserialize, JsonSchema,
 20)]
 21pub struct EditParserMetrics {
 22    pub tags: usize,
 23    pub mismatched_tags: usize,
 24}
 25
 26#[derive(Debug)]
 27pub struct EditParser {
 28    state: EditParserState,
 29    buffer: String,
 30    metrics: EditParserMetrics,
 31}
 32
 33#[derive(Debug, PartialEq)]
 34enum EditParserState {
 35    Pending,
 36    WithinOldText,
 37    AfterOldText,
 38    WithinNewText { start: bool },
 39}
 40
 41impl EditParser {
 42    pub fn new() -> Self {
 43        EditParser {
 44            state: EditParserState::Pending,
 45            buffer: String::new(),
 46            metrics: EditParserMetrics::default(),
 47        }
 48    }
 49
 50    pub fn push(&mut self, chunk: &str) -> SmallVec<[EditParserEvent; 1]> {
 51        self.buffer.push_str(chunk);
 52
 53        let mut edit_events = SmallVec::new();
 54        loop {
 55            match &mut self.state {
 56                EditParserState::Pending => {
 57                    if let Some(start) = self.buffer.find("<old_text>") {
 58                        self.buffer.drain(..start + "<old_text>".len());
 59                        self.state = EditParserState::WithinOldText;
 60                    } else {
 61                        break;
 62                    }
 63                }
 64                EditParserState::WithinOldText => {
 65                    if let Some(tag_range) = self.find_end_tag() {
 66                        let mut start = 0;
 67                        if self.buffer.starts_with('\n') {
 68                            start = 1;
 69                        }
 70                        let mut old_text = self.buffer[start..tag_range.start].to_string();
 71                        if old_text.ends_with('\n') {
 72                            old_text.pop();
 73                        }
 74
 75                        self.metrics.tags += 1;
 76                        if &self.buffer[tag_range.clone()] != OLD_TEXT_END_TAG {
 77                            self.metrics.mismatched_tags += 1;
 78                        }
 79
 80                        self.buffer.drain(..tag_range.end);
 81                        self.state = EditParserState::AfterOldText;
 82                        edit_events.push(EditParserEvent::OldText(old_text));
 83                    } else {
 84                        break;
 85                    }
 86                }
 87                EditParserState::AfterOldText => {
 88                    if let Some(start) = self.buffer.find("<new_text>") {
 89                        self.buffer.drain(..start + "<new_text>".len());
 90                        self.state = EditParserState::WithinNewText { start: true };
 91                    } else {
 92                        break;
 93                    }
 94                }
 95                EditParserState::WithinNewText { start } => {
 96                    if !self.buffer.is_empty() {
 97                        if *start && self.buffer.starts_with('\n') {
 98                            self.buffer.remove(0);
 99                        }
100                        *start = false;
101                    }
102
103                    if let Some(tag_range) = self.find_end_tag() {
104                        let mut chunk = self.buffer[..tag_range.start].to_string();
105                        if chunk.ends_with('\n') {
106                            chunk.pop();
107                        }
108
109                        self.metrics.tags += 1;
110                        if &self.buffer[tag_range.clone()] != NEW_TEXT_END_TAG {
111                            self.metrics.mismatched_tags += 1;
112                        }
113
114                        self.buffer.drain(..tag_range.end);
115                        self.state = EditParserState::Pending;
116                        edit_events.push(EditParserEvent::NewTextChunk { chunk, done: true });
117                    } else {
118                        let mut end_prefixes = END_TAGS
119                            .iter()
120                            .flat_map(|tag| (1..tag.len()).map(move |i| &tag[..i]))
121                            .chain(["\n"]);
122                        if end_prefixes.all(|prefix| !self.buffer.ends_with(&prefix)) {
123                            edit_events.push(EditParserEvent::NewTextChunk {
124                                chunk: mem::take(&mut self.buffer),
125                                done: false,
126                            });
127                        }
128                        break;
129                    }
130                }
131            }
132        }
133        edit_events
134    }
135
136    fn find_end_tag(&self) -> Option<Range<usize>> {
137        let (tag, start_ix) = END_TAGS
138            .iter()
139            .flat_map(|tag| Some((tag, self.buffer.find(tag)?)))
140            .min_by_key(|(_, ix)| *ix)?;
141        Some(start_ix..start_ix + tag.len())
142    }
143
144    pub fn finish(self) -> EditParserMetrics {
145        self.metrics
146    }
147}
148
149#[cfg(test)]
150mod tests {
151    use super::*;
152    use indoc::indoc;
153    use rand::prelude::*;
154    use std::cmp;
155
156    #[gpui::test(iterations = 1000)]
157    fn test_single_edit(mut rng: StdRng) {
158        let mut parser = EditParser::new();
159        assert_eq!(
160            parse_random_chunks(
161                "<old_text>original</old_text><new_text>updated</new_text>",
162                &mut parser,
163                &mut rng
164            ),
165            vec![Edit {
166                old_text: "original".to_string(),
167                new_text: "updated".to_string(),
168            }]
169        );
170        assert_eq!(
171            parser.finish(),
172            EditParserMetrics {
173                tags: 2,
174                mismatched_tags: 0
175            }
176        );
177    }
178
179    #[gpui::test(iterations = 1000)]
180    fn test_multiple_edits(mut rng: StdRng) {
181        let mut parser = EditParser::new();
182        assert_eq!(
183            parse_random_chunks(
184                indoc! {"
185                    <old_text>
186                    first old
187                    </old_text><new_text>first new</new_text>
188                    <old_text>second old</old_text><new_text>
189                    second new
190                    </new_text>
191                "},
192                &mut parser,
193                &mut rng
194            ),
195            vec![
196                Edit {
197                    old_text: "first old".to_string(),
198                    new_text: "first new".to_string(),
199                },
200                Edit {
201                    old_text: "second old".to_string(),
202                    new_text: "second new".to_string(),
203                },
204            ]
205        );
206        assert_eq!(
207            parser.finish(),
208            EditParserMetrics {
209                tags: 4,
210                mismatched_tags: 0
211            }
212        );
213    }
214
215    #[gpui::test(iterations = 1000)]
216    fn test_edits_with_extra_text(mut rng: StdRng) {
217        let mut parser = EditParser::new();
218        assert_eq!(
219            parse_random_chunks(
220                indoc! {"
221                    ignore this <old_text>
222                    content</old_text>extra stuff<new_text>updated content</new_text>trailing data
223                    more text <old_text>second item
224                    </old_text>middle text<new_text>modified second item</new_text>end
225                    <old_text>third case</old_text><new_text>improved third case</new_text> with trailing text
226                "},
227                &mut parser,
228                &mut rng
229            ),
230            vec![
231                Edit {
232                    old_text: "content".to_string(),
233                    new_text: "updated content".to_string(),
234                },
235                Edit {
236                    old_text: "second item".to_string(),
237                    new_text: "modified second item".to_string(),
238                },
239                Edit {
240                    old_text: "third case".to_string(),
241                    new_text: "improved third case".to_string(),
242                },
243            ]
244        );
245        assert_eq!(
246            parser.finish(),
247            EditParserMetrics {
248                tags: 6,
249                mismatched_tags: 0
250            }
251        );
252    }
253
254    #[gpui::test(iterations = 1000)]
255    fn test_nested_tags(mut rng: StdRng) {
256        let mut parser = EditParser::new();
257        assert_eq!(
258            parse_random_chunks(
259                "<old_text>code with <tag>nested</tag> elements</old_text><new_text>new <code>content</code></new_text>",
260                &mut parser,
261                &mut rng
262            ),
263            vec![Edit {
264                old_text: "code with <tag>nested</tag> elements".to_string(),
265                new_text: "new <code>content</code>".to_string(),
266            }]
267        );
268        assert_eq!(
269            parser.finish(),
270            EditParserMetrics {
271                tags: 2,
272                mismatched_tags: 0
273            }
274        );
275    }
276
277    #[gpui::test(iterations = 1000)]
278    fn test_empty_old_and_new_text(mut rng: StdRng) {
279        let mut parser = EditParser::new();
280        assert_eq!(
281            parse_random_chunks(
282                "<old_text></old_text><new_text></new_text>",
283                &mut parser,
284                &mut rng
285            ),
286            vec![Edit {
287                old_text: "".to_string(),
288                new_text: "".to_string(),
289            }]
290        );
291        assert_eq!(
292            parser.finish(),
293            EditParserMetrics {
294                tags: 2,
295                mismatched_tags: 0
296            }
297        );
298    }
299
300    #[gpui::test(iterations = 100)]
301    fn test_multiline_content(mut rng: StdRng) {
302        let mut parser = EditParser::new();
303        assert_eq!(
304            parse_random_chunks(
305                "<old_text>line1\nline2\nline3</old_text><new_text>line1\nmodified line2\nline3</new_text>",
306                &mut parser,
307                &mut rng
308            ),
309            vec![Edit {
310                old_text: "line1\nline2\nline3".to_string(),
311                new_text: "line1\nmodified line2\nline3".to_string(),
312            }]
313        );
314        assert_eq!(
315            parser.finish(),
316            EditParserMetrics {
317                tags: 2,
318                mismatched_tags: 0
319            }
320        );
321    }
322
323    #[gpui::test(iterations = 1000)]
324    fn test_mismatched_tags(mut rng: StdRng) {
325        let mut parser = EditParser::new();
326        assert_eq!(
327            parse_random_chunks(
328                // Reduced from an actual Sonnet 3.7 output
329                indoc! {"
330                    <old_text>
331                    a
332                    b
333                    c
334                    </new_text>
335                    <new_text>
336                    a
337                    B
338                    c
339                    </old_text>
340                    <old_text>
341                    d
342                    e
343                    f
344                    </new_text>
345                    <new_text>
346                    D
347                    e
348                    F
349                    </old_text>
350                "},
351                &mut parser,
352                &mut rng
353            ),
354            vec![
355                Edit {
356                    old_text: "a\nb\nc".to_string(),
357                    new_text: "a\nB\nc".to_string(),
358                },
359                Edit {
360                    old_text: "d\ne\nf".to_string(),
361                    new_text: "D\ne\nF".to_string(),
362                }
363            ]
364        );
365        assert_eq!(
366            parser.finish(),
367            EditParserMetrics {
368                tags: 4,
369                mismatched_tags: 4
370            }
371        );
372
373        let mut parser = EditParser::new();
374        assert_eq!(
375            parse_random_chunks(
376                // Reduced from an actual Opus 4 output
377                indoc! {"
378                    <edits>
379                    <old_text>
380                    Lorem
381                    </old_text>
382                    <new_text>
383                    LOREM
384                    </edits>
385                "},
386                &mut parser,
387                &mut rng
388            ),
389            vec![Edit {
390                old_text: "Lorem".to_string(),
391                new_text: "LOREM".to_string(),
392            },]
393        );
394        assert_eq!(
395            parser.finish(),
396            EditParserMetrics {
397                tags: 2,
398                mismatched_tags: 1
399            }
400        );
401    }
402
403    #[derive(Default, Debug, PartialEq, Eq)]
404    struct Edit {
405        old_text: String,
406        new_text: String,
407    }
408
409    fn parse_random_chunks(input: &str, parser: &mut EditParser, rng: &mut StdRng) -> Vec<Edit> {
410        let chunk_count = rng.gen_range(1..=cmp::min(input.len(), 50));
411        let mut chunk_indices = (0..input.len()).choose_multiple(rng, chunk_count);
412        chunk_indices.sort();
413        chunk_indices.push(input.len());
414
415        let mut pending_edit = Edit::default();
416        let mut edits = Vec::new();
417        let mut last_ix = 0;
418        for chunk_ix in chunk_indices {
419            for event in parser.push(&input[last_ix..chunk_ix]) {
420                match event {
421                    EditParserEvent::OldText(old_text) => {
422                        pending_edit.old_text = old_text;
423                    }
424                    EditParserEvent::NewTextChunk { chunk, done } => {
425                        pending_edit.new_text.push_str(&chunk);
426                        if done {
427                            edits.push(pending_edit);
428                            pending_edit = Edit::default();
429                        }
430                    }
431                }
432            }
433            last_ix = chunk_ix;
434        }
435
436        assert_eq!(pending_edit, Edit::default(), "unfinished edit");
437
438        edits
439    }
440}