edit_parser.rs

  1use derive_more::{Add, AddAssign};
  2use smallvec::SmallVec;
  3use std::{cmp, mem, ops::Range};
  4
  5const OLD_TEXT_END_TAG: &str = "</old_text>";
  6const NEW_TEXT_END_TAG: &str = "</new_text>";
  7const END_TAG_LEN: usize = OLD_TEXT_END_TAG.len();
  8const _: () = debug_assert!(OLD_TEXT_END_TAG.len() == NEW_TEXT_END_TAG.len());
  9
 10#[derive(Debug)]
 11pub enum EditParserEvent {
 12    OldText(String),
 13    NewTextChunk { chunk: String, done: bool },
 14}
 15
 16#[derive(Clone, Debug, Default, PartialEq, Eq, Add, AddAssign)]
 17pub struct EditParserMetrics {
 18    pub tags: usize,
 19    pub mismatched_tags: usize,
 20}
 21
 22#[derive(Debug)]
 23pub struct EditParser {
 24    state: EditParserState,
 25    buffer: String,
 26    metrics: EditParserMetrics,
 27}
 28
 29#[derive(Debug, PartialEq)]
 30enum EditParserState {
 31    Pending,
 32    WithinOldText,
 33    AfterOldText,
 34    WithinNewText { start: bool },
 35}
 36
 37impl EditParser {
 38    pub fn new() -> Self {
 39        EditParser {
 40            state: EditParserState::Pending,
 41            buffer: String::new(),
 42            metrics: EditParserMetrics::default(),
 43        }
 44    }
 45
 46    pub fn push(&mut self, chunk: &str) -> SmallVec<[EditParserEvent; 1]> {
 47        self.buffer.push_str(chunk);
 48
 49        let mut edit_events = SmallVec::new();
 50        loop {
 51            match &mut self.state {
 52                EditParserState::Pending => {
 53                    if let Some(start) = self.buffer.find("<old_text>") {
 54                        self.buffer.drain(..start + "<old_text>".len());
 55                        self.state = EditParserState::WithinOldText;
 56                    } else {
 57                        break;
 58                    }
 59                }
 60                EditParserState::WithinOldText => {
 61                    if let Some(tag_range) = self.find_end_tag() {
 62                        let mut start = 0;
 63                        if self.buffer.starts_with('\n') {
 64                            start = 1;
 65                        }
 66                        let mut old_text = self.buffer[start..tag_range.start].to_string();
 67                        if old_text.ends_with('\n') {
 68                            old_text.pop();
 69                        }
 70
 71                        self.metrics.tags += 1;
 72                        if &self.buffer[tag_range.clone()] != OLD_TEXT_END_TAG {
 73                            self.metrics.mismatched_tags += 1;
 74                        }
 75
 76                        self.buffer.drain(..tag_range.end);
 77                        self.state = EditParserState::AfterOldText;
 78                        edit_events.push(EditParserEvent::OldText(old_text));
 79                    } else {
 80                        break;
 81                    }
 82                }
 83                EditParserState::AfterOldText => {
 84                    if let Some(start) = self.buffer.find("<new_text>") {
 85                        self.buffer.drain(..start + "<new_text>".len());
 86                        self.state = EditParserState::WithinNewText { start: true };
 87                    } else {
 88                        break;
 89                    }
 90                }
 91                EditParserState::WithinNewText { start } => {
 92                    if !self.buffer.is_empty() {
 93                        if *start && self.buffer.starts_with('\n') {
 94                            self.buffer.remove(0);
 95                        }
 96                        *start = false;
 97                    }
 98
 99                    if let Some(tag_range) = self.find_end_tag() {
100                        let mut chunk = self.buffer[..tag_range.start].to_string();
101                        if chunk.ends_with('\n') {
102                            chunk.pop();
103                        }
104
105                        self.metrics.tags += 1;
106                        if &self.buffer[tag_range.clone()] != NEW_TEXT_END_TAG {
107                            self.metrics.mismatched_tags += 1;
108                        }
109
110                        self.buffer.drain(..tag_range.end);
111                        self.state = EditParserState::Pending;
112                        edit_events.push(EditParserEvent::NewTextChunk { chunk, done: true });
113                    } else {
114                        let mut end_prefixes = (1..END_TAG_LEN)
115                            .flat_map(|i| [&NEW_TEXT_END_TAG[..i], &OLD_TEXT_END_TAG[..i]])
116                            .chain(["\n"]);
117                        if end_prefixes.all(|prefix| !self.buffer.ends_with(&prefix)) {
118                            edit_events.push(EditParserEvent::NewTextChunk {
119                                chunk: mem::take(&mut self.buffer),
120                                done: false,
121                            });
122                        }
123                        break;
124                    }
125                }
126            }
127        }
128        edit_events
129    }
130
131    fn find_end_tag(&self) -> Option<Range<usize>> {
132        let old_text_end_tag_ix = self.buffer.find(OLD_TEXT_END_TAG);
133        let new_text_end_tag_ix = self.buffer.find(NEW_TEXT_END_TAG);
134        let start_ix = if let Some((old_text_ix, new_text_ix)) =
135            old_text_end_tag_ix.zip(new_text_end_tag_ix)
136        {
137            cmp::min(old_text_ix, new_text_ix)
138        } else {
139            old_text_end_tag_ix.or(new_text_end_tag_ix)?
140        };
141        Some(start_ix..start_ix + END_TAG_LEN)
142    }
143
144    pub fn finish(self) -> EditParserMetrics {
145        self.metrics
146    }
147}
148
#[cfg(test)]
mod tests {
    use super::*;
    use indoc::indoc;
    use rand::prelude::*;
    use std::cmp;

    // Every test feeds its input to a fresh parser in randomly sized pieces,
    // so that tags split across `push` calls are exercised as well.

    #[gpui::test(iterations = 1000)]
    fn test_single_edit(mut rng: StdRng) {
        assert_parse(
            "<old_text>original</old_text><new_text>updated</new_text>",
            &mut rng,
            vec![edit("original", "updated")],
            metrics(2, 0),
        );
    }

    #[gpui::test(iterations = 1000)]
    fn test_multiple_edits(mut rng: StdRng) {
        assert_parse(
            indoc! {"
                <old_text>
                first old
                </old_text><new_text>first new</new_text>
                <old_text>second old</old_text><new_text>
                second new
                </new_text>
            "},
            &mut rng,
            vec![
                edit("first old", "first new"),
                edit("second old", "second new"),
            ],
            metrics(4, 0),
        );
    }

    #[gpui::test(iterations = 1000)]
    fn test_edits_with_extra_text(mut rng: StdRng) {
        assert_parse(
            indoc! {"
                ignore this <old_text>
                content</old_text>extra stuff<new_text>updated content</new_text>trailing data
                more text <old_text>second item
                </old_text>middle text<new_text>modified second item</new_text>end
                <old_text>third case</old_text><new_text>improved third case</new_text> with trailing text
            "},
            &mut rng,
            vec![
                edit("content", "updated content"),
                edit("second item", "modified second item"),
                edit("third case", "improved third case"),
            ],
            metrics(6, 0),
        );
    }

    #[gpui::test(iterations = 1000)]
    fn test_nested_tags(mut rng: StdRng) {
        assert_parse(
            "<old_text>code with <tag>nested</tag> elements</old_text><new_text>new <code>content</code></new_text>",
            &mut rng,
            vec![edit(
                "code with <tag>nested</tag> elements",
                "new <code>content</code>",
            )],
            metrics(2, 0),
        );
    }

    #[gpui::test(iterations = 1000)]
    fn test_empty_old_and_new_text(mut rng: StdRng) {
        assert_parse(
            "<old_text></old_text><new_text></new_text>",
            &mut rng,
            vec![edit("", "")],
            metrics(2, 0),
        );
    }

    #[gpui::test(iterations = 100)]
    fn test_multiline_content(mut rng: StdRng) {
        assert_parse(
            "<old_text>line1\nline2\nline3</old_text><new_text>line1\nmodified line2\nline3</new_text>",
            &mut rng,
            vec![edit("line1\nline2\nline3", "line1\nmodified line2\nline3")],
            metrics(2, 0),
        );
    }

    #[gpui::test(iterations = 1000)]
    fn test_mismatched_tags(mut rng: StdRng) {
        assert_parse(
            // Reduced from an actual Sonnet 3.7 output
            indoc! {"
                <old_text>
                a
                b
                c
                </new_text>
                <new_text>
                a
                B
                c
                </old_text>
                <old_text>
                d
                e
                f
                </new_text>
                <new_text>
                D
                e
                F
                </old_text>
            "},
            &mut rng,
            vec![edit("a\nb\nc", "a\nB\nc"), edit("d\ne\nf", "D\ne\nF")],
            metrics(4, 4),
        );
    }

    /// One fully parsed old/new pair, assembled from parser events.
    #[derive(Default, Debug, PartialEq, Eq)]
    struct Edit {
        old_text: String,
        new_text: String,
    }

    /// Shorthand for building an expected [`Edit`].
    fn edit(old_text: &str, new_text: &str) -> Edit {
        Edit {
            old_text: old_text.to_string(),
            new_text: new_text.to_string(),
        }
    }

    /// Shorthand for building expected [`EditParserMetrics`].
    fn metrics(tags: usize, mismatched_tags: usize) -> EditParserMetrics {
        EditParserMetrics {
            tags,
            mismatched_tags,
        }
    }

    /// Parses `input` with a fresh parser and checks first the resulting
    /// edits, then the metrics reported by `finish`.
    fn assert_parse(
        input: &str,
        rng: &mut StdRng,
        expected_edits: Vec<Edit>,
        expected_metrics: EditParserMetrics,
    ) {
        let mut parser = EditParser::new();
        assert_eq!(parse_random_chunks(input, &mut parser, rng), expected_edits);
        assert_eq!(parser.finish(), expected_metrics);
    }

    /// Cuts `input` at randomly chosen byte offsets, pushes the pieces into
    /// `parser` in order, and collects the edits described by the events.
    fn parse_random_chunks(input: &str, parser: &mut EditParser, rng: &mut StdRng) -> Vec<Edit> {
        let max_boundaries = cmp::min(input.len(), 50);
        let boundary_count = rng.gen_range(1..=max_boundaries);
        let mut boundaries = (0..input.len()).choose_multiple(rng, boundary_count);
        boundaries.sort();
        // The final piece always runs to the end of the input.
        boundaries.push(input.len());

        let mut completed = Vec::new();
        let mut current = Edit::default();
        let mut consumed = 0;
        for boundary in boundaries {
            let events = parser.push(&input[consumed..boundary]);
            consumed = boundary;
            for event in events {
                match event {
                    EditParserEvent::OldText(old_text) => current.old_text = old_text,
                    EditParserEvent::NewTextChunk { chunk, done } => {
                        current.new_text.push_str(&chunk);
                        if done {
                            completed.push(std::mem::take(&mut current));
                        }
                    }
                }
            }
        }
        completed
    }
}