edit_parser.rs

  1use derive_more::{Add, AddAssign};
  2use schemars::JsonSchema;
  3use serde::{Deserialize, Serialize};
  4use smallvec::SmallVec;
  5use std::{cmp, mem, ops::Range};
  6
  /// Closing tag expected at the end of an `<old_text>` section.
  const OLD_TEXT_END_TAG: &str = "</old_text>";
  /// Closing tag expected at the end of a `<new_text>` section.
  const NEW_TEXT_END_TAG: &str = "</new_text>";
  /// Byte length of a closing tag; a single value is used for both tags.
  const END_TAG_LEN: usize = OLD_TEXT_END_TAG.len();
  // Compile-time guarantee that one length really fits both closing tags.
  // `assert!` (not `debug_assert!`) so the check is also enforced when
  // `debug_assertions` is disabled; it is evaluated at compile time and
  // therefore has no runtime cost.
  const _: () = assert!(OLD_TEXT_END_TAG.len() == NEW_TEXT_END_TAG.len());
 11
 /// An event produced by [`EditParser::push`] while consuming streamed input.
 #[derive(Debug)]
 pub enum EditParserEvent {
     /// The complete text of an `<old_text>` section, emitted once its closing
     /// tag has been seen. One leading and one trailing newline are removed.
     OldText(String),
     /// A piece of the current `<new_text>` section.
     NewTextChunk {
         /// The text carried by this event.
         chunk: String,
         /// `true` for the final chunk of the section (its closing tag was
         /// seen), `false` for chunks emitted while the section is still open.
         done: bool,
     },
 }
 17
 18#[derive(
 19    Clone, Debug, Default, PartialEq, Eq, Add, AddAssign, Serialize, Deserialize, JsonSchema,
 20)]
 21pub struct EditParserMetrics {
 22    pub tags: usize,
 23    pub mismatched_tags: usize,
 24}
 25
 26#[derive(Debug)]
 27pub struct EditParser {
 28    state: EditParserState,
 29    buffer: String,
 30    metrics: EditParserMetrics,
 31}
 32
 /// Position of the parser within the edit-block grammar.
 #[derive(Debug, PartialEq)]
 enum EditParserState {
     /// Looking for the next `<old_text>` opening tag.
     Pending,
     /// Inside an old-text section, waiting for a closing tag.
     WithinOldText,
     /// Old text finished; looking for the `<new_text>` opening tag.
     AfterOldText,
     /// Inside a new-text section, waiting for a closing tag.
     WithinNewText {
         /// `true` until the first byte of the section has been inspected, so
         /// that a single leading newline can be dropped.
         start: bool,
     },
 }
 40
 41impl EditParser {
 42    pub fn new() -> Self {
 43        EditParser {
 44            state: EditParserState::Pending,
 45            buffer: String::new(),
 46            metrics: EditParserMetrics::default(),
 47        }
 48    }
 49
 50    pub fn push(&mut self, chunk: &str) -> SmallVec<[EditParserEvent; 1]> {
 51        self.buffer.push_str(chunk);
 52
 53        let mut edit_events = SmallVec::new();
 54        loop {
 55            match &mut self.state {
 56                EditParserState::Pending => {
 57                    if let Some(start) = self.buffer.find("<old_text>") {
 58                        self.buffer.drain(..start + "<old_text>".len());
 59                        self.state = EditParserState::WithinOldText;
 60                    } else {
 61                        break;
 62                    }
 63                }
 64                EditParserState::WithinOldText => {
 65                    if let Some(tag_range) = self.find_end_tag() {
 66                        let mut start = 0;
 67                        if self.buffer.starts_with('\n') {
 68                            start = 1;
 69                        }
 70                        let mut old_text = self.buffer[start..tag_range.start].to_string();
 71                        if old_text.ends_with('\n') {
 72                            old_text.pop();
 73                        }
 74
 75                        self.metrics.tags += 1;
 76                        if &self.buffer[tag_range.clone()] != OLD_TEXT_END_TAG {
 77                            self.metrics.mismatched_tags += 1;
 78                        }
 79
 80                        self.buffer.drain(..tag_range.end);
 81                        self.state = EditParserState::AfterOldText;
 82                        edit_events.push(EditParserEvent::OldText(old_text));
 83                    } else {
 84                        break;
 85                    }
 86                }
 87                EditParserState::AfterOldText => {
 88                    if let Some(start) = self.buffer.find("<new_text>") {
 89                        self.buffer.drain(..start + "<new_text>".len());
 90                        self.state = EditParserState::WithinNewText { start: true };
 91                    } else {
 92                        break;
 93                    }
 94                }
 95                EditParserState::WithinNewText { start } => {
 96                    if !self.buffer.is_empty() {
 97                        if *start && self.buffer.starts_with('\n') {
 98                            self.buffer.remove(0);
 99                        }
100                        *start = false;
101                    }
102
103                    if let Some(tag_range) = self.find_end_tag() {
104                        let mut chunk = self.buffer[..tag_range.start].to_string();
105                        if chunk.ends_with('\n') {
106                            chunk.pop();
107                        }
108
109                        self.metrics.tags += 1;
110                        if &self.buffer[tag_range.clone()] != NEW_TEXT_END_TAG {
111                            self.metrics.mismatched_tags += 1;
112                        }
113
114                        self.buffer.drain(..tag_range.end);
115                        self.state = EditParserState::Pending;
116                        edit_events.push(EditParserEvent::NewTextChunk { chunk, done: true });
117                    } else {
118                        let mut end_prefixes = (1..END_TAG_LEN)
119                            .flat_map(|i| [&NEW_TEXT_END_TAG[..i], &OLD_TEXT_END_TAG[..i]])
120                            .chain(["\n"]);
121                        if end_prefixes.all(|prefix| !self.buffer.ends_with(&prefix)) {
122                            edit_events.push(EditParserEvent::NewTextChunk {
123                                chunk: mem::take(&mut self.buffer),
124                                done: false,
125                            });
126                        }
127                        break;
128                    }
129                }
130            }
131        }
132        edit_events
133    }
134
135    fn find_end_tag(&self) -> Option<Range<usize>> {
136        let old_text_end_tag_ix = self.buffer.find(OLD_TEXT_END_TAG);
137        let new_text_end_tag_ix = self.buffer.find(NEW_TEXT_END_TAG);
138        let start_ix = if let Some((old_text_ix, new_text_ix)) =
139            old_text_end_tag_ix.zip(new_text_end_tag_ix)
140        {
141            cmp::min(old_text_ix, new_text_ix)
142        } else {
143            old_text_end_tag_ix.or(new_text_end_tag_ix)?
144        };
145        Some(start_ix..start_ix + END_TAG_LEN)
146    }
147
148    pub fn finish(self) -> EditParserMetrics {
149        self.metrics
150    }
151}
152
#[cfg(test)]
mod tests {
    use super::*;
    use indoc::indoc;
    use rand::prelude::*;
    use std::cmp;

    /// A lone edit on a single line, no surrounding text.
    #[gpui::test(iterations = 1000)]
    fn test_single_edit(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            "<old_text>original</old_text><new_text>updated</new_text>",
            &mut parser,
            &mut rng,
        );
        assert_eq!(edits, vec![edit("original", "updated")]);
        assert_eq!(parser.finish(), metrics(2, 0));
    }

    /// Two consecutive edits, with newlines directly inside the tags.
    #[gpui::test(iterations = 1000)]
    fn test_multiple_edits(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let input = indoc! {"
            <old_text>
            first old
            </old_text><new_text>first new</new_text>
            <old_text>second old</old_text><new_text>
            second new
            </new_text>
        "};
        let edits = parse_random_chunks(input, &mut parser, &mut rng);
        assert_eq!(
            edits,
            vec![
                edit("first old", "first new"),
                edit("second old", "second new"),
            ]
        );
        assert_eq!(parser.finish(), metrics(4, 0));
    }

    /// Text outside of the tagged sections must be ignored.
    #[gpui::test(iterations = 1000)]
    fn test_edits_with_extra_text(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let input = indoc! {"
            ignore this <old_text>
            content</old_text>extra stuff<new_text>updated content</new_text>trailing data
            more text <old_text>second item
            </old_text>middle text<new_text>modified second item</new_text>end
            <old_text>third case</old_text><new_text>improved third case</new_text> with trailing text
        "};
        let edits = parse_random_chunks(input, &mut parser, &mut rng);
        assert_eq!(
            edits,
            vec![
                edit("content", "updated content"),
                edit("second item", "modified second item"),
                edit("third case", "improved third case"),
            ]
        );
        assert_eq!(parser.finish(), metrics(6, 0));
    }

    /// Unrelated tags inside a section are plain content.
    #[gpui::test(iterations = 1000)]
    fn test_nested_tags(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            "<old_text>code with <tag>nested</tag> elements</old_text><new_text>new <code>content</code></new_text>",
            &mut parser,
            &mut rng,
        );
        assert_eq!(
            edits,
            vec![edit(
                "code with <tag>nested</tag> elements",
                "new <code>content</code>"
            )]
        );
        assert_eq!(parser.finish(), metrics(2, 0));
    }

    /// Empty sections still produce an (empty) edit.
    #[gpui::test(iterations = 1000)]
    fn test_empty_old_and_new_text(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            "<old_text></old_text><new_text></new_text>",
            &mut parser,
            &mut rng,
        );
        assert_eq!(edits, vec![edit("", "")]);
        assert_eq!(parser.finish(), metrics(2, 0));
    }

    /// Newlines in the middle of a section are preserved.
    #[gpui::test(iterations = 100)]
    fn test_multiline_content(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            "<old_text>line1\nline2\nline3</old_text><new_text>line1\nmodified line2\nline3</new_text>",
            &mut parser,
            &mut rng,
        );
        assert_eq!(
            edits,
            vec![edit("line1\nline2\nline3", "line1\nmodified line2\nline3")]
        );
        assert_eq!(parser.finish(), metrics(2, 0));
    }

    /// Sections closed with the wrong closing tag are still parsed, and each
    /// wrong tag is counted as mismatched.
    #[gpui::test(iterations = 1000)]
    fn test_mismatched_tags(mut rng: StdRng) {
        let mut parser = EditParser::new();
        // Reduced from an actual Sonnet 3.7 output
        let input = indoc! {"
            <old_text>
            a
            b
            c
            </new_text>
            <new_text>
            a
            B
            c
            </old_text>
            <old_text>
            d
            e
            f
            </new_text>
            <new_text>
            D
            e
            F
            </old_text>
        "};
        let edits = parse_random_chunks(input, &mut parser, &mut rng);
        assert_eq!(
            edits,
            vec![edit("a\nb\nc", "a\nB\nc"), edit("d\ne\nf", "D\ne\nF")]
        );
        assert_eq!(parser.finish(), metrics(4, 4));
    }

    /// One fully parsed old/new pair, assembled from parser events.
    #[derive(Default, Debug, PartialEq, Eq)]
    struct Edit {
        old_text: String,
        new_text: String,
    }

    /// Shorthand for building an expected [`Edit`].
    fn edit(old_text: &str, new_text: &str) -> Edit {
        Edit {
            old_text: old_text.to_string(),
            new_text: new_text.to_string(),
        }
    }

    /// Shorthand for building expected [`EditParserMetrics`].
    fn metrics(tags: usize, mismatched_tags: usize) -> EditParserMetrics {
        EditParserMetrics {
            tags,
            mismatched_tags,
        }
    }

    /// Splits `input` at randomly chosen byte offsets, feeds the pieces to
    /// `parser` in order and collects the edits that were completed.
    ///
    /// Offsets are byte indices used for slicing, so `input` must be sliceable
    /// at any offset (every input in this module is ASCII). `input` must not
    /// be empty, since at least one chunk is always drawn.
    fn parse_random_chunks(input: &str, parser: &mut EditParser, rng: &mut StdRng) -> Vec<Edit> {
        let max_chunks = cmp::min(input.len(), 50);
        let chunk_count = rng.gen_range(1..=max_chunks);
        let mut boundaries = (0..input.len()).choose_multiple(rng, chunk_count);
        boundaries.sort();
        // The final piece always runs to the end of the input.
        boundaries.push(input.len());

        let mut completed = Vec::new();
        let mut current = Edit::default();
        let mut piece_start = 0;
        for piece_end in boundaries {
            let events = parser.push(&input[piece_start..piece_end]);
            piece_start = piece_end;

            for event in events {
                match event {
                    EditParserEvent::OldText(old_text) => current.old_text = old_text,
                    EditParserEvent::NewTextChunk { chunk, done } => {
                        current.new_text.push_str(&chunk);
                        if done {
                            completed.push(std::mem::take(&mut current));
                        }
                    }
                }
            }
        }
        completed
    }
}