1use derive_more::{Add, AddAssign};
2use schemars::JsonSchema;
3use serde::{Deserialize, Serialize};
4use smallvec::SmallVec;
5use std::{mem, ops::Range};
6
7const OLD_TEXT_END_TAG: &str = "</old_text>";
8const NEW_TEXT_END_TAG: &str = "</new_text>";
9const EDITS_END_TAG: &str = "</edits>";
10const END_TAGS: [&str; 3] = [OLD_TEXT_END_TAG, NEW_TEXT_END_TAG, EDITS_END_TAG];
11
12#[derive(Debug)]
13pub enum EditParserEvent {
14 OldText(String),
15 NewTextChunk { chunk: String, done: bool },
16}
17
18#[derive(
19 Clone, Debug, Default, PartialEq, Eq, Add, AddAssign, Serialize, Deserialize, JsonSchema,
20)]
21pub struct EditParserMetrics {
22 pub tags: usize,
23 pub mismatched_tags: usize,
24}
25
26#[derive(Debug)]
27pub struct EditParser {
28 state: EditParserState,
29 buffer: String,
30 metrics: EditParserMetrics,
31}
32
33#[derive(Debug, PartialEq)]
34enum EditParserState {
35 Pending,
36 WithinOldText,
37 AfterOldText,
38 WithinNewText { start: bool },
39}
40
41impl EditParser {
42 pub fn new() -> Self {
43 EditParser {
44 state: EditParserState::Pending,
45 buffer: String::new(),
46 metrics: EditParserMetrics::default(),
47 }
48 }
49
50 pub fn push(&mut self, chunk: &str) -> SmallVec<[EditParserEvent; 1]> {
51 self.buffer.push_str(chunk);
52
53 let mut edit_events = SmallVec::new();
54 loop {
55 match &mut self.state {
56 EditParserState::Pending => {
57 if let Some(start) = self.buffer.find("<old_text>") {
58 self.buffer.drain(..start + "<old_text>".len());
59 self.state = EditParserState::WithinOldText;
60 } else {
61 break;
62 }
63 }
64 EditParserState::WithinOldText => {
65 if let Some(tag_range) = self.find_end_tag() {
66 let mut start = 0;
67 if self.buffer.starts_with('\n') {
68 start = 1;
69 }
70 let mut old_text = self.buffer[start..tag_range.start].to_string();
71 if old_text.ends_with('\n') {
72 old_text.pop();
73 }
74
75 self.metrics.tags += 1;
76 if &self.buffer[tag_range.clone()] != OLD_TEXT_END_TAG {
77 self.metrics.mismatched_tags += 1;
78 }
79
80 self.buffer.drain(..tag_range.end);
81 self.state = EditParserState::AfterOldText;
82 edit_events.push(EditParserEvent::OldText(old_text));
83 } else {
84 break;
85 }
86 }
87 EditParserState::AfterOldText => {
88 if let Some(start) = self.buffer.find("<new_text>") {
89 self.buffer.drain(..start + "<new_text>".len());
90 self.state = EditParserState::WithinNewText { start: true };
91 } else {
92 break;
93 }
94 }
95 EditParserState::WithinNewText { start } => {
96 if !self.buffer.is_empty() {
97 if *start && self.buffer.starts_with('\n') {
98 self.buffer.remove(0);
99 }
100 *start = false;
101 }
102
103 if let Some(tag_range) = self.find_end_tag() {
104 let mut chunk = self.buffer[..tag_range.start].to_string();
105 if chunk.ends_with('\n') {
106 chunk.pop();
107 }
108
109 self.metrics.tags += 1;
110 if &self.buffer[tag_range.clone()] != NEW_TEXT_END_TAG {
111 self.metrics.mismatched_tags += 1;
112 }
113
114 self.buffer.drain(..tag_range.end);
115 self.state = EditParserState::Pending;
116 edit_events.push(EditParserEvent::NewTextChunk { chunk, done: true });
117 } else {
118 let mut end_prefixes = END_TAGS
119 .iter()
120 .flat_map(|tag| (1..tag.len()).map(move |i| &tag[..i]))
121 .chain(["\n"]);
122 if end_prefixes.all(|prefix| !self.buffer.ends_with(&prefix)) {
123 edit_events.push(EditParserEvent::NewTextChunk {
124 chunk: mem::take(&mut self.buffer),
125 done: false,
126 });
127 }
128 break;
129 }
130 }
131 }
132 }
133 edit_events
134 }
135
136 fn find_end_tag(&self) -> Option<Range<usize>> {
137 let (tag, start_ix) = END_TAGS
138 .iter()
139 .flat_map(|tag| Some((tag, self.buffer.find(tag)?)))
140 .min_by_key(|(_, ix)| *ix)?;
141 Some(start_ix..start_ix + tag.len())
142 }
143
144 pub fn finish(self) -> EditParserMetrics {
145 self.metrics
146 }
147}
148
149#[cfg(test)]
150mod tests {
151 use super::*;
152 use indoc::indoc;
153 use rand::prelude::*;
154 use std::cmp;
155
156 #[gpui::test(iterations = 1000)]
157 fn test_single_edit(mut rng: StdRng) {
158 let mut parser = EditParser::new();
159 assert_eq!(
160 parse_random_chunks(
161 "<old_text>original</old_text><new_text>updated</new_text>",
162 &mut parser,
163 &mut rng
164 ),
165 vec![Edit {
166 old_text: "original".to_string(),
167 new_text: "updated".to_string(),
168 }]
169 );
170 assert_eq!(
171 parser.finish(),
172 EditParserMetrics {
173 tags: 2,
174 mismatched_tags: 0
175 }
176 );
177 }
178
179 #[gpui::test(iterations = 1000)]
180 fn test_multiple_edits(mut rng: StdRng) {
181 let mut parser = EditParser::new();
182 assert_eq!(
183 parse_random_chunks(
184 indoc! {"
185 <old_text>
186 first old
187 </old_text><new_text>first new</new_text>
188 <old_text>second old</old_text><new_text>
189 second new
190 </new_text>
191 "},
192 &mut parser,
193 &mut rng
194 ),
195 vec![
196 Edit {
197 old_text: "first old".to_string(),
198 new_text: "first new".to_string(),
199 },
200 Edit {
201 old_text: "second old".to_string(),
202 new_text: "second new".to_string(),
203 },
204 ]
205 );
206 assert_eq!(
207 parser.finish(),
208 EditParserMetrics {
209 tags: 4,
210 mismatched_tags: 0
211 }
212 );
213 }
214
215 #[gpui::test(iterations = 1000)]
216 fn test_edits_with_extra_text(mut rng: StdRng) {
217 let mut parser = EditParser::new();
218 assert_eq!(
219 parse_random_chunks(
220 indoc! {"
221 ignore this <old_text>
222 content</old_text>extra stuff<new_text>updated content</new_text>trailing data
223 more text <old_text>second item
224 </old_text>middle text<new_text>modified second item</new_text>end
225 <old_text>third case</old_text><new_text>improved third case</new_text> with trailing text
226 "},
227 &mut parser,
228 &mut rng
229 ),
230 vec![
231 Edit {
232 old_text: "content".to_string(),
233 new_text: "updated content".to_string(),
234 },
235 Edit {
236 old_text: "second item".to_string(),
237 new_text: "modified second item".to_string(),
238 },
239 Edit {
240 old_text: "third case".to_string(),
241 new_text: "improved third case".to_string(),
242 },
243 ]
244 );
245 assert_eq!(
246 parser.finish(),
247 EditParserMetrics {
248 tags: 6,
249 mismatched_tags: 0
250 }
251 );
252 }
253
254 #[gpui::test(iterations = 1000)]
255 fn test_nested_tags(mut rng: StdRng) {
256 let mut parser = EditParser::new();
257 assert_eq!(
258 parse_random_chunks(
259 "<old_text>code with <tag>nested</tag> elements</old_text><new_text>new <code>content</code></new_text>",
260 &mut parser,
261 &mut rng
262 ),
263 vec![Edit {
264 old_text: "code with <tag>nested</tag> elements".to_string(),
265 new_text: "new <code>content</code>".to_string(),
266 }]
267 );
268 assert_eq!(
269 parser.finish(),
270 EditParserMetrics {
271 tags: 2,
272 mismatched_tags: 0
273 }
274 );
275 }
276
277 #[gpui::test(iterations = 1000)]
278 fn test_empty_old_and_new_text(mut rng: StdRng) {
279 let mut parser = EditParser::new();
280 assert_eq!(
281 parse_random_chunks(
282 "<old_text></old_text><new_text></new_text>",
283 &mut parser,
284 &mut rng
285 ),
286 vec![Edit {
287 old_text: "".to_string(),
288 new_text: "".to_string(),
289 }]
290 );
291 assert_eq!(
292 parser.finish(),
293 EditParserMetrics {
294 tags: 2,
295 mismatched_tags: 0
296 }
297 );
298 }
299
300 #[gpui::test(iterations = 100)]
301 fn test_multiline_content(mut rng: StdRng) {
302 let mut parser = EditParser::new();
303 assert_eq!(
304 parse_random_chunks(
305 "<old_text>line1\nline2\nline3</old_text><new_text>line1\nmodified line2\nline3</new_text>",
306 &mut parser,
307 &mut rng
308 ),
309 vec![Edit {
310 old_text: "line1\nline2\nline3".to_string(),
311 new_text: "line1\nmodified line2\nline3".to_string(),
312 }]
313 );
314 assert_eq!(
315 parser.finish(),
316 EditParserMetrics {
317 tags: 2,
318 mismatched_tags: 0
319 }
320 );
321 }
322
323 #[gpui::test(iterations = 1000)]
324 fn test_mismatched_tags(mut rng: StdRng) {
325 let mut parser = EditParser::new();
326 assert_eq!(
327 parse_random_chunks(
328 // Reduced from an actual Sonnet 3.7 output
329 indoc! {"
330 <old_text>
331 a
332 b
333 c
334 </new_text>
335 <new_text>
336 a
337 B
338 c
339 </old_text>
340 <old_text>
341 d
342 e
343 f
344 </new_text>
345 <new_text>
346 D
347 e
348 F
349 </old_text>
350 "},
351 &mut parser,
352 &mut rng
353 ),
354 vec![
355 Edit {
356 old_text: "a\nb\nc".to_string(),
357 new_text: "a\nB\nc".to_string(),
358 },
359 Edit {
360 old_text: "d\ne\nf".to_string(),
361 new_text: "D\ne\nF".to_string(),
362 }
363 ]
364 );
365 assert_eq!(
366 parser.finish(),
367 EditParserMetrics {
368 tags: 4,
369 mismatched_tags: 4
370 }
371 );
372
373 let mut parser = EditParser::new();
374 assert_eq!(
375 parse_random_chunks(
376 // Reduced from an actual Opus 4 output
377 indoc! {"
378 <edits>
379 <old_text>
380 Lorem
381 </old_text>
382 <new_text>
383 LOREM
384 </edits>
385 "},
386 &mut parser,
387 &mut rng
388 ),
389 vec![Edit {
390 old_text: "Lorem".to_string(),
391 new_text: "LOREM".to_string(),
392 },]
393 );
394 assert_eq!(
395 parser.finish(),
396 EditParserMetrics {
397 tags: 2,
398 mismatched_tags: 1
399 }
400 );
401 }
402
403 #[derive(Default, Debug, PartialEq, Eq)]
404 struct Edit {
405 old_text: String,
406 new_text: String,
407 }
408
409 fn parse_random_chunks(input: &str, parser: &mut EditParser, rng: &mut StdRng) -> Vec<Edit> {
410 let chunk_count = rng.gen_range(1..=cmp::min(input.len(), 50));
411 let mut chunk_indices = (0..input.len()).choose_multiple(rng, chunk_count);
412 chunk_indices.sort();
413 chunk_indices.push(input.len());
414
415 let mut pending_edit = Edit::default();
416 let mut edits = Vec::new();
417 let mut last_ix = 0;
418 for chunk_ix in chunk_indices {
419 for event in parser.push(&input[last_ix..chunk_ix]) {
420 match event {
421 EditParserEvent::OldText(old_text) => {
422 pending_edit.old_text = old_text;
423 }
424 EditParserEvent::NewTextChunk { chunk, done } => {
425 pending_edit.new_text.push_str(&chunk);
426 if done {
427 edits.push(pending_edit);
428 pending_edit = Edit::default();
429 }
430 }
431 }
432 }
433 last_ix = chunk_ix;
434 }
435
436 assert_eq!(pending_edit, Edit::default(), "unfinished edit");
437
438 edits
439 }
440}