1use derive_more::{Add, AddAssign};
2use schemars::JsonSchema;
3use serde::{Deserialize, Serialize};
4use smallvec::SmallVec;
5use std::{cmp, mem, ops::Range};
6
/// Closing tag that is expected at the end of an old-text section.
const OLD_TEXT_END_TAG: &str = "</old_text>";
/// Closing tag that is expected at the end of a new-text section.
const NEW_TEXT_END_TAG: &str = "</new_text>";
/// Length in bytes shared by both end tags.
const END_TAG_LEN: usize = OLD_TEXT_END_TAG.len();
// `END_TAG_LEN` is used for both tags, so they must have the same length. This is evaluated at
// compile time, so a plain `assert!` costs nothing at runtime and, unlike `debug_assert!`, is
// also enforced when building without debug assertions.
const _: () = assert!(
    OLD_TEXT_END_TAG.len() == NEW_TEXT_END_TAG.len(),
    "both end tags must have the same length"
);
11
/// An event produced by [`EditParser::push`] while it consumes streamed edit markup.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum EditParserEvent {
    /// The complete contents of an old-text section, emitted once its end tag has been seen.
    OldText(String),
    /// A piece of the contents of a new-text section.
    ///
    /// `done` is `false` for pieces emitted while the section is still open and `true` for the
    /// final piece, which is emitted when the section's end tag has been seen.
    NewTextChunk { chunk: String, done: bool },
}
17
// Counters describing the end tags an `EditParser` has consumed.
//
// NOTE(review): plain `//` comments are used here on purpose. The `JsonSchema` derive is
// understood to copy `///` doc comments into the generated schema as descriptions, which would
// change that output — switch to `///` only if that is intended.
#[derive(
    Clone, Debug, Default, PartialEq, Eq, Add, AddAssign, Serialize, Deserialize, JsonSchema,
)]
pub struct EditParserMetrics {
    // Number of end tags that closed an old-text or new-text section.
    pub tags: usize,
    // Number of those end tags that were not the one expected for the section they closed
    // (for example `</new_text>` closing an old-text section).
    pub mismatched_tags: usize,
}
25
/// Incremental parser for edit markup made of `<old_text>…</old_text>` and
/// `<new_text>…</new_text>` sections that arrives in arbitrary chunks.
#[derive(Debug)]
pub struct EditParser {
    /// Which part of the markup the parser is currently inside of or waiting for.
    state: EditParserState,
    /// Text that has been pushed into the parser but not consumed yet.
    buffer: String,
    /// Tag counters accumulated so far; handed out by `finish`.
    metrics: EditParserMetrics,
}
32
/// Position of the parser within the edit markup.
#[derive(Debug, PartialEq)]
enum EditParserState {
    /// Waiting for an `<old_text>` start tag.
    Pending,
    /// Inside an old-text section, waiting for its end tag.
    WithinOldText,
    /// The old-text section has been closed; waiting for a `<new_text>` start tag.
    AfterOldText,
    /// Inside a new-text section, waiting for its end tag.
    ///
    /// `start` is `true` until the first text of the section has been looked at, so that a
    /// single newline directly after the start tag can be dropped.
    WithinNewText { start: bool },
}
40
41impl EditParser {
42 pub fn new() -> Self {
43 EditParser {
44 state: EditParserState::Pending,
45 buffer: String::new(),
46 metrics: EditParserMetrics::default(),
47 }
48 }
49
50 pub fn push(&mut self, chunk: &str) -> SmallVec<[EditParserEvent; 1]> {
51 self.buffer.push_str(chunk);
52
53 let mut edit_events = SmallVec::new();
54 loop {
55 match &mut self.state {
56 EditParserState::Pending => {
57 if let Some(start) = self.buffer.find("<old_text>") {
58 self.buffer.drain(..start + "<old_text>".len());
59 self.state = EditParserState::WithinOldText;
60 } else {
61 break;
62 }
63 }
64 EditParserState::WithinOldText => {
65 if let Some(tag_range) = self.find_end_tag() {
66 let mut start = 0;
67 if self.buffer.starts_with('\n') {
68 start = 1;
69 }
70 let mut old_text = self.buffer[start..tag_range.start].to_string();
71 if old_text.ends_with('\n') {
72 old_text.pop();
73 }
74
75 self.metrics.tags += 1;
76 if &self.buffer[tag_range.clone()] != OLD_TEXT_END_TAG {
77 self.metrics.mismatched_tags += 1;
78 }
79
80 self.buffer.drain(..tag_range.end);
81 self.state = EditParserState::AfterOldText;
82 edit_events.push(EditParserEvent::OldText(old_text));
83 } else {
84 break;
85 }
86 }
87 EditParserState::AfterOldText => {
88 if let Some(start) = self.buffer.find("<new_text>") {
89 self.buffer.drain(..start + "<new_text>".len());
90 self.state = EditParserState::WithinNewText { start: true };
91 } else {
92 break;
93 }
94 }
95 EditParserState::WithinNewText { start } => {
96 if !self.buffer.is_empty() {
97 if *start && self.buffer.starts_with('\n') {
98 self.buffer.remove(0);
99 }
100 *start = false;
101 }
102
103 if let Some(tag_range) = self.find_end_tag() {
104 let mut chunk = self.buffer[..tag_range.start].to_string();
105 if chunk.ends_with('\n') {
106 chunk.pop();
107 }
108
109 self.metrics.tags += 1;
110 if &self.buffer[tag_range.clone()] != NEW_TEXT_END_TAG {
111 self.metrics.mismatched_tags += 1;
112 }
113
114 self.buffer.drain(..tag_range.end);
115 self.state = EditParserState::Pending;
116 edit_events.push(EditParserEvent::NewTextChunk { chunk, done: true });
117 } else {
118 let mut end_prefixes = (1..END_TAG_LEN)
119 .flat_map(|i| [&NEW_TEXT_END_TAG[..i], &OLD_TEXT_END_TAG[..i]])
120 .chain(["\n"]);
121 if end_prefixes.all(|prefix| !self.buffer.ends_with(&prefix)) {
122 edit_events.push(EditParserEvent::NewTextChunk {
123 chunk: mem::take(&mut self.buffer),
124 done: false,
125 });
126 }
127 break;
128 }
129 }
130 }
131 }
132 edit_events
133 }
134
135 fn find_end_tag(&self) -> Option<Range<usize>> {
136 let old_text_end_tag_ix = self.buffer.find(OLD_TEXT_END_TAG);
137 let new_text_end_tag_ix = self.buffer.find(NEW_TEXT_END_TAG);
138 let start_ix = if let Some((old_text_ix, new_text_ix)) =
139 old_text_end_tag_ix.zip(new_text_end_tag_ix)
140 {
141 cmp::min(old_text_ix, new_text_ix)
142 } else {
143 old_text_end_tag_ix.or(new_text_end_tag_ix)?
144 };
145 Some(start_ix..start_ix + END_TAG_LEN)
146 }
147
148 pub fn finish(self) -> EditParserMetrics {
149 self.metrics
150 }
151}
152
#[cfg(test)]
mod tests {
    use super::*;
    use indoc::indoc;
    use rand::prelude::*;
    use std::cmp;

    // Every test feeds its input to the parser split at random positions, so each one is run
    // many times with different seeds.

    #[gpui::test(iterations = 1000)]
    fn test_single_edit(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            "<old_text>original</old_text><new_text>updated</new_text>",
            &mut parser,
            &mut rng,
        );
        assert_eq!(edits, vec![edit("original", "updated")]);
        assert_eq!(parser.finish(), metrics(2, 0));
    }

    #[gpui::test(iterations = 1000)]
    fn test_multiple_edits(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            indoc! {"
                <old_text>
                first old
                </old_text><new_text>first new</new_text>
                <old_text>second old</old_text><new_text>
                second new
                </new_text>
            "},
            &mut parser,
            &mut rng,
        );
        assert_eq!(
            edits,
            vec![
                edit("first old", "first new"),
                edit("second old", "second new"),
            ]
        );
        assert_eq!(parser.finish(), metrics(4, 0));
    }

    #[gpui::test(iterations = 1000)]
    fn test_edits_with_extra_text(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            indoc! {"
                ignore this <old_text>
                content</old_text>extra stuff<new_text>updated content</new_text>trailing data
                more text <old_text>second item
                </old_text>middle text<new_text>modified second item</new_text>end
                <old_text>third case</old_text><new_text>improved third case</new_text> with trailing text
            "},
            &mut parser,
            &mut rng,
        );
        assert_eq!(
            edits,
            vec![
                edit("content", "updated content"),
                edit("second item", "modified second item"),
                edit("third case", "improved third case"),
            ]
        );
        assert_eq!(parser.finish(), metrics(6, 0));
    }

    #[gpui::test(iterations = 1000)]
    fn test_nested_tags(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            "<old_text>code with <tag>nested</tag> elements</old_text><new_text>new <code>content</code></new_text>",
            &mut parser,
            &mut rng,
        );
        assert_eq!(
            edits,
            vec![edit(
                "code with <tag>nested</tag> elements",
                "new <code>content</code>"
            )]
        );
        assert_eq!(parser.finish(), metrics(2, 0));
    }

    #[gpui::test(iterations = 1000)]
    fn test_empty_old_and_new_text(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            "<old_text></old_text><new_text></new_text>",
            &mut parser,
            &mut rng,
        );
        assert_eq!(edits, vec![edit("", "")]);
        assert_eq!(parser.finish(), metrics(2, 0));
    }

    #[gpui::test(iterations = 100)]
    fn test_multiline_content(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            "<old_text>line1\nline2\nline3</old_text><new_text>line1\nmodified line2\nline3</new_text>",
            &mut parser,
            &mut rng,
        );
        assert_eq!(
            edits,
            vec![edit("line1\nline2\nline3", "line1\nmodified line2\nline3")]
        );
        assert_eq!(parser.finish(), metrics(2, 0));
    }

    #[gpui::test(iterations = 1000)]
    fn test_mismatched_tags(mut rng: StdRng) {
        let mut parser = EditParser::new();
        let edits = parse_random_chunks(
            // Reduced from an actual Sonnet 3.7 output
            indoc! {"
                <old_text>
                a
                b
                c
                </new_text>
                <new_text>
                a
                B
                c
                </old_text>
                <old_text>
                d
                e
                f
                </new_text>
                <new_text>
                D
                e
                F
                </old_text>
            "},
            &mut parser,
            &mut rng,
        );
        assert_eq!(
            edits,
            vec![edit("a\nb\nc", "a\nB\nc"), edit("d\ne\nf", "D\ne\nF")]
        );
        assert_eq!(parser.finish(), metrics(4, 4));
    }

    /// One old-text/new-text pair reassembled from the parser's events.
    #[derive(Default, Debug, PartialEq, Eq)]
    struct Edit {
        old_text: String,
        new_text: String,
    }

    /// Builds the expected [`Edit`] for an assertion.
    fn edit(old_text: &str, new_text: &str) -> Edit {
        Edit {
            old_text: old_text.to_string(),
            new_text: new_text.to_string(),
        }
    }

    /// Builds the expected [`EditParserMetrics`] for an assertion.
    fn metrics(tags: usize, mismatched_tags: usize) -> EditParserMetrics {
        EditParserMetrics {
            tags,
            mismatched_tags,
        }
    }

    /// Splits `input` at randomly chosen byte offsets, pushes the pieces into `parser` one after
    /// another and returns the edits reassembled from the emitted events.
    ///
    /// An edit is only returned once its final new-text chunk (`done: true`) has been seen.
    fn parse_random_chunks(input: &str, parser: &mut EditParser, rng: &mut StdRng) -> Vec<Edit> {
        let max_boundaries = cmp::min(input.len(), 50);
        let boundary_count = rng.gen_range(1..=max_boundaries);
        let mut boundaries = (0..input.len()).choose_multiple(rng, boundary_count);
        boundaries.sort();
        // The last piece always runs to the end of the input.
        boundaries.push(input.len());

        let mut completed = Vec::new();
        let mut current = Edit::default();
        let mut consumed = 0;
        for boundary in boundaries {
            let piece = &input[consumed..boundary];
            consumed = boundary;
            for event in parser.push(piece) {
                match event {
                    EditParserEvent::OldText(old_text) => current.old_text = old_text,
                    EditParserEvent::NewTextChunk { chunk, done } => {
                        current.new_text.push_str(&chunk);
                        if done {
                            completed.push(std::mem::take(&mut current));
                        }
                    }
                }
            }
        }
        completed
    }
}