multi_region.rs

  1use anyhow::{Context as _, Result, anyhow};
  2
/// Text that opens every marker tag; the marker number follows it directly.
pub const MARKER_TAG_PREFIX: &str = "<|marker_";
/// Text that closes every marker tag.
pub const MARKER_TAG_SUFFIX: &str = "|>";
/// Line count (including the line currently being examined) that must have
/// accumulated since the last marker before `compute_marker_offsets` will
/// consider placing another one.
const MIN_BLOCK_LINES: usize = 3;
/// Line count since the last marker at which `compute_marker_offsets` places
/// a marker at the end of the current line even without a blank-line boundary.
const MAX_BLOCK_LINES: usize = 8;
  7
  8pub fn marker_tag(number: usize) -> String {
  9    format!("{MARKER_TAG_PREFIX}{number}{MARKER_TAG_SUFFIX}")
 10}
 11
 12/// Compute byte offsets within `editable_text` where marker boundaries should
 13/// be placed.
 14///
 15/// Returns a sorted `Vec<usize>` that always starts with `0` and ends with
 16/// `editable_text.len()`. Interior offsets are placed at line boundaries
 17/// (right after a `\n`), preferring blank-line boundaries when available and
 18/// respecting `MIN_BLOCK_LINES` / `MAX_BLOCK_LINES` constraints.
 19pub fn compute_marker_offsets(editable_text: &str) -> Vec<usize> {
 20    if editable_text.is_empty() {
 21        return vec![0, 0];
 22    }
 23
 24    let mut offsets = vec![0usize];
 25    let mut lines_since_last_marker = 0usize;
 26    let mut byte_offset = 0usize;
 27
 28    for line in editable_text.split('\n') {
 29        let line_end = byte_offset + line.len() + 1;
 30        let is_past_end = line_end > editable_text.len();
 31        let actual_line_end = line_end.min(editable_text.len());
 32        lines_since_last_marker += 1;
 33
 34        let is_blank = line.trim().is_empty();
 35
 36        if !is_past_end && lines_since_last_marker >= MIN_BLOCK_LINES {
 37            if is_blank {
 38                // Blank-line boundary found. We'll place the marker when we
 39                // find the next non-blank line (handled below).
 40            } else if lines_since_last_marker >= MAX_BLOCK_LINES {
 41                offsets.push(actual_line_end);
 42                lines_since_last_marker = 0;
 43            }
 44        }
 45
 46        // Non-blank line immediately following blank line(s): split here so
 47        // the new block starts with this line.
 48        if !is_blank && byte_offset > 0 && lines_since_last_marker >= MIN_BLOCK_LINES {
 49            let before = &editable_text[..byte_offset];
 50            let has_preceding_blank_line = before
 51                .strip_suffix('\n')
 52                .map(|stripped| {
 53                    let last_line = match stripped.rfind('\n') {
 54                        Some(pos) => &stripped[pos + 1..],
 55                        None => stripped,
 56                    };
 57                    last_line.trim().is_empty()
 58                })
 59                .unwrap_or(false);
 60
 61            if has_preceding_blank_line {
 62                offsets.push(byte_offset);
 63                lines_since_last_marker = 1;
 64            }
 65        }
 66
 67        byte_offset = actual_line_end;
 68
 69        // Re-check after blank-line logic since lines_since_last_marker may
 70        // have been reset.
 71        if !is_past_end && lines_since_last_marker >= MAX_BLOCK_LINES {
 72            if *offsets.last().unwrap_or(&0) != actual_line_end {
 73                offsets.push(actual_line_end);
 74                lines_since_last_marker = 0;
 75            }
 76        }
 77    }
 78
 79    let end = editable_text.len();
 80    if *offsets.last().unwrap_or(&0) != end {
 81        offsets.push(end);
 82    }
 83
 84    offsets
 85}
 86
 87/// Write the editable region content with marker tags, inserting the cursor
 88/// marker at the given offset within the editable text.
 89pub fn write_editable_with_markers(
 90    output: &mut String,
 91    editable_text: &str,
 92    cursor_offset_in_editable: usize,
 93    cursor_marker: &str,
 94) {
 95    let marker_offsets = compute_marker_offsets(editable_text);
 96    let mut cursor_placed = false;
 97    for (i, &offset) in marker_offsets.iter().enumerate() {
 98        let marker_num = i + 1;
 99        if !output.is_empty() && !output.ends_with('\n') {
100            output.push('\n');
101        }
102        output.push_str(&marker_tag(marker_num));
103
104        if let Some(&next_offset) = marker_offsets.get(i + 1) {
105            output.push('\n');
106            let block = &editable_text[offset..next_offset];
107            if !cursor_placed
108                && cursor_offset_in_editable >= offset
109                && cursor_offset_in_editable <= next_offset
110            {
111                cursor_placed = true;
112                let cursor_in_block = cursor_offset_in_editable - offset;
113                output.push_str(&block[..cursor_in_block]);
114                output.push_str(cursor_marker);
115                output.push_str(&block[cursor_in_block..]);
116            } else {
117                output.push_str(block);
118            }
119        }
120    }
121}
122
123/// Check if the output represents a "no edits" signal for V0316:
124/// the same marker tag appears twice in succession with no meaningful
125/// content between them (e.g. `<|marker_N|>\n<|marker_N|>`).
126pub fn is_repeated_final_marker(output: &str) -> bool {
127    let trimmed = output.trim();
128    let Some(prefix_end) = trimmed.find(MARKER_TAG_SUFFIX) else {
129        return false;
130    };
131    let first_tag_end = prefix_end + MARKER_TAG_SUFFIX.len();
132    let first_tag = &trimmed[..first_tag_end];
133
134    if !first_tag.starts_with(MARKER_TAG_PREFIX) {
135        return false;
136    }
137
138    let rest = &trimmed[first_tag_end..];
139    let rest = rest.strip_prefix('\n').unwrap_or(rest);
140    rest.trim() == first_tag
141}
142
143/// Strip any `<|marker_N|>` tags from `text`.
144///
145/// When a marker tag sits on its own line (followed by `\n`), the trailing
146/// newline is also removed so the surrounding lines stay joined naturally.
147fn strip_marker_tags(text: &str) -> String {
148    let mut result = String::with_capacity(text.len());
149    let mut pos = 0;
150    let bytes = text.as_bytes();
151    while let Some(rel) = text[pos..].find(MARKER_TAG_PREFIX) {
152        result.push_str(&text[pos..pos + rel]);
153        let num_start = pos + rel + MARKER_TAG_PREFIX.len();
154        if let Some(suffix_rel) = text[num_start..].find(MARKER_TAG_SUFFIX) {
155            let mut tag_end = num_start + suffix_rel + MARKER_TAG_SUFFIX.len();
156            if bytes.get(tag_end) == Some(&b'\n') {
157                tag_end += 1;
158            }
159            pos = tag_end;
160        } else {
161            result.push_str(MARKER_TAG_PREFIX);
162            pos = num_start;
163        }
164    }
165    result.push_str(&text[pos..]);
166    result
167}
168
169/// Parse model output that uses the marker format.
170///
171/// Returns `(start_marker_num, end_marker_num, content_between_markers)`.
172/// The leading format-level newline after the start marker is stripped.
173/// Trailing newlines are preserved so blank-line endings in the editable
174/// region are not lost.
175///
176/// Any extra intermediate marker tags that the model may have inserted
177/// between the first and last markers are stripped from the returned content.
178pub fn extract_marker_span(text: &str) -> Result<(usize, usize, String)> {
179    let first_tag_start = text
180        .find(MARKER_TAG_PREFIX)
181        .context("no start marker found in output")?;
182    let first_num_start = first_tag_start + MARKER_TAG_PREFIX.len();
183    let first_num_end = text[first_num_start..]
184        .find(MARKER_TAG_SUFFIX)
185        .map(|i| i + first_num_start)
186        .context("malformed start marker tag")?;
187    let start_num: usize = text[first_num_start..first_num_end]
188        .parse()
189        .context("start marker number is not a valid integer")?;
190    let first_tag_end = first_num_end + MARKER_TAG_SUFFIX.len();
191
192    let last_tag_start = text
193        .rfind(MARKER_TAG_PREFIX)
194        .context("no end marker found in output")?;
195    let last_num_start = last_tag_start + MARKER_TAG_PREFIX.len();
196    let last_num_end = text[last_num_start..]
197        .find(MARKER_TAG_SUFFIX)
198        .map(|i| i + last_num_start)
199        .context("malformed end marker tag")?;
200    let end_num: usize = text[last_num_start..last_num_end]
201        .parse()
202        .context("end marker number is not a valid integer")?;
203
204    if start_num == end_num {
205        return Err(anyhow!(
206            "start and end markers are the same (marker {})",
207            start_num
208        ));
209    }
210
211    let mut content_start = first_tag_end;
212    if text.as_bytes().get(content_start) == Some(&b'\n') {
213        content_start += 1;
214    }
215    let content_end = last_tag_start;
216
217    let content = &text[content_start..content_end.max(content_start)];
218    let content = strip_marker_tags(content);
219    Ok((start_num, end_num, content))
220}
221
222/// Given old editable text and model output with marker span, reconstruct the
223/// full new editable region.
224pub fn apply_marker_span(old_editable: &str, output: &str) -> Result<String> {
225    let (start_num, end_num, raw_new_span) = extract_marker_span(output)?;
226    let marker_offsets = compute_marker_offsets(old_editable);
227
228    let start_idx = start_num
229        .checked_sub(1)
230        .context("marker numbers are 1-indexed")?;
231    let end_idx = end_num
232        .checked_sub(1)
233        .context("marker numbers are 1-indexed")?;
234    let start_byte = *marker_offsets
235        .get(start_idx)
236        .context("start marker number out of range")?;
237    let end_byte = *marker_offsets
238        .get(end_idx)
239        .context("end marker number out of range")?;
240
241    if start_byte > end_byte {
242        return Err(anyhow!("start marker must come before end marker"));
243    }
244
245    let old_span = &old_editable[start_byte..end_byte];
246    let mut new_span = raw_new_span;
247    if old_span.ends_with('\n') && !new_span.ends_with('\n') && !new_span.is_empty() {
248        new_span.push('\n');
249    }
250    if !old_span.ends_with('\n') && new_span.ends_with('\n') {
251        new_span.pop();
252    }
253
254    let mut result = String::new();
255    result.push_str(&old_editable[..start_byte]);
256    result.push_str(&new_span);
257    result.push_str(&old_editable[end_byte..]);
258
259    Ok(result)
260}
261
262/// Compare old and new editable text, find the minimal marker span that covers
263/// all changes, and encode the result with marker tags.
264pub fn encode_from_old_and_new(
265    old_editable: &str,
266    new_editable: &str,
267    cursor_offset_in_new: Option<usize>,
268    cursor_marker: &str,
269    end_marker: &str,
270    no_edits_marker: &str,
271) -> Result<String> {
272    if old_editable == new_editable {
273        return Ok(format!("{no_edits_marker}{end_marker}"));
274    }
275
276    let marker_offsets = compute_marker_offsets(old_editable);
277
278    let common_prefix = old_editable
279        .bytes()
280        .zip(new_editable.bytes())
281        .take_while(|(a, b)| a == b)
282        .count();
283
284    let old_remaining = old_editable.len() - common_prefix;
285    let new_remaining = new_editable.len() - common_prefix;
286    let max_suffix = old_remaining.min(new_remaining);
287    let common_suffix = old_editable.as_bytes()[old_editable.len() - max_suffix..]
288        .iter()
289        .rev()
290        .zip(
291            new_editable.as_bytes()[new_editable.len() - max_suffix..]
292                .iter()
293                .rev(),
294        )
295        .take_while(|(a, b)| a == b)
296        .count();
297
298    let change_end_in_old = old_editable.len() - common_suffix;
299
300    let start_marker_idx = marker_offsets
301        .iter()
302        .rposition(|&offset| offset <= common_prefix)
303        .unwrap_or(0);
304    let end_marker_idx = marker_offsets
305        .iter()
306        .position(|&offset| offset >= change_end_in_old)
307        .unwrap_or(marker_offsets.len() - 1);
308
309    let old_start = marker_offsets[start_marker_idx];
310    let old_end = marker_offsets[end_marker_idx];
311
312    let new_start = old_start;
313    let new_end = new_editable
314        .len()
315        .saturating_sub(old_editable.len().saturating_sub(old_end));
316
317    let new_span = &new_editable[new_start..new_end];
318
319    let start_marker_num = start_marker_idx + 1;
320    let end_marker_num = end_marker_idx + 1;
321
322    let mut result = String::new();
323    result.push_str(&marker_tag(start_marker_num));
324    result.push('\n');
325
326    if let Some(cursor_offset) = cursor_offset_in_new {
327        if cursor_offset >= new_start && cursor_offset <= new_end {
328            let cursor_in_span = cursor_offset - new_start;
329            let bounded = cursor_in_span.min(new_span.len());
330            result.push_str(&new_span[..bounded]);
331            result.push_str(cursor_marker);
332            result.push_str(&new_span[bounded..]);
333        } else {
334            result.push_str(new_span);
335        }
336    } else {
337        result.push_str(new_span);
338    }
339
340    if !result.ends_with('\n') {
341        result.push('\n');
342    }
343    result.push_str(&marker_tag(end_marker_num));
344    result.push('\n');
345    result.push_str(end_marker);
346
347    Ok(result)
348}
349
350/// Extract the full editable region from text that uses marker tags.
351///
352/// Returns the concatenation of all block contents between the first and last
353/// markers, with intermediate marker tags stripped.
354pub fn extract_editable_region_from_markers(text: &str) -> Option<String> {
355    let first_marker_start = text.find(MARKER_TAG_PREFIX)?;
356
357    let mut markers: Vec<(usize, usize)> = Vec::new();
358    let mut search_start = first_marker_start;
359    while let Some(rel_pos) = text[search_start..].find(MARKER_TAG_PREFIX) {
360        let tag_start = search_start + rel_pos;
361        let num_start = tag_start + MARKER_TAG_PREFIX.len();
362        let num_end = text[num_start..].find(MARKER_TAG_SUFFIX)?;
363        let tag_end = num_start + num_end + MARKER_TAG_SUFFIX.len();
364        markers.push((tag_start, tag_end));
365        search_start = tag_end;
366    }
367
368    if markers.len() < 2 {
369        return None;
370    }
371
372    let (_, first_tag_end) = markers[0];
373    let (last_tag_start, _) = markers[markers.len() - 1];
374
375    let mut content_start = first_tag_end;
376    if text.as_bytes().get(content_start) == Some(&b'\n') {
377        content_start += 1;
378    }
379    let mut content_end = last_tag_start;
380    if content_end > content_start && text.as_bytes().get(content_end - 1) == Some(&b'\n') {
381        content_end -= 1;
382    }
383
384    let raw = &text[content_start..content_end];
385    let result = strip_marker_tags(raw);
386    let result = result.strip_suffix('\n').unwrap_or(&result).to_string();
387    Some(result)
388}
389
#[cfg(test)]
mod tests {
    use super::*;

    // Marker strings shared by every encoding test.
    const CURSOR: &str = "<|user_cursor|>";
    const END: &str = ">>>>>>> UPDATED\n";
    const NO_EDITS: &str = "NO_EDITS\n";

    /// Encode with the shared marker strings, panicking on error.
    fn encode(old: &str, new: &str, cursor: Option<usize>) -> String {
        encode_from_old_and_new(old, new, cursor, CURSOR, END, NO_EDITS).unwrap()
    }

    #[test]
    fn test_compute_marker_offsets_small_block() {
        let text = "aaa\nbbb\nccc\n";
        assert_eq!(compute_marker_offsets(text), vec![0, text.len()]);
    }

    #[test]
    fn test_compute_marker_offsets_blank_line_split() {
        let text = "aaa\nbbb\nccc\n\nddd\neee\nfff\n";
        let offsets = compute_marker_offsets(text);
        assert_eq!(offsets[0], 0);
        assert!(offsets.contains(&13), "offsets: {offsets:?}");
        assert_eq!(*offsets.last().unwrap(), text.len());
    }

    #[test]
    fn test_compute_marker_offsets_max_lines_split() {
        let offsets = compute_marker_offsets("1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n");
        assert!(offsets.len() >= 3, "offsets: {offsets:?}");
    }

    #[test]
    fn test_compute_marker_offsets_empty() {
        assert_eq!(compute_marker_offsets(""), vec![0, 0]);
    }

    #[test]
    fn test_extract_marker_span() {
        let parsed = extract_marker_span("<|marker_2|>\n    new content\n<|marker_3|>\n").unwrap();
        let (start, end, content) = parsed;
        assert_eq!(start, 2);
        assert_eq!(end, 3);
        assert_eq!(content, "    new content\n");
    }

    #[test]
    fn test_extract_marker_span_multi_line() {
        let parsed = extract_marker_span("<|marker_1|>\nline1\nline2\nline3\n<|marker_4|>").unwrap();
        let (start, end, content) = parsed;
        assert_eq!(start, 1);
        assert_eq!(end, 4);
        assert_eq!(content, "line1\nline2\nline3\n");
    }

    #[test]
    fn test_apply_marker_span_basic() {
        let applied = apply_marker_span(
            "aaa\nbbb\nccc\n",
            "<|marker_1|>\naaa\nBBB\nccc\n<|marker_2|>",
        )
        .unwrap();
        assert_eq!(applied, "aaa\nBBB\nccc\n");
    }

    #[test]
    fn test_apply_marker_span_preserves_trailing_blank_line() {
        let applied = apply_marker_span(
            "/\nresult\n\n",
            "<|marker_1|>\n//\nresult\n\n<|marker_2|>",
        )
        .unwrap();
        assert_eq!(applied, "//\nresult\n\n");
    }

    #[test]
    fn test_encode_no_edits() {
        let old = "aaa\nbbb\nccc\n";
        assert_eq!(encode(old, old, None), "NO_EDITS\n>>>>>>> UPDATED\n");
    }

    #[test]
    fn test_encode_with_change() {
        let encoded = encode("aaa\nbbb\nccc\n", "aaa\nBBB\nccc\n", None);
        assert!(encoded.contains("<|marker_1|>"));
        assert!(encoded.contains("<|marker_2|>"));
        assert!(encoded.contains("aaa\nBBB\nccc\n"));
        assert!(encoded.ends_with(">>>>>>> UPDATED\n"));
    }

    #[test]
    fn test_roundtrip_encode_apply() {
        let old = "line1\nline2\nline3\n\nline5\nline6\nline7\nline8\nline9\nline10\n";
        let new = "line1\nline2\nline3\n\nline5\nLINE6\nline7\nline8\nline9\nline10\n";
        let encoded = encode(old, new, None);
        let output = encoded.strip_suffix(END).expect("should have end marker");
        assert_eq!(apply_marker_span(old, output).unwrap(), new);
    }

    #[test]
    fn test_extract_editable_region_from_markers_multi() {
        let text = "prefix\n<|marker_1|>\naaa\nbbb\n<|marker_2|>\nccc\nddd\n<|marker_3|>\nsuffix";
        let region = extract_editable_region_from_markers(text).unwrap();
        assert_eq!(region, "aaa\nbbb\nccc\nddd");
    }

    #[test]
    fn test_extract_editable_region_two_markers() {
        let text = "<|marker_1|>\none\ntwo three\n<|marker_2|>";
        let region = extract_editable_region_from_markers(text).unwrap();
        assert_eq!(region, "one\ntwo three");
    }

    #[test]
    fn test_encode_with_cursor() {
        let result = encode("aaa\nbbb\nccc\n", "aaa\nBBB\nccc\n", Some(5));
        assert!(result.contains("<|user_cursor|>"), "result: {result}");
        assert!(result.contains("B<|user_cursor|>BB"), "result: {result}");
    }

    #[test]
    fn test_extract_marker_span_strips_intermediate_markers() {
        let text = "<|marker_2|>\nline1\n<|marker_3|>\nline2\n<|marker_4|>";
        let (start, end, content) = extract_marker_span(text).unwrap();
        assert_eq!(start, 2);
        assert_eq!(end, 4);
        assert_eq!(content, "line1\nline2\n");
    }

    #[test]
    fn test_extract_marker_span_strips_multiple_intermediate_markers() {
        let text = "<|marker_1|>\naaa\n<|marker_2|>\nbbb\n<|marker_3|>\nccc\n<|marker_4|>";
        let (start, end, content) = extract_marker_span(text).unwrap();
        assert_eq!(start, 1);
        assert_eq!(end, 4);
        assert_eq!(content, "aaa\nbbb\nccc\n");
    }

    #[test]
    fn test_apply_marker_span_with_extra_intermediate_marker() {
        let applied = apply_marker_span(
            "aaa\nbbb\nccc\n",
            "<|marker_1|>\naaa\n<|marker_1|>\nBBB\nccc\n<|marker_2|>",
        )
        .unwrap();
        assert_eq!(applied, "aaa\nBBB\nccc\n");
    }

    #[test]
    fn test_is_repeated_final_marker() {
        let repeated = [
            "<|marker_5|>\n<|marker_5|>",
            "<|marker_5|>\n<|marker_5|>\n",
            "  <|marker_3|>\n<|marker_3|>  ",
        ];
        for output in repeated {
            assert!(is_repeated_final_marker(output));
        }

        let not_repeated = [
            "<|marker_2|>\nnew content\n<|marker_3|>",
            "<|marker_2|>\n<|marker_3|>",
            "no markers here",
            "",
        ];
        for output in not_repeated {
            assert!(!is_repeated_final_marker(output));
        }
    }

    #[test]
    fn test_strip_marker_tags_inline() {
        assert_eq!(strip_marker_tags("no markers here"), "no markers here");
        assert_eq!(strip_marker_tags("before<|marker_5|>after"), "beforeafter");
        assert_eq!(
            strip_marker_tags("line1\n<|marker_3|>\nline2"),
            "line1\nline2"
        );
    }
}