@@ -3,10 +3,14 @@ use anyhow::{Context as _, Result, anyhow};
+/// Opening text of an absolute marker tag; `marker_tag` appends the number and
+/// `MARKER_TAG_SUFFIX`.
pub const MARKER_TAG_PREFIX: &str = "<|marker_";
+/// Closing text shared by every marker tag.
pub const MARKER_TAG_SUFFIX: &str = "|>";
+/// Opening text searched for when collecting cursor-relative marker tags.
pub const RELATIVE_MARKER_TAG_PREFIX: &str = "<|marker";
-const MIN_BLOCK_LINES: usize = 3;
-const MAX_BLOCK_LINES: usize = 8;
+/// Minimum block size, in lines, used by `compute_marker_offsets` (V0316/V0317).
+const V0316_MIN_BLOCK_LINES: usize = 3;
+/// Line count without a split after which `compute_marker_offsets` forces one.
+const V0316_MAX_BLOCK_LINES: usize = 8;
+/// Minimum block size, in lines, used by `compute_marker_offsets_v0318`.
+const V0318_MIN_BLOCK_LINES: usize = 6;
+/// Line count without a split after which `compute_marker_offsets_v0318` forces one.
+const V0318_MAX_BLOCK_LINES: usize = 16;
+/// Upper bound on how many lines `skip_to_good_start` inspects when looking
+/// for a better line to start a block on.
+const MAX_NUDGE_LINES: usize = 5;
+/// End-of-output token for the V0316 format. The word separators are
+/// U+2581 (LOWER ONE EIGHTH BLOCK), not ASCII underscores or spaces.
pub const V0316_END_MARKER: &str = "<[end▁of▁sentence]>";
+/// End-of-output token for the V0317 format (same text as V0316).
pub const V0317_END_MARKER: &str = "<[end▁of▁sentence]>";
+/// End-of-output token for the V0318 format (same text as V0316).
+pub const V0318_END_MARKER: &str = "<[end▁of▁sentence]>";
pub fn marker_tag(number: usize) -> String {
format!("{MARKER_TAG_PREFIX}{number}{MARKER_TAG_SUFFIX}")
@@ -22,71 +26,104 @@ pub fn marker_tag_relative(delta: isize) -> String {
}
}
+/// Per-line facts produced by `collect_line_info` and consulted when choosing
+/// where a marker boundary may be placed.
+struct LineInfo {
+ /// Byte offset of the line's first byte within the scanned text.
+ start: usize,
+ /// True when the line is empty after trimming whitespace.
+ is_blank: bool,
+ /// True when the line is neither blank nor a structural tail (see
+ /// `is_structural_tail`), i.e. an acceptable first line for a block.
+ is_good_start: bool,
+}
+
+/// Describe every `\n`-separated line of `text`.
+///
+/// Each entry records the byte offset at which the line begins, whether the
+/// line is blank once trimmed, and whether it is a good place to start a block
+/// (non-blank and not a structural tail). A text that ends with `\n` does not
+/// produce an extra trailing entry for the empty remainder.
+fn collect_line_info(text: &str) -> Vec<LineInfo> {
+    // Start offset of the line currently being visited; each line occupies
+    // its own bytes plus the one-byte `\n` separator.
+    let mut next_start = 0usize;
+    let mut infos: Vec<LineInfo> = text
+        .split('\n')
+        .map(|raw| {
+            let content = raw.trim();
+            let start = next_start;
+            next_start += raw.len() + 1;
+            let is_blank = content.is_empty();
+            LineInfo {
+                start,
+                is_blank,
+                is_good_start: !is_blank && !is_structural_tail(content),
+            }
+        })
+        .collect();
+
+    // split('\n') on "abc\n" yields ["abc", ""] — drop the phantom trailing
+    // empty element when the text ends with '\n'.
+    if infos.len() > 1 && text.ends_with('\n') {
+        infos.truncate(infos.len() - 1);
+    }
+    infos
+}
+
+/// Report whether an already-trimmed line merely closes or exits a construct.
+///
+/// That is the case when the line begins with a closing `}`, `]` or `)`, or
+/// when it consists solely of one of the keywords `break`, `continue`,
+/// `return`, `throw` or `end`, optionally followed by semicolons.
+fn is_structural_tail(trimmed_line: &str) -> bool {
+    const CLOSERS: [char; 3] = ['}', ']', ')'];
+    const BARE_KEYWORDS: [&str; 5] = ["break", "continue", "return", "throw", "end"];
+
+    match trimmed_line.chars().next() {
+        Some(first) if CLOSERS.contains(&first) => true,
+        // Trailing semicolons are ignored so `return;` counts like `return`.
+        _ => BARE_KEYWORDS.contains(&trimmed_line.trim_end_matches(';')),
+    }
+}
+
+/// Look at line `from` and the lines after it, at most `MAX_NUDGE_LINES` in
+/// total, and return the index of the first one flagged `is_good_start`.
+/// Yields `None` when none of the inspected lines qualifies, including when
+/// `from` lies past the end of `lines`.
+fn skip_to_good_start(lines: &[LineInfo], from: usize) -> Option<usize> {
+    lines
+        .iter()
+        .enumerate()
+        .skip(from)
+        .take(MAX_NUDGE_LINES)
+        .find_map(|(idx, info)| info.is_good_start.then_some(idx))
+}
+
/// Compute byte offsets within `editable_text` where marker boundaries should
/// be placed.
///
/// Returns a sorted `Vec<usize>` that always starts with `0` and ends with
/// `editable_text.len()`. Interior offsets are placed at line boundaries
/// (right after a `\n`), preferring blank-line boundaries when available and
-/// respecting `MIN_BLOCK_LINES` / `MAX_BLOCK_LINES` constraints.
-pub fn compute_marker_offsets(editable_text: &str) -> Vec<usize> {
+/// respecting `min_block_lines` / `max_block_lines` constraints.
+fn compute_marker_offsets_with_limits(
+ editable_text: &str,
+ min_block_lines: usize,
+ max_block_lines: usize,
+) -> Vec<usize> {
if editable_text.is_empty() {
return vec![0, 0];
}
+ let lines = collect_line_info(editable_text);
let mut offsets = vec![0usize];
- let mut lines_since_last_marker = 0usize;
- let mut byte_offset = 0usize;
-
- for line in editable_text.split('\n') {
- let line_end = byte_offset + line.len() + 1;
- let is_past_end = line_end > editable_text.len();
- let actual_line_end = line_end.min(editable_text.len());
- lines_since_last_marker += 1;
-
- let is_blank = line.trim().is_empty();
-
- if !is_past_end && lines_since_last_marker >= MIN_BLOCK_LINES {
- if is_blank {
- // Blank-line boundary found. We'll place the marker when we
- // find the next non-blank line (handled below).
- } else if lines_since_last_marker >= MAX_BLOCK_LINES {
- offsets.push(actual_line_end);
- lines_since_last_marker = 0;
- }
- }
+ let mut last_boundary_line = 0;
+ let mut i = 0;
+
+ while i < lines.len() {
+ let gap = i - last_boundary_line;
- // Non-blank line immediately following blank line(s): split here so
- // the new block starts with this line.
- if !is_blank && byte_offset > 0 && lines_since_last_marker >= MIN_BLOCK_LINES {
- let before = &editable_text[..byte_offset];
- let has_preceding_blank_line = before
- .strip_suffix('\n')
- .map(|stripped| {
- let last_line = match stripped.rfind('\n') {
- Some(pos) => &stripped[pos + 1..],
- None => stripped,
- };
- last_line.trim().is_empty()
- })
- .unwrap_or(false);
-
- if has_preceding_blank_line {
- offsets.push(byte_offset);
- lines_since_last_marker = 1;
+ // Blank-line split: non-blank line following blank line(s) with enough
+ // accumulated lines.
+ if gap >= min_block_lines && !lines[i].is_blank && i > 0 && lines[i - 1].is_blank {
+ let target = if lines[i].is_good_start {
+ i
+ } else {
+ skip_to_good_start(&lines, i).unwrap_or(i)
+ };
+ if lines.len() - target >= min_block_lines
+ && lines[target].start > *offsets.last().unwrap_or(&0)
+ {
+ offsets.push(lines[target].start);
+ last_boundary_line = target;
+ i = target + 1;
+ continue;
}
}
- byte_offset = actual_line_end;
-
- // Re-check after blank-line logic since lines_since_last_marker may
- // have been reset.
- if !is_past_end && lines_since_last_marker >= MAX_BLOCK_LINES {
- if *offsets.last().unwrap_or(&0) != actual_line_end {
- offsets.push(actual_line_end);
- lines_since_last_marker = 0;
+ // Hard cap: too many lines without a split.
+ if gap >= max_block_lines {
+ let target = skip_to_good_start(&lines, i).unwrap_or(i);
+ if lines[target].start > *offsets.last().unwrap_or(&0) {
+ offsets.push(lines[target].start);
+ last_boundary_line = target;
+ i = target + 1;
+ continue;
}
}
+
+ i += 1;
}
let end = editable_text.len();
@@ -97,6 +134,15 @@ pub fn compute_marker_offsets(editable_text: &str) -> Vec<usize> {
offsets
}
+/// Marker boundary offsets for `editable_text` under the V0316/V0317 sizing
+/// rules (`V0316_MIN_BLOCK_LINES` / `V0316_MAX_BLOCK_LINES`).
+pub fn compute_marker_offsets(editable_text: &str) -> Vec<usize> {
+    let (min_lines, max_lines) = (V0316_MIN_BLOCK_LINES, V0316_MAX_BLOCK_LINES);
+    compute_marker_offsets_with_limits(editable_text, min_lines, max_lines)
+}
+
+/// Marker boundary offsets for `editable_text` under the V0318 sizing rules
+/// (`V0318_MIN_BLOCK_LINES` / `V0318_MAX_BLOCK_LINES`).
+pub fn compute_marker_offsets_v0318(editable_text: &str) -> Vec<usize> {
+    let (min_lines, max_lines) = (V0318_MIN_BLOCK_LINES, V0318_MAX_BLOCK_LINES);
+    compute_marker_offsets_with_limits(editable_text, min_lines, max_lines)
+}
+
/// Write the editable region content with marker tags, inserting the cursor
/// marker at the given offset within the editable text.
pub fn write_editable_with_markers(
@@ -267,27 +313,8 @@ pub fn encode_from_old_and_new(
}
let marker_offsets = compute_marker_offsets(old_editable);
-
- let common_prefix = old_editable
- .bytes()
- .zip(new_editable.bytes())
- .take_while(|(a, b)| a == b)
- .count();
-
- let old_remaining = old_editable.len() - common_prefix;
- let new_remaining = new_editable.len() - common_prefix;
- let max_suffix = old_remaining.min(new_remaining);
- let common_suffix = old_editable.as_bytes()[old_editable.len() - max_suffix..]
- .iter()
- .rev()
- .zip(
- new_editable.as_bytes()[new_editable.len() - max_suffix..]
- .iter()
- .rev(),
- )
- .take_while(|(a, b)| a == b)
- .count();
-
+ let (common_prefix, common_suffix) =
+ common_prefix_suffix(old_editable.as_bytes(), new_editable.as_bytes());
let change_end_in_old = old_editable.len() - common_suffix;
let start_marker_idx = marker_offsets
@@ -380,55 +407,24 @@ pub fn extract_editable_region_from_markers(text: &str) -> Option<String> {
Some(result)
}
-struct MarkerTag {
- number: usize,
- tag_start: usize,
- tag_end: usize,
-}
-
-struct RelativeMarkerTag {
- delta: isize,
+/// One marker tag located by `collect_tags`, for either the absolute or the
+/// cursor-relative tag syntax.
+struct ParsedTag {
+ /// Payload value as returned by the `parse` callback given to `collect_tags`.
+ value: isize,
+ /// Byte index in the scanned text where the tag's prefix begins.
tag_start: usize,
+ /// Byte index just past the tag's closing `MARKER_TAG_SUFFIX`.
tag_end: usize,
}
-fn collect_marker_tags(text: &str) -> Vec<MarkerTag> {
- let mut markers = Vec::new();
- let mut search_from = 0;
- while let Some(rel_pos) = text[search_from..].find(MARKER_TAG_PREFIX) {
- let tag_start = search_from + rel_pos;
- let num_start = tag_start + MARKER_TAG_PREFIX.len();
- if let Some(suffix_rel) = text[num_start..].find(MARKER_TAG_SUFFIX) {
- let num_end = num_start + suffix_rel;
- if let Ok(number) = text[num_start..num_end].parse::<usize>() {
- let tag_end = num_end + MARKER_TAG_SUFFIX.len();
- markers.push(MarkerTag {
- number,
- tag_start,
- tag_end,
- });
- search_from = tag_end;
- continue;
- }
- }
- search_from = tag_start + MARKER_TAG_PREFIX.len();
- }
- markers
-}
-
-fn collect_relative_marker_tags(text: &str) -> Vec<RelativeMarkerTag> {
- let mut markers = Vec::new();
+fn collect_tags(text: &str, prefix: &str, parse: fn(&str) -> Option<isize>) -> Vec<ParsedTag> {
+ let mut tags = Vec::new();
let mut search_from = 0;
- while let Some(rel_pos) = text[search_from..].find(RELATIVE_MARKER_TAG_PREFIX) {
+ while let Some(rel_pos) = text[search_from..].find(prefix) {
let tag_start = search_from + rel_pos;
- let payload_start = tag_start + RELATIVE_MARKER_TAG_PREFIX.len();
+ let payload_start = tag_start + prefix.len();
if let Some(suffix_rel) = text[payload_start..].find(MARKER_TAG_SUFFIX) {
let payload_end = payload_start + suffix_rel;
- let payload = &text[payload_start..payload_end];
- if let Ok(delta) = payload.parse::<isize>() {
+ if let Some(value) = parse(&text[payload_start..payload_end]) {
let tag_end = payload_end + MARKER_TAG_SUFFIX.len();
- markers.push(RelativeMarkerTag {
- delta,
+ tags.push(ParsedTag {
+ value,
tag_start,
tag_end,
});
@@ -436,9 +432,21 @@ fn collect_relative_marker_tags(text: &str) -> Vec<RelativeMarkerTag> {
continue;
}
}
- search_from = tag_start + RELATIVE_MARKER_TAG_PREFIX.len();
+ search_from = tag_start + prefix.len();
}
- markers
+ tags
+}
+
+/// Collect every absolute marker tag (`MARKER_TAG_PREFIX` + unsigned number +
+/// suffix) found in `text`.
+fn collect_marker_tags(text: &str) -> Vec<ParsedTag> {
+    /// Accept only payloads that parse as `usize`.
+    fn parse_absolute(payload: &str) -> Option<isize> {
+        let number: usize = payload.parse().ok()?;
+        // Plain `as` cast: a number above `isize::MAX` wraps to a negative value.
+        Some(number as isize)
+    }
+
+    collect_tags(text, MARKER_TAG_PREFIX, parse_absolute)
+}
+
+/// Collect every cursor-relative marker tag (`RELATIVE_MARKER_TAG_PREFIX` +
+/// signed number + suffix) found in `text`.
+fn collect_relative_marker_tags(text: &str) -> Vec<ParsedTag> {
+    /// Accept only payloads that parse as `isize`.
+    fn parse_relative(payload: &str) -> Option<isize> {
+        payload.parse::<isize>().ok()
+    }
+
+    collect_tags(text, RELATIVE_MARKER_TAG_PREFIX, parse_relative)
}
pub fn nearest_marker_number(cursor_offset: Option<usize>, marker_offsets: &[usize]) -> usize {
@@ -459,21 +467,87 @@ fn cursor_block_index(cursor_offset: Option<usize>, marker_offsets: &[usize]) ->
.unwrap_or_else(|| marker_offsets.len().saturating_sub(2))
}
-/// Write the editable region content with V0317 byte-exact marker tags, where
-/// marker numbers are relative to the cursor block.
-pub fn write_editable_with_markers_v0317(
+/// Return `(prefix, suffix)`: how many leading bytes `a` and `b` share, and
+/// how many trailing bytes they share without overlapping that prefix.
+fn common_prefix_suffix(a: &[u8], b: &[u8]) -> (usize, usize) {
+    let shortest = a.len().min(b.len());
+
+    let mut prefix = 0;
+    while prefix < shortest && a[prefix] == b[prefix] {
+        prefix += 1;
+    }
+
+    // The suffix may only use bytes the prefix has not already claimed in
+    // the shorter of the two inputs.
+    let suffix_cap = shortest - prefix;
+    let mut suffix = 0;
+    while suffix < suffix_cap && a[a.len() - 1 - suffix] == b[b.len() - 1 - suffix] {
+        suffix += 1;
+    }
+
+    (prefix, suffix)
+}
+
+/// Translate `old_rel`, a byte offset inside the old span, into the matching
+/// offset inside the new span.
+///
+/// Offsets within the shared prefix keep their position; offsets within the
+/// shared suffix keep their distance from the end of the span; offsets inside
+/// the changed middle are scaled proportionally to the new middle's length.
+fn map_boundary_offset(
+    old_rel: usize,
+    old_span_len: usize,
+    new_span_len: usize,
+    span_common_prefix: usize,
+    span_common_suffix: usize,
+) -> usize {
+    if old_rel <= span_common_prefix {
+        return old_rel;
+    }
+
+    let suffix_start_in_old = old_span_len - span_common_suffix;
+    if old_rel >= suffix_start_in_old {
+        let distance_from_end = old_span_len - old_rel;
+        return new_span_len - distance_from_end;
+    }
+
+    // Length of the region that is neither shared prefix nor shared suffix.
+    let changed_len = |span_len: usize| {
+        span_len
+            .saturating_sub(span_common_prefix)
+            .saturating_sub(span_common_suffix)
+    };
+    let old_changed = changed_len(old_span_len);
+    let new_changed = changed_len(new_span_len);
+
+    match old_changed {
+        0 => span_common_prefix,
+        _ => span_common_prefix + ((old_rel - span_common_prefix) * new_changed / old_changed),
+    }
+}
+
+/// Move `offset` to the start of a line in `text`.
+///
+/// The offset is first clamped to the text and floored to a char boundary.
+/// An offset at the end of the text, at 0, or directly after a `\n` is
+/// returned as is. Otherwise the result is the start of the following line,
+/// or, when no `\n` follows, the start of the line containing the offset.
+fn snap_to_line_start(text: &str, offset: usize) -> usize {
+    let anchor = text.floor_char_boundary(offset.min(text.len()));
+    if anchor >= text.len() {
+        return text.len();
+    }
+
+    let (before, after) = text.split_at(anchor);
+    if before.is_empty() || before.ends_with('\n') {
+        return anchor;
+    }
+
+    // A position right after the one-byte `\n` is always in bounds and on a
+    // char boundary, so no further clamping is needed.
+    match after.find('\n') {
+        Some(newline_rel) => anchor + newline_rel + 1,
+        None => before.rfind('\n').map_or(0, |newline| newline + 1),
+    }
+}
+
+/// Write the editable region content with byte-exact marker tags, inserting the
+/// cursor marker at the given offset within the editable text.
+///
+/// The `tag_for_index` closure maps a boundary index to the marker tag string.
+fn write_editable_with_markers_impl(
output: &mut String,
editable_text: &str,
cursor_offset_in_editable: usize,
cursor_marker: &str,
+ marker_offsets: &[usize],
+ tag_for_index: impl Fn(usize) -> String,
) {
- let marker_offsets = compute_marker_offsets(editable_text);
- let anchor_idx = cursor_block_index(Some(cursor_offset_in_editable), &marker_offsets);
let mut cursor_placed = false;
-
for (i, &offset) in marker_offsets.iter().enumerate() {
- let marker_delta = i as isize - anchor_idx as isize;
- output.push_str(&marker_tag_relative(marker_delta));
+ output.push_str(&tag_for_index(i));
if let Some(&next_offset) = marker_offsets.get(i + 1) {
let block = &editable_text[offset..next_offset];
@@ -493,11 +567,6 @@ pub fn write_editable_with_markers_v0317(
}
}
-/// Write the editable region content with V0316 byte-exact marker tags.
-///
-/// Unlike the V0306 version, markers are pure delimiters with no newline
-/// padding. The content between markers is the exact bytes from the editable
-/// text.
pub fn write_editable_with_markers_v0316(
output: &mut String,
editable_text: &str,
@@ -505,103 +574,93 @@ pub fn write_editable_with_markers_v0316(
cursor_marker: &str,
) {
let marker_offsets = compute_marker_offsets(editable_text);
- let mut cursor_placed = false;
- for (i, &offset) in marker_offsets.iter().enumerate() {
- let marker_num = i + 1;
- output.push_str(&marker_tag(marker_num));
+ write_editable_with_markers_impl(
+ output,
+ editable_text,
+ cursor_offset_in_editable,
+ cursor_marker,
+ &marker_offsets,
+ |i| marker_tag(i + 1),
+ );
+}
- if let Some(&next_offset) = marker_offsets.get(i + 1) {
- let block = &editable_text[offset..next_offset];
- if !cursor_placed
- && cursor_offset_in_editable >= offset
- && cursor_offset_in_editable <= next_offset
- {
- cursor_placed = true;
- let cursor_in_block = cursor_offset_in_editable - offset;
- output.push_str(&block[..cursor_in_block]);
- output.push_str(cursor_marker);
- output.push_str(&block[cursor_in_block..]);
- } else {
- output.push_str(block);
- }
- }
- }
+/// Append `editable_text` to `output` with V0317 marker tags and the cursor
+/// marker inserted.
+///
+/// Boundaries come from `compute_marker_offsets`; each tag carries the
+/// boundary's index relative to the block that contains the cursor.
+pub fn write_editable_with_markers_v0317(
+    output: &mut String,
+    editable_text: &str,
+    cursor_offset_in_editable: usize,
+    cursor_marker: &str,
+) {
+    let boundaries = compute_marker_offsets(editable_text);
+    let cursor_block = cursor_block_index(Some(cursor_offset_in_editable), &boundaries);
+    let relative_tag =
+        |boundary_idx: usize| marker_tag_relative(boundary_idx as isize - cursor_block as isize);
+    write_editable_with_markers_impl(
+        output,
+        editable_text,
+        cursor_offset_in_editable,
+        cursor_marker,
+        &boundaries,
+        relative_tag,
+    );
}
-/// Parse V0316 model output and reconstruct the full new editable region.
-///
-/// V0316 differences from V0306:
-/// - No newline stripping or normalization (byte-exact content).
-/// - The no-edit signal is `start_num == end_num` (any repeated marker).
-/// - Intermediate marker tags are used for block-level extraction.
-pub fn apply_marker_span_v0316(old_editable: &str, output: &str) -> Result<String> {
- let markers = collect_marker_tags(output);
+/// Append `editable_text` to `output` with V0318 marker tags and the cursor
+/// marker inserted.
+///
+/// Boundaries come from `compute_marker_offsets_v0318`; tags are numbered
+/// from 1 in boundary order.
+pub fn write_editable_with_markers_v0318(
+    output: &mut String,
+    editable_text: &str,
+    cursor_offset_in_editable: usize,
+    cursor_marker: &str,
+) {
+    let boundaries = compute_marker_offsets_v0318(editable_text);
+    let absolute_tag = |boundary_idx: usize| marker_tag(boundary_idx + 1);
+    write_editable_with_markers_impl(
+        output,
+        editable_text,
+        cursor_offset_in_editable,
+        cursor_marker,
+        &boundaries,
+        absolute_tag,
+    );
+}
- if markers.is_empty() {
+/// Parse byte-exact model output and reconstruct the full new editable region.
+///
+/// `resolve_boundary` maps a parsed tag value to an absolute byte offset in
+/// old_editable, given the marker_offsets. Returns `(start_byte, end_byte)` or
+/// an error.
+fn apply_marker_span_impl(
+ old_editable: &str,
+ tags: &[ParsedTag],
+ output: &str,
+ resolve_boundaries: impl Fn(isize, isize) -> Result<(usize, usize)>,
+) -> Result<String> {
+ if tags.is_empty() {
return Err(anyhow!("no marker tags found in output"));
}
-
- if markers.len() == 1 {
+ if tags.len() == 1 {
return Err(anyhow!(
"only one marker tag found in output, expected at least two"
));
}
- let start_num = markers
- .first()
- .map(|marker| marker.number)
- .context("missing first marker")?;
- let end_num = markers
- .last()
- .map(|marker| marker.number)
- .context("missing last marker")?;
+ let start_value = tags[0].value;
+ let end_value = tags[tags.len() - 1].value;
- // No-edit signal: start_num == end_num
- if start_num == end_num {
+ if start_value == end_value {
return Ok(old_editable.to_string());
}
- // Validate monotonically increasing with no gaps
- let expected_nums: Vec<usize> = (start_num..=end_num).collect();
- let actual_nums: Vec<usize> = markers.iter().map(|m| m.number).collect();
- if actual_nums != expected_nums {
- eprintln!(
- "V0316 marker sequence validation failed: expected {:?}, got {:?}. Attempting best-effort parse.",
- expected_nums, actual_nums
- );
- }
-
- let marker_offsets = compute_marker_offsets(old_editable);
-
- let start_idx = start_num
- .checked_sub(1)
- .context("marker numbers are 1-indexed")?;
- let end_idx = end_num
- .checked_sub(1)
- .context("marker numbers are 1-indexed")?;
-
- let start_byte = *marker_offsets
- .get(start_idx)
- .context("start marker number out of range")?;
- let end_byte = *marker_offsets
- .get(end_idx)
- .context("end marker number out of range")?;
+ let (start_byte, end_byte) = resolve_boundaries(start_value, end_value)?;
if start_byte > end_byte {
return Err(anyhow!("start marker must come before end marker"));
}
- // Extract byte-exact content between consecutive markers
let mut new_content = String::new();
- for i in 0..markers.len() - 1 {
- let content_start = markers[i].tag_end;
- let content_end = markers[i + 1].tag_start;
+ for i in 0..tags.len() - 1 {
+ let content_start = tags[i].tag_end;
+ let content_end = tags[i + 1].tag_start;
if content_start <= content_end {
new_content.push_str(&output[content_start..content_end]);
}
}
- // Splice into old_editable
let mut result = String::new();
result.push_str(&old_editable[..start_byte]);
result.push_str(&new_content);
@@ -610,123 +669,127 @@ pub fn apply_marker_span_v0316(old_editable: &str, output: &str) -> Result<Strin
Ok(result)
}
-/// Parse V0317 model output and reconstruct the full new editable region.
-///
-/// V0317 differences from V0316:
-/// - Marker ids are relative to the cursor block (e.g. -2, -1, 0, +1, +2).
-/// - No-edit signal is any repeated relative marker tag.
+/// Parse V0316 model output and reconstruct the full new editable region.
+///
+/// Marker numbers are 1-indexed positions into the boundaries returned by
+/// `compute_marker_offsets(old_editable)`. The shared implementation rejects
+/// output with fewer than two tags and treats equal first and last markers as
+/// "no edit".
+///
+/// # Errors
+/// Fails when a marker number is 0 or lies beyond the known boundaries, or
+/// when the shared implementation rejects the output.
+pub fn apply_marker_span_v0316(old_editable: &str, output: &str) -> Result<String> {
+    let tags = collect_marker_tags(output);
+
+    // Best-effort validation: the tag numbers should count up from the first
+    // to the last without gaps. The comparison is lazy so that an absurdly
+    // large marker number in the model output cannot force a huge allocation.
+    if let (Some(first), Some(last)) = (tags.first(), tags.last()) {
+        let (start_num, end_num) = (first.value, last.value);
+        let is_contiguous = || tags.iter().map(|tag| tag.value).eq(start_num..=end_num);
+        if start_num != end_num && !is_contiguous() {
+            let actual: Vec<isize> = tags.iter().map(|tag| tag.value).collect();
+            eprintln!(
+                "V0316 marker sequence validation failed: expected {:?}, got {:?}. Attempting best-effort parse.",
+                start_num..=end_num,
+                actual
+            );
+        }
+    }
+
+    let marker_offsets = compute_marker_offsets(old_editable);
+    apply_marker_span_impl(old_editable, &tags, output, |start_val, end_val| {
+        // Values originate from a `usize` parse, so casting back restores the
+        // parsed number; `checked_sub` rejects marker 0.
+        let start_idx = (start_val as usize)
+            .checked_sub(1)
+            .context("marker numbers are 1-indexed")?;
+        let end_idx = (end_val as usize)
+            .checked_sub(1)
+            .context("marker numbers are 1-indexed")?;
+        let start_byte = *marker_offsets
+            .get(start_idx)
+            .context("start marker number out of range")?;
+        let end_byte = *marker_offsets
+            .get(end_idx)
+            .context("end marker number out of range")?;
+        Ok((start_byte, end_byte))
+    })
+}
+
+/// Parse V0317 model output and reconstruct the full new editable region.
+///
+/// Marker ids are offsets relative to the block containing the cursor; they
+/// are resolved against `compute_marker_offsets(old_editable)`.
+///
+/// # Errors
+/// Fails when a relative marker resolves before the first boundary or past
+/// the last one, or when the shared implementation rejects the output.
pub fn apply_marker_span_v0317(
old_editable: &str,
output: &str,
cursor_offset_in_old: Option<usize>,
) -> Result<String> {
- let markers = collect_relative_marker_tags(output);
-
- if markers.is_empty() {
- return Err(anyhow!("no marker tags found in output"));
- }
-
- if markers.len() == 1 {
- return Err(anyhow!(
- "only one marker tag found in output, expected at least two"
- ));
- }
-
+    let tags = collect_relative_marker_tags(output);
let marker_offsets = compute_marker_offsets(old_editable);
let anchor_idx = cursor_block_index(cursor_offset_in_old, &marker_offsets);
- let start_delta = markers
- .first()
- .map(|marker| marker.delta)
- .context("missing first marker")?;
- let end_delta = markers
- .last()
- .map(|marker| marker.delta)
- .context("missing last marker")?;
-
- if start_delta == end_delta {
- return Ok(old_editable.to_string());
- }
-
- let start_idx_isize = anchor_idx as isize + start_delta;
- let end_idx_isize = anchor_idx as isize + end_delta;
- if start_idx_isize < 0 || end_idx_isize < 0 {
- return Err(anyhow!("relative marker maps before first marker"));
- }
-
- let start_idx = usize::try_from(start_idx_isize).context("invalid start marker index")?;
- let end_idx = usize::try_from(end_idx_isize).context("invalid end marker index")?;
-
- let start_byte = *marker_offsets
- .get(start_idx)
- .context("start marker number out of range")?;
- let end_byte = *marker_offsets
- .get(end_idx)
- .context("end marker number out of range")?;
-
- if start_byte > end_byte {
- return Err(anyhow!("start marker must come before end marker"));
- }
+    apply_marker_span_impl(old_editable, &tags, output, |start_delta, end_delta| {
+        // Deltas come straight from model output, so add them with overflow
+        // checking instead of `anchor_idx as isize + delta`, which can
+        // overflow on an extreme delta. A `None` here means the sum is
+        // negative, i.e. the marker points before the first boundary.
+        let index_for = |delta: isize| {
+            anchor_idx
+                .checked_add_signed(delta)
+                .context("relative marker maps before first marker")
+        };
+        let start_idx = index_for(start_delta)?;
+        let end_idx = index_for(end_delta)?;
+        let start_byte = *marker_offsets
+            .get(start_idx)
+            .context("start marker number out of range")?;
+        let end_byte = *marker_offsets
+            .get(end_idx)
+            .context("end marker number out of range")?;
+        Ok((start_byte, end_byte))
+    })
+}
- let mut new_content = String::new();
- for i in 0..markers.len() - 1 {
- let content_start = markers[i].tag_end;
- let content_end = markers[i + 1].tag_start;
- if content_start <= content_end {
- new_content.push_str(&output[content_start..content_end]);
+/// Parse V0318 model output and reconstruct the full new editable region.
+///
+/// Marker numbers are 1-indexed positions into the boundaries returned by
+/// `compute_marker_offsets_v0318(old_editable)`.
+///
+/// # Errors
+/// Fails when a marker number is 0 or lies beyond the known boundaries, or
+/// when the shared implementation rejects the output.
+pub fn apply_marker_span_v0318(old_editable: &str, output: &str) -> Result<String> {
+    let tags = collect_marker_tags(output);
+
+    if tags.len() >= 2 {
+        let start_num = tags[0].value;
+        let end_num = tags[tags.len() - 1].value;
+        // Best-effort validation that the numbers count up without gaps. The
+        // comparison is lazy so that an absurdly large marker number in the
+        // model output cannot force a huge allocation.
+        if start_num != end_num && !tags.iter().map(|tag| tag.value).eq(start_num..=end_num) {
+            let actual: Vec<isize> = tags.iter().map(|tag| tag.value).collect();
+            eprintln!(
+                "V0318 marker sequence validation failed: expected {:?}, got {:?}. Attempting best-effort parse.",
+                start_num..=end_num,
+                actual
+            );
}
}
- let mut result = String::new();
- result.push_str(&old_editable[..start_byte]);
- result.push_str(&new_content);
- result.push_str(&old_editable[end_byte..]);
-
- Ok(result)
+    let marker_offsets = compute_marker_offsets_v0318(old_editable);
+    apply_marker_span_impl(old_editable, &tags, output, |start_val, end_val| {
+        // Values originate from a `usize` parse, so casting back restores the
+        // parsed number; `checked_sub` rejects marker 0.
+        let start_idx = (start_val as usize)
+            .checked_sub(1)
+            .context("marker numbers are 1-indexed")?;
+        let end_idx = (end_val as usize)
+            .checked_sub(1)
+            .context("marker numbers are 1-indexed")?;
+        let start_byte = *marker_offsets
+            .get(start_idx)
+            .context("start marker number out of range")?;
+        let end_byte = *marker_offsets
+            .get(end_idx)
+            .context("end marker number out of range")?;
+        Ok((start_byte, end_byte))
+    })
}
-/// Encode the V0316 training target from old and new editable text.
+/// Encode the training target from old and new editable text.
///
-/// V0316 differences from V0306:
-/// - No-edit signal: `<|marker_C|><|marker_C|>{end_marker}` where C is nearest
-/// to cursor.
-/// - All intermediate markers are emitted with byte-exact content.
-/// - No newline padding around marker tags.
-pub fn encode_from_old_and_new_v0316(
+/// Shared implementation for V0316, V0317, and V0318. The `tag_for_block_idx`
+/// closure maps a block index to the appropriate marker tag string.
+/// `no_edit_tag` is the marker tag to repeat when there are no edits.
+fn encode_from_old_and_new_impl(
old_editable: &str,
new_editable: &str,
cursor_offset_in_new: Option<usize>,
cursor_marker: &str,
end_marker: &str,
+ no_edit_tag: &str,
+ marker_offsets: &[usize],
+ tag_for_block_idx: impl Fn(usize) -> String,
) -> Result<String> {
- let marker_offsets = compute_marker_offsets(old_editable);
-
if old_editable == new_editable {
- let marker_num = nearest_marker_number(cursor_offset_in_new, &marker_offsets);
- let tag = marker_tag(marker_num);
- return Ok(format!("{tag}{tag}{end_marker}"));
+ return Ok(format!("{no_edit_tag}{no_edit_tag}{end_marker}"));
}
- let common_prefix = old_editable
- .bytes()
- .zip(new_editable.bytes())
- .take_while(|(a, b)| a == b)
- .count();
-
- let old_remaining = old_editable.len() - common_prefix;
- let new_remaining = new_editable.len() - common_prefix;
- let max_suffix = old_remaining.min(new_remaining);
- let common_suffix = old_editable.as_bytes()[old_editable.len() - max_suffix..]
- .iter()
- .rev()
- .zip(
- new_editable.as_bytes()[new_editable.len() - max_suffix..]
- .iter()
- .rev(),
- )
- .take_while(|(a, b)| a == b)
- .count();
-
+ let (common_prefix, common_suffix) =
+ common_prefix_suffix(old_editable.as_bytes(), new_editable.as_bytes());
let change_end_in_old = old_editable.len() - common_suffix;
let start_marker_idx = marker_offsets
@@ -749,40 +812,19 @@ pub fn encode_from_old_and_new_v0316(
let new_span = &new_editable[new_start..new_end];
let old_span = &old_editable[old_start..old_end];
- // Compute common prefix/suffix within the span for accurate boundary mapping
- let span_common_prefix = old_span
- .bytes()
- .zip(new_span.bytes())
- .take_while(|(a, b)| a == b)
- .count();
-
- let span_old_remaining = old_span.len() - span_common_prefix;
- let span_new_remaining = new_span.len() - span_common_prefix;
- let span_max_suffix = span_old_remaining.min(span_new_remaining);
- let span_common_suffix = old_span.as_bytes()[old_span.len() - span_max_suffix..]
- .iter()
- .rev()
- .zip(
- new_span.as_bytes()[new_span.len() - span_max_suffix..]
- .iter()
- .rev(),
- )
- .take_while(|(a, b)| a == b)
- .count();
+ let (span_common_prefix, span_common_suffix) =
+ common_prefix_suffix(old_span.as_bytes(), new_span.as_bytes());
let mut result = String::new();
let mut prev_new_rel = 0usize;
let mut cursor_placed = false;
for block_idx in start_marker_idx..end_marker_idx {
- let marker_num = block_idx + 1;
- result.push_str(&marker_tag(marker_num));
+ result.push_str(&tag_for_block_idx(block_idx));
let new_rel_end = if block_idx + 1 == end_marker_idx {
- // Last block: extends to end of new span
new_span.len()
} else {
- // Map the intermediate boundary from old to new coordinates
let old_rel = marker_offsets[block_idx + 1] - old_start;
let mapped = map_boundary_offset(
old_rel,
@@ -791,13 +833,10 @@ pub fn encode_from_old_and_new_v0316(
span_common_prefix,
span_common_suffix,
);
- // Ensure char boundary safety and monotonicity
- new_span.floor_char_boundary(mapped)
+ snap_to_line_start(new_span, mapped)
};
- // Ensure monotonicity (each block gets at least zero content)
let new_rel_end = new_rel_end.max(prev_new_rel);
-
let block_content = &new_span[prev_new_rel..new_rel_end];
if !cursor_placed {
@@ -821,19 +860,33 @@ pub fn encode_from_old_and_new_v0316(
prev_new_rel = new_rel_end;
}
- // Final closing marker
- let end_marker_num = end_marker_idx + 1;
- result.push_str(&marker_tag(end_marker_num));
+ result.push_str(&tag_for_block_idx(end_marker_idx));
result.push_str(end_marker);
Ok(result)
}
-/// Encode the V0317 training target from old and new editable text.
-///
-/// V0317 differences from V0316:
-/// - Marker ids are relative to cursor block (..., -2, -1, 0, +1, +2, ...).
-/// - No-edit signal: repeated cursor-relative marker.
+/// Encode the V0316 training target from old and new editable text.
+///
+/// Boundaries come from `compute_marker_offsets(old_editable)` and tags are
+/// numbered from 1. The tag handed over as the no-edit tag is the marker
+/// nearest to the cursor.
+pub fn encode_from_old_and_new_v0316(
+    old_editable: &str,
+    new_editable: &str,
+    cursor_offset_in_new: Option<usize>,
+    cursor_marker: &str,
+    end_marker: &str,
+) -> Result<String> {
+    let boundaries = compute_marker_offsets(old_editable);
+    let nearest = nearest_marker_number(cursor_offset_in_new, &boundaries);
+    let repeated_tag = marker_tag(nearest);
+    let absolute_tag = |block_idx: usize| marker_tag(block_idx + 1);
+    encode_from_old_and_new_impl(
+        old_editable,
+        new_editable,
+        cursor_offset_in_new,
+        cursor_marker,
+        end_marker,
+        &repeated_tag,
+        &boundaries,
+        absolute_tag,
+    )
+}
+
pub fn encode_from_old_and_new_v0317(
old_editable: &str,
new_editable: &str,
@@ -843,157 +896,38 @@ pub fn encode_from_old_and_new_v0317(
) -> Result<String> {
let marker_offsets = compute_marker_offsets(old_editable);
let anchor_idx = cursor_block_index(cursor_offset_in_new, &marker_offsets);
-
- if old_editable == new_editable {
- let tag = marker_tag_relative(0);
- return Ok(format!("{tag}{tag}{end_marker}"));
- }
-
- let common_prefix = old_editable
- .bytes()
- .zip(new_editable.bytes())
- .take_while(|(a, b)| a == b)
- .count();
-
- let old_remaining = old_editable.len() - common_prefix;
- let new_remaining = new_editable.len() - common_prefix;
- let max_suffix = old_remaining.min(new_remaining);
- let common_suffix = old_editable.as_bytes()[old_editable.len() - max_suffix..]
- .iter()
- .rev()
- .zip(
- new_editable.as_bytes()[new_editable.len() - max_suffix..]
- .iter()
- .rev(),
- )
- .take_while(|(a, b)| a == b)
- .count();
-
- let change_end_in_old = old_editable.len() - common_suffix;
-
- let start_marker_idx = marker_offsets
- .iter()
- .rposition(|&offset| offset <= common_prefix)
- .unwrap_or(0);
- let end_marker_idx = marker_offsets
- .iter()
- .position(|&offset| offset >= change_end_in_old)
- .unwrap_or(marker_offsets.len() - 1);
-
- let old_start = marker_offsets[start_marker_idx];
- let old_end = marker_offsets[end_marker_idx];
-
- let new_start = old_start;
- let new_end = new_editable
- .len()
- .saturating_sub(old_editable.len().saturating_sub(old_end));
-
- let new_span = &new_editable[new_start..new_end];
- let old_span = &old_editable[old_start..old_end];
-
- let span_common_prefix = old_span
- .bytes()
- .zip(new_span.bytes())
- .take_while(|(a, b)| a == b)
- .count();
-
- let span_old_remaining = old_span.len() - span_common_prefix;
- let span_new_remaining = new_span.len() - span_common_prefix;
- let span_max_suffix = span_old_remaining.min(span_new_remaining);
- let span_common_suffix = old_span.as_bytes()[old_span.len() - span_max_suffix..]
- .iter()
- .rev()
- .zip(
- new_span.as_bytes()[new_span.len() - span_max_suffix..]
- .iter()
- .rev(),
- )
- .take_while(|(a, b)| a == b)
- .count();
-
- let mut result = String::new();
- let mut prev_new_rel = 0usize;
- let mut cursor_placed = false;
-
- for block_idx in start_marker_idx..end_marker_idx {
- let marker_delta = block_idx as isize - anchor_idx as isize;
- result.push_str(&marker_tag_relative(marker_delta));
-
- let new_rel_end = if block_idx + 1 == end_marker_idx {
- new_span.len()
- } else {
- let old_rel = marker_offsets[block_idx + 1] - old_start;
- let mapped = map_boundary_offset(
- old_rel,
- old_span.len(),
- new_span.len(),
- span_common_prefix,
- span_common_suffix,
- );
- new_span.floor_char_boundary(mapped)
- };
-
- let new_rel_end = new_rel_end.max(prev_new_rel);
- let block_content = &new_span[prev_new_rel..new_rel_end];
-
- if !cursor_placed {
- if let Some(cursor_offset) = cursor_offset_in_new {
- let abs_start = new_start + prev_new_rel;
- let abs_end = new_start + new_rel_end;
- if cursor_offset >= abs_start && cursor_offset <= abs_end {
- cursor_placed = true;
- let cursor_in_block = cursor_offset - abs_start;
- let bounded = cursor_in_block.min(block_content.len());
- result.push_str(&block_content[..bounded]);
- result.push_str(cursor_marker);
- result.push_str(&block_content[bounded..]);
- prev_new_rel = new_rel_end;
- continue;
- }
- }
- }
-
- result.push_str(block_content);
- prev_new_rel = new_rel_end;
- }
-
- let end_marker_delta = end_marker_idx as isize - anchor_idx as isize;
- result.push_str(&marker_tag_relative(end_marker_delta));
- result.push_str(end_marker);
-
- Ok(result)
+ let no_edit_tag = marker_tag_relative(0);
+ encode_from_old_and_new_impl(
+ old_editable,
+ new_editable,
+ cursor_offset_in_new,
+ cursor_marker,
+ end_marker,
+ &no_edit_tag,
+ &marker_offsets,
+ |block_idx| marker_tag_relative(block_idx as isize - anchor_idx as isize),
+ )
}
-/// Map a byte offset from old span coordinates to new span coordinates,
-/// using common prefix/suffix within the span for accuracy.
-fn map_boundary_offset(
- old_rel: usize,
- old_span_len: usize,
- new_span_len: usize,
- span_common_prefix: usize,
- span_common_suffix: usize,
-) -> usize {
- if old_rel <= span_common_prefix {
- old_rel
- } else if old_rel >= old_span_len - span_common_suffix {
- new_span_len - (old_span_len - old_rel)
- } else {
- // Within the changed region: proportional mapping
- let old_changed_start = span_common_prefix;
- let old_changed_len = old_span_len
- .saturating_sub(span_common_prefix)
- .saturating_sub(span_common_suffix);
- let new_changed_start = span_common_prefix;
- let new_changed_len = new_span_len
- .saturating_sub(span_common_prefix)
- .saturating_sub(span_common_suffix);
-
- if old_changed_len == 0 {
- new_changed_start
- } else {
- new_changed_start + ((old_rel - old_changed_start) * new_changed_len / old_changed_len)
- }
- }
+/// Encode the V0318 training target from old and new editable text.
+///
+/// Boundaries come from `compute_marker_offsets_v0318(old_editable)` and tags
+/// are numbered from 1. The tag handed over as the no-edit tag is the marker
+/// nearest to the cursor.
+pub fn encode_from_old_and_new_v0318(
+    old_editable: &str,
+    new_editable: &str,
+    cursor_offset_in_new: Option<usize>,
+    cursor_marker: &str,
+    end_marker: &str,
+) -> Result<String> {
+    let boundaries = compute_marker_offsets_v0318(old_editable);
+    let nearest = nearest_marker_number(cursor_offset_in_new, &boundaries);
+    let repeated_tag = marker_tag(nearest);
+    let absolute_tag = |block_idx: usize| marker_tag(block_idx + 1);
+    encode_from_old_and_new_impl(
+        old_editable,
+        new_editable,
+        cursor_offset_in_new,
+        cursor_marker,
+        end_marker,
+        &repeated_tag,
+        &boundaries,
+        absolute_tag,
+    )
}
#[cfg(test)]
@@ -91,6 +91,8 @@ pub enum ZetaFormat {
V0306SeedMultiRegions,
/// Byte-exact marker spans; all intermediate markers emitted; repeated marker means no-edit.
V0316SeedMultiRegions,
+ /// V0316 with larger marker blocks (6-16 lines per block instead of 3-8).
+ V0318SeedMultiRegions,
/// V0316, but marker numbers are relative to the cursor block (e.g. -1, -0, +1).
V0317SeedMultiRegions,
}
@@ -242,6 +244,18 @@ pub fn special_tokens_for_format(format: ZetaFormat) -> &'static [&'static str]
];
TOKENS
}
+ ZetaFormat::V0318SeedMultiRegions => {
+ static TOKENS: &[&str] = &[
+ seed_coder::FIM_SUFFIX,
+ seed_coder::FIM_PREFIX,
+ seed_coder::FIM_MIDDLE,
+ seed_coder::FILE_MARKER,
+ multi_region::V0318_END_MARKER,
+ CURSOR_MARKER,
+ multi_region::MARKER_TAG_PREFIX,
+ ];
+ TOKENS
+ }
ZetaFormat::V0317SeedMultiRegions => {
static TOKENS: &[&str] = &[
seed_coder::FIM_SUFFIX,
@@ -283,6 +297,7 @@ pub fn token_limits_for_format(format: ZetaFormat) -> (usize, usize) {
| ZetaFormat::v0226Hashline
| ZetaFormat::V0306SeedMultiRegions
| ZetaFormat::V0316SeedMultiRegions
+ | ZetaFormat::V0318SeedMultiRegions
| ZetaFormat::V0317SeedMultiRegions
| ZetaFormat::V0304SeedNoEdits => (350, 150),
ZetaFormat::V0304VariableEdit => (1024, 0),
@@ -303,6 +318,7 @@ pub fn stop_tokens_for_format(format: ZetaFormat) -> &'static [&'static str] {
| ZetaFormat::V0306SeedMultiRegions
| ZetaFormat::V0304SeedNoEdits => &[],
ZetaFormat::V0316SeedMultiRegions => &[multi_region::V0316_END_MARKER],
+ ZetaFormat::V0318SeedMultiRegions => &[multi_region::V0318_END_MARKER],
ZetaFormat::V0317SeedMultiRegions => &[multi_region::V0317_END_MARKER],
}
}
@@ -328,6 +344,7 @@ pub fn excerpt_ranges_for_format(
| ZetaFormat::V0304SeedNoEdits
| ZetaFormat::V0306SeedMultiRegions
| ZetaFormat::V0316SeedMultiRegions
+ | ZetaFormat::V0318SeedMultiRegions
| ZetaFormat::V0317SeedMultiRegions => (
ranges.editable_350.clone(),
ranges.editable_350_context_150.clone(),
@@ -419,6 +436,14 @@ pub fn write_cursor_excerpt_section_for_format(
cursor_offset,
));
}
+ ZetaFormat::V0318SeedMultiRegions => {
+ prompt.push_str(&build_v0318_cursor_prefix(
+ path,
+ context,
+ editable_range,
+ cursor_offset,
+ ));
+ }
ZetaFormat::V0317SeedMultiRegions => {
prompt.push_str(&build_v0317_cursor_prefix(
path,
@@ -486,6 +511,33 @@ fn build_v0316_cursor_prefix(
section
}
+/// Builds the cursor-excerpt section of a V0318 prompt and returns it as a
+/// new `String`.
+///
+/// The section consists of, in order: `seed_coder::FILE_MARKER` followed by
+/// the path and a newline, the part of `context` before
+/// `editable_range.start`, and the editable slice as written by
+/// `multi_region::write_editable_with_markers_v0318`. A trailing newline is
+/// appended if the result does not already end with one. Nothing after
+/// `editable_range.end` is appended here.
+///
+/// * `path` - file path, rendered lossily with `to_string_lossy`.
+/// * `context` - text that `editable_range` and `cursor_offset` index into.
+/// * `editable_range` - byte range of the editable region within `context`.
+/// * `cursor_offset` - cursor position as a byte offset into `context`.
+///
+/// NOTE(review): assumes `editable_range` lies within `context` on char
+/// boundaries and that `cursor_offset >= editable_range.start`; otherwise the
+/// slicing or the subtraction below misbehaves - confirm callers guarantee this.
+fn build_v0318_cursor_prefix(
+ path: &Path,
+ context: &str,
+ editable_range: &Range<usize>,
+ cursor_offset: usize,
+) -> String {
+ let mut section = String::new();
+ let path_str = path.to_string_lossy();
+ // The formatting result is deliberately discarded with `.ok()`.
+ write!(section, "{}{}\n", seed_coder::FILE_MARKER, path_str).ok();
+
+ // Text preceding the editable region is copied verbatim.
+ section.push_str(&context[..editable_range.start]);
+
+ let editable_text = &context[editable_range.clone()];
+ // Rebase the cursor from `context` coordinates to `editable_text` coordinates.
+ let cursor_in_editable = cursor_offset - editable_range.start;
+ multi_region::write_editable_with_markers_v0318(
+ &mut section,
+ editable_text,
+ cursor_in_editable,
+ CURSOR_MARKER,
+ );
+
+ // Guarantee the section is newline-terminated.
+ if !section.ends_with('\n') {
+ section.push('\n');
+ }
+ section
+}
+
fn build_v0317_cursor_prefix(
path: &Path,
context: &str,
@@ -551,6 +603,7 @@ pub fn format_prompt_with_budget_for_format(
| ZetaFormat::V0304SeedNoEdits
| ZetaFormat::V0306SeedMultiRegions
| ZetaFormat::V0316SeedMultiRegions
+ | ZetaFormat::V0318SeedMultiRegions
| ZetaFormat::V0317SeedMultiRegions => {
let mut cursor_section = String::new();
write_cursor_excerpt_section_for_format(
@@ -649,6 +702,7 @@ pub fn max_edit_event_count_for_format(format: &ZetaFormat) -> usize {
| ZetaFormat::V0304VariableEdit
| ZetaFormat::V0306SeedMultiRegions
| ZetaFormat::V0316SeedMultiRegions
+ | ZetaFormat::V0318SeedMultiRegions
| ZetaFormat::V0317SeedMultiRegions => 6,
}
}
@@ -671,6 +725,7 @@ pub fn get_prefill_for_format(
ZetaFormat::V0304SeedNoEdits
| ZetaFormat::V0306SeedMultiRegions
| ZetaFormat::V0316SeedMultiRegions
+ | ZetaFormat::V0318SeedMultiRegions
| ZetaFormat::V0317SeedMultiRegions => String::new(),
}
}
@@ -684,6 +739,7 @@ pub fn output_end_marker_for_format(format: ZetaFormat) -> Option<&'static str>
| ZetaFormat::V0304SeedNoEdits
| ZetaFormat::V0306SeedMultiRegions => Some(seed_coder::END_MARKER),
ZetaFormat::V0316SeedMultiRegions => Some(multi_region::V0316_END_MARKER),
+ ZetaFormat::V0318SeedMultiRegions => Some(multi_region::V0318_END_MARKER),
ZetaFormat::V0317SeedMultiRegions => Some(multi_region::V0317_END_MARKER),
ZetaFormat::V0112MiddleAtEnd
| ZetaFormat::V0113Ordered
@@ -727,6 +783,22 @@ pub fn encode_patch_as_output_for_format(
Ok(None)
}
}
+ ZetaFormat::V0318SeedMultiRegions => {
+ let empty_patch = patch.lines().count() <= 3;
+ if empty_patch {
+ let marker_offsets =
+ multi_region::compute_marker_offsets_v0318(old_editable_region);
+ let marker_num =
+ multi_region::nearest_marker_number(cursor_offset, &marker_offsets);
+ let tag = multi_region::marker_tag(marker_num);
+ Ok(Some(format!(
+ "{tag}{tag}{}",
+ multi_region::V0318_END_MARKER
+ )))
+ } else {
+ Ok(None)
+ }
+ }
ZetaFormat::V0317SeedMultiRegions => {
let empty_patch = patch.lines().count() <= 3;
if empty_patch {
@@ -797,6 +869,10 @@ pub fn parse_zeta2_model_output(
editable_range_in_context,
multi_region::apply_marker_span_v0316(old_editable_region, output)?,
),
+ ZetaFormat::V0318SeedMultiRegions => (
+ editable_range_in_context,
+ multi_region::apply_marker_span_v0318(old_editable_region, output)?,
+ ),
ZetaFormat::V0317SeedMultiRegions => (
editable_range_in_context,
multi_region::apply_marker_span_v0317(