diff --git a/crates/edit_prediction_cli/src/format_prompt.rs b/crates/edit_prediction_cli/src/format_prompt.rs index b0cfbd77ee543d4271cda0fb952f5ba48fc4a998..1da966ea9c5b2f3cf7b866bc82839de9d70e9fa6 100644 --- a/crates/edit_prediction_cli/src/format_prompt.rs +++ b/crates/edit_prediction_cli/src/format_prompt.rs @@ -150,6 +150,20 @@ pub fn zeta2_output_for_patch( ); } + if version == ZetaFormat::V0318SeedMultiRegions { + let cursor_in_new = cursor_offset.map(|cursor_offset| { + let hunk_start = first_hunk_offset.unwrap_or(0); + result.floor_char_boundary((hunk_start + cursor_offset).min(result.len())) + }); + return multi_region::encode_from_old_and_new_v0318( + &old_editable_region, + &result, + cursor_in_new, + zeta_prompt::CURSOR_MARKER, + multi_region::V0318_END_MARKER, + ); + } + if version == ZetaFormat::V0316SeedMultiRegions { let cursor_in_new = cursor_offset.map(|cursor_offset| { let hunk_start = first_hunk_offset.unwrap_or(0); diff --git a/crates/zeta_prompt/src/multi_region.rs b/crates/zeta_prompt/src/multi_region.rs index a27a7245ae74824a086c9a39cc6d48d89f00d8b2..0514b8fd9c3e3fe4887ed57c27600e93f0df497a 100644 --- a/crates/zeta_prompt/src/multi_region.rs +++ b/crates/zeta_prompt/src/multi_region.rs @@ -3,10 +3,14 @@ use anyhow::{Context as _, Result, anyhow}; pub const MARKER_TAG_PREFIX: &str = "<|marker_"; pub const MARKER_TAG_SUFFIX: &str = "|>"; pub const RELATIVE_MARKER_TAG_PREFIX: &str = "<|marker"; -const MIN_BLOCK_LINES: usize = 3; -const MAX_BLOCK_LINES: usize = 8; +const V0316_MIN_BLOCK_LINES: usize = 3; +const V0316_MAX_BLOCK_LINES: usize = 8; +const V0318_MIN_BLOCK_LINES: usize = 6; +const V0318_MAX_BLOCK_LINES: usize = 16; +const MAX_NUDGE_LINES: usize = 5; pub const V0316_END_MARKER: &str = "<[end▁of▁sentence]>"; pub const V0317_END_MARKER: &str = "<[end▁of▁sentence]>"; +pub const V0318_END_MARKER: &str = "<[end▁of▁sentence]>"; pub fn marker_tag(number: usize) -> String { format!("{MARKER_TAG_PREFIX}{number}{MARKER_TAG_SUFFIX}") @@ -22,71 +26,104 @@ pub fn marker_tag_relative(delta: isize) -> String { } } +struct LineInfo { + start: usize, + is_blank: bool, + is_good_start: bool, +} + +fn collect_line_info(text: &str) -> Vec { + let mut lines = Vec::new(); + let mut offset = 0; + for line in text.split('\n') { + let trimmed = line.trim(); + let is_blank = trimmed.is_empty(); + let is_good_start = !is_blank && !is_structural_tail(trimmed); + lines.push(LineInfo { + start: offset, + is_blank, + is_good_start, + }); + offset += line.len() + 1; + } + // split('\n') on "abc\n" yields ["abc", ""] — drop the phantom trailing + // empty element when the text ends with '\n'. + if text.ends_with('\n') && lines.len() > 1 { + lines.pop(); + } + lines +} + +fn is_structural_tail(trimmed_line: &str) -> bool { + if trimmed_line.starts_with(&['}', ']', ')']) { + return true; + } + matches!( + trimmed_line.trim_end_matches(';'), + "break" | "continue" | "return" | "throw" | "end" + ) +} + +/// Starting from line `from`, scan up to `MAX_NUDGE_LINES` forward to find a +/// line with `is_good_start`. Returns `None` if no suitable line is found. +fn skip_to_good_start(lines: &[LineInfo], from: usize) -> Option { + (from..lines.len().min(from + MAX_NUDGE_LINES)).find(|&i| lines[i].is_good_start) +} + /// Compute byte offsets within `editable_text` where marker boundaries should /// be placed. /// /// Returns a sorted `Vec` that always starts with `0` and ends with /// `editable_text.len()`. Interior offsets are placed at line boundaries /// (right after a `\n`), preferring blank-line boundaries when available and -/// respecting `MIN_BLOCK_LINES` / `MAX_BLOCK_LINES` constraints. -pub fn compute_marker_offsets(editable_text: &str) -> Vec { +/// respecting `min_block_lines` / `max_block_lines` constraints. +fn compute_marker_offsets_with_limits( + editable_text: &str, + min_block_lines: usize, + max_block_lines: usize, +) -> Vec { if editable_text.is_empty() { return vec![0, 0]; } + let lines = collect_line_info(editable_text); let mut offsets = vec![0usize]; - let mut lines_since_last_marker = 0usize; - let mut byte_offset = 0usize; - - for line in editable_text.split('\n') { - let line_end = byte_offset + line.len() + 1; - let is_past_end = line_end > editable_text.len(); - let actual_line_end = line_end.min(editable_text.len()); - lines_since_last_marker += 1; - - let is_blank = line.trim().is_empty(); - - if !is_past_end && lines_since_last_marker >= MIN_BLOCK_LINES { - if is_blank { - // Blank-line boundary found. We'll place the marker when we - // find the next non-blank line (handled below). - } else if lines_since_last_marker >= MAX_BLOCK_LINES { - offsets.push(actual_line_end); - lines_since_last_marker = 0; - } - } + let mut last_boundary_line = 0; + let mut i = 0; + + while i < lines.len() { + let gap = i - last_boundary_line; - // Non-blank line immediately following blank line(s): split here so - // the new block starts with this line. - if !is_blank && byte_offset > 0 && lines_since_last_marker >= MIN_BLOCK_LINES { - let before = &editable_text[..byte_offset]; - let has_preceding_blank_line = before - .strip_suffix('\n') - .map(|stripped| { - let last_line = match stripped.rfind('\n') { - Some(pos) => &stripped[pos + 1..], - None => stripped, - }; - last_line.trim().is_empty() - }) - .unwrap_or(false); - - if has_preceding_blank_line { - offsets.push(byte_offset); - lines_since_last_marker = 1; + // Blank-line split: non-blank line following blank line(s) with enough + // accumulated lines. + if gap >= min_block_lines && !lines[i].is_blank && i > 0 && lines[i - 1].is_blank { + let target = if lines[i].is_good_start { + i + } else { + skip_to_good_start(&lines, i).unwrap_or(i) + }; + if lines.len() - target >= min_block_lines + && lines[target].start > *offsets.last().unwrap_or(&0) + { + offsets.push(lines[target].start); + last_boundary_line = target; + i = target + 1; + continue; } } - byte_offset = actual_line_end; - - // Re-check after blank-line logic since lines_since_last_marker may - // have been reset. - if !is_past_end && lines_since_last_marker >= MAX_BLOCK_LINES { - if *offsets.last().unwrap_or(&0) != actual_line_end { - offsets.push(actual_line_end); - lines_since_last_marker = 0; + // Hard cap: too many lines without a split. + if gap >= max_block_lines { + let target = skip_to_good_start(&lines, i).unwrap_or(i); + if lines[target].start > *offsets.last().unwrap_or(&0) { + offsets.push(lines[target].start); + last_boundary_line = target; + i = target + 1; + continue; } } + + i += 1; } let end = editable_text.len(); @@ -97,6 +134,15 @@ pub fn compute_marker_offsets(editable_text: &str) -> Vec { offsets } +/// Compute byte offsets within `editable_text` for the V0316/V0317 block sizing rules. +pub fn compute_marker_offsets(editable_text: &str) -> Vec { + compute_marker_offsets_with_limits(editable_text, V0316_MIN_BLOCK_LINES, V0316_MAX_BLOCK_LINES) +} + +pub fn compute_marker_offsets_v0318(editable_text: &str) -> Vec { + compute_marker_offsets_with_limits(editable_text, V0318_MIN_BLOCK_LINES, V0318_MAX_BLOCK_LINES) +} + /// Write the editable region content with marker tags, inserting the cursor /// marker at the given offset within the editable text. pub fn write_editable_with_markers( @@ -267,27 +313,8 @@ pub fn encode_from_old_and_new( } let marker_offsets = compute_marker_offsets(old_editable); - - let common_prefix = old_editable - .bytes() - .zip(new_editable.bytes()) - .take_while(|(a, b)| a == b) - .count(); - - let old_remaining = old_editable.len() - common_prefix; - let new_remaining = new_editable.len() - common_prefix; - let max_suffix = old_remaining.min(new_remaining); - let common_suffix = old_editable.as_bytes()[old_editable.len() - max_suffix..] - .iter() - .rev() - .zip( - new_editable.as_bytes()[new_editable.len() - max_suffix..] - .iter() - .rev(), - ) - .take_while(|(a, b)| a == b) - .count(); - + let (common_prefix, common_suffix) = + common_prefix_suffix(old_editable.as_bytes(), new_editable.as_bytes()); let change_end_in_old = old_editable.len() - common_suffix; let start_marker_idx = marker_offsets @@ -380,55 +407,24 @@ pub fn extract_editable_region_from_markers(text: &str) -> Option { Some(result) } -struct MarkerTag { - number: usize, - tag_start: usize, - tag_end: usize, -} - -struct RelativeMarkerTag { - delta: isize, +struct ParsedTag { + value: isize, tag_start: usize, tag_end: usize, } -fn collect_marker_tags(text: &str) -> Vec { - let mut markers = Vec::new(); - let mut search_from = 0; - while let Some(rel_pos) = text[search_from..].find(MARKER_TAG_PREFIX) { - let tag_start = search_from + rel_pos; - let num_start = tag_start + MARKER_TAG_PREFIX.len(); - if let Some(suffix_rel) = text[num_start..].find(MARKER_TAG_SUFFIX) { - let num_end = num_start + suffix_rel; - if let Ok(number) = text[num_start..num_end].parse::() { - let tag_end = num_end + MARKER_TAG_SUFFIX.len(); - markers.push(MarkerTag { - number, - tag_start, - tag_end, - }); - search_from = tag_end; - continue; - } - } - search_from = tag_start + MARKER_TAG_PREFIX.len(); - } - markers -} - -fn collect_relative_marker_tags(text: &str) -> Vec { - let mut markers = Vec::new(); +fn collect_tags(text: &str, prefix: &str, parse: fn(&str) -> Option) -> Vec { + let mut tags = Vec::new(); let mut search_from = 0; - while let Some(rel_pos) = text[search_from..].find(RELATIVE_MARKER_TAG_PREFIX) { + while let Some(rel_pos) = text[search_from..].find(prefix) { let tag_start = search_from + rel_pos; - let payload_start = tag_start + RELATIVE_MARKER_TAG_PREFIX.len(); + let payload_start = tag_start + prefix.len(); if let Some(suffix_rel) = text[payload_start..].find(MARKER_TAG_SUFFIX) { let payload_end = payload_start + suffix_rel; - let payload = &text[payload_start..payload_end]; - if let Ok(delta) = payload.parse::() { + if let Some(value) = parse(&text[payload_start..payload_end]) { let tag_end = payload_end + MARKER_TAG_SUFFIX.len(); - markers.push(RelativeMarkerTag { - delta, + tags.push(ParsedTag { + value, tag_start, tag_end, }); @@ -436,9 +432,21 @@ fn collect_relative_marker_tags(text: &str) -> Vec { continue; } } - search_from = tag_start + RELATIVE_MARKER_TAG_PREFIX.len(); + search_from = tag_start + prefix.len(); } - markers + tags +} + +fn collect_marker_tags(text: &str) -> Vec { + collect_tags(text, MARKER_TAG_PREFIX, |s| { + s.parse::().ok().map(|n| n as isize) + }) +} + +fn collect_relative_marker_tags(text: &str) -> Vec { + collect_tags(text, RELATIVE_MARKER_TAG_PREFIX, |s| { + s.parse::().ok() + }) } pub fn nearest_marker_number(cursor_offset: Option, marker_offsets: &[usize]) -> usize { @@ -459,21 +467,87 @@ fn cursor_block_index(cursor_offset: Option, marker_offsets: &[usize]) -> .unwrap_or_else(|| marker_offsets.len().saturating_sub(2)) } -/// Write the editable region content with V0317 byte-exact marker tags, where -/// marker numbers are relative to the cursor block. -pub fn write_editable_with_markers_v0317( +fn common_prefix_suffix(a: &[u8], b: &[u8]) -> (usize, usize) { + let prefix = a.iter().zip(b.iter()).take_while(|(x, y)| x == y).count(); + let remaining_a = a.len() - prefix; + let remaining_b = b.len() - prefix; + let max_suffix = remaining_a.min(remaining_b); + let suffix = a[a.len() - max_suffix..] + .iter() + .rev() + .zip(b[b.len() - max_suffix..].iter().rev()) + .take_while(|(x, y)| x == y) + .count(); + (prefix, suffix) +} + +/// Map a byte offset from old span coordinates to new span coordinates, +/// using common prefix/suffix within the span for accuracy. +fn map_boundary_offset( + old_rel: usize, + old_span_len: usize, + new_span_len: usize, + span_common_prefix: usize, + span_common_suffix: usize, +) -> usize { + if old_rel <= span_common_prefix { + old_rel + } else if old_rel >= old_span_len - span_common_suffix { + new_span_len - (old_span_len - old_rel) + } else { + let old_changed_start = span_common_prefix; + let old_changed_len = old_span_len + .saturating_sub(span_common_prefix) + .saturating_sub(span_common_suffix); + let new_changed_start = span_common_prefix; + let new_changed_len = new_span_len + .saturating_sub(span_common_prefix) + .saturating_sub(span_common_suffix); + + if old_changed_len == 0 { + new_changed_start + } else { + new_changed_start + ((old_rel - old_changed_start) * new_changed_len / old_changed_len) + } + } +} + +fn snap_to_line_start(text: &str, offset: usize) -> usize { + let bounded = offset.min(text.len()); + let bounded = text.floor_char_boundary(bounded); + + if bounded >= text.len() { + return text.len(); + } + + if bounded == 0 || text.as_bytes().get(bounded - 1) == Some(&b'\n') { + return bounded; + } + + if let Some(next_nl_rel) = text[bounded..].find('\n') { + let next = bounded + next_nl_rel + 1; + return text.floor_char_boundary(next.min(text.len())); + } + + let prev_start = text[..bounded].rfind('\n').map(|idx| idx + 1).unwrap_or(0); + text.floor_char_boundary(prev_start) +} + +/// Write the editable region content with byte-exact marker tags, inserting the +/// cursor marker at the given offset within the editable text. +/// +/// The `tag_for_index` closure maps a boundary index to the marker tag string. +fn write_editable_with_markers_impl( output: &mut String, editable_text: &str, cursor_offset_in_editable: usize, cursor_marker: &str, + marker_offsets: &[usize], + tag_for_index: impl Fn(usize) -> String, ) { - let marker_offsets = compute_marker_offsets(editable_text); - let anchor_idx = cursor_block_index(Some(cursor_offset_in_editable), &marker_offsets); let mut cursor_placed = false; - for (i, &offset) in marker_offsets.iter().enumerate() { - let marker_delta = i as isize - anchor_idx as isize; - output.push_str(&marker_tag_relative(marker_delta)); + output.push_str(&tag_for_index(i)); if let Some(&next_offset) = marker_offsets.get(i + 1) { let block = &editable_text[offset..next_offset]; @@ -493,11 +567,6 @@ pub fn write_editable_with_markers_v0317( } } -/// Write the editable region content with V0316 byte-exact marker tags. -/// -/// Unlike the V0306 version, markers are pure delimiters with no newline -/// padding. The content between markers is the exact bytes from the editable -/// text. pub fn write_editable_with_markers_v0316( output: &mut String, editable_text: &str, @@ -505,103 +574,93 @@ pub fn write_editable_with_markers_v0316( cursor_marker: &str, ) { let marker_offsets = compute_marker_offsets(editable_text); - let mut cursor_placed = false; - for (i, &offset) in marker_offsets.iter().enumerate() { - let marker_num = i + 1; - output.push_str(&marker_tag(marker_num)); + write_editable_with_markers_impl( + output, + editable_text, + cursor_offset_in_editable, + cursor_marker, + &marker_offsets, + |i| marker_tag(i + 1), + ); +} - if let Some(&next_offset) = marker_offsets.get(i + 1) { - let block = &editable_text[offset..next_offset]; - if !cursor_placed - && cursor_offset_in_editable >= offset - && cursor_offset_in_editable <= next_offset - { - cursor_placed = true; - let cursor_in_block = cursor_offset_in_editable - offset; - output.push_str(&block[..cursor_in_block]); - output.push_str(cursor_marker); - output.push_str(&block[cursor_in_block..]); - } else { - output.push_str(block); - } - } - } +pub fn write_editable_with_markers_v0317( + output: &mut String, + editable_text: &str, + cursor_offset_in_editable: usize, + cursor_marker: &str, +) { + let marker_offsets = compute_marker_offsets(editable_text); + let anchor_idx = cursor_block_index(Some(cursor_offset_in_editable), &marker_offsets); + write_editable_with_markers_impl( + output, + editable_text, + cursor_offset_in_editable, + cursor_marker, + &marker_offsets, + |i| marker_tag_relative(i as isize - anchor_idx as isize), + ); } -/// Parse V0316 model output and reconstruct the full new editable region. -/// -/// V0316 differences from V0306: -/// - No newline stripping or normalization (byte-exact content). -/// - The no-edit signal is `start_num == end_num` (any repeated marker). -/// - Intermediate marker tags are used for block-level extraction. -pub fn apply_marker_span_v0316(old_editable: &str, output: &str) -> Result { - let markers = collect_marker_tags(output); +pub fn write_editable_with_markers_v0318( + output: &mut String, + editable_text: &str, + cursor_offset_in_editable: usize, + cursor_marker: &str, +) { + let marker_offsets = compute_marker_offsets_v0318(editable_text); + write_editable_with_markers_impl( + output, + editable_text, + cursor_offset_in_editable, + cursor_marker, + &marker_offsets, + |i| marker_tag(i + 1), + ); +} - if markers.is_empty() { +/// Parse byte-exact model output and reconstruct the full new editable region. +/// +/// `resolve_boundary` maps a parsed tag value to an absolute byte offset in +/// old_editable, given the marker_offsets. Returns `(start_byte, end_byte)` or +/// an error. +fn apply_marker_span_impl( + old_editable: &str, + tags: &[ParsedTag], + output: &str, + resolve_boundaries: impl Fn(isize, isize) -> Result<(usize, usize)>, +) -> Result { + if tags.is_empty() { return Err(anyhow!("no marker tags found in output")); } - - if markers.len() == 1 { + if tags.len() == 1 { return Err(anyhow!( "only one marker tag found in output, expected at least two" )); } - let start_num = markers - .first() - .map(|marker| marker.number) - .context("missing first marker")?; - let end_num = markers - .last() - .map(|marker| marker.number) - .context("missing last marker")?; + let start_value = tags[0].value; + let end_value = tags[tags.len() - 1].value; - // No-edit signal: start_num == end_num - if start_num == end_num { + if start_value == end_value { return Ok(old_editable.to_string()); } - // Validate monotonically increasing with no gaps - let expected_nums: Vec = (start_num..=end_num).collect(); - let actual_nums: Vec = markers.iter().map(|m| m.number).collect(); - if actual_nums != expected_nums { - eprintln!( - "V0316 marker sequence validation failed: expected {:?}, got {:?}. Attempting best-effort parse.", - expected_nums, actual_nums - ); - } - - let marker_offsets = compute_marker_offsets(old_editable); - - let start_idx = start_num - .checked_sub(1) - .context("marker numbers are 1-indexed")?; - let end_idx = end_num - .checked_sub(1) - .context("marker numbers are 1-indexed")?; - - let start_byte = *marker_offsets - .get(start_idx) - .context("start marker number out of range")?; - let end_byte = *marker_offsets - .get(end_idx) - .context("end marker number out of range")?; + let (start_byte, end_byte) = resolve_boundaries(start_value, end_value)?; if start_byte > end_byte { return Err(anyhow!("start marker must come before end marker")); } - // Extract byte-exact content between consecutive markers let mut new_content = String::new(); - for i in 0..markers.len() - 1 { - let content_start = markers[i].tag_end; - let content_end = markers[i + 1].tag_start; + for i in 0..tags.len() - 1 { + let content_start = tags[i].tag_end; + let content_end = tags[i + 1].tag_start; if content_start <= content_end { new_content.push_str(&output[content_start..content_end]); } } - // Splice into old_editable let mut result = String::new(); result.push_str(&old_editable[..start_byte]); result.push_str(&new_content); @@ -610,123 +669,127 @@ pub fn apply_marker_span_v0316(old_editable: &str, output: &str) -> Result Result { + let tags = collect_marker_tags(output); + + // Validate monotonically increasing with no gaps (best-effort warning) + if tags.len() >= 2 { + let start_num = tags[0].value; + let end_num = tags[tags.len() - 1].value; + if start_num != end_num { + let expected: Vec = (start_num..=end_num).collect(); + let actual: Vec = tags.iter().map(|t| t.value).collect(); + if actual != expected { + eprintln!( + "V0316 marker sequence validation failed: expected {:?}, got {:?}. Attempting best-effort parse.", + expected, actual + ); + } + } + } + + let marker_offsets = compute_marker_offsets(old_editable); + apply_marker_span_impl(old_editable, &tags, output, |start_val, end_val| { + let start_idx = (start_val as usize) + .checked_sub(1) + .context("marker numbers are 1-indexed")?; + let end_idx = (end_val as usize) + .checked_sub(1) + .context("marker numbers are 1-indexed")?; + let start_byte = *marker_offsets + .get(start_idx) + .context("start marker number out of range")?; + let end_byte = *marker_offsets + .get(end_idx) + .context("end marker number out of range")?; + Ok((start_byte, end_byte)) + }) +} + pub fn apply_marker_span_v0317( old_editable: &str, output: &str, cursor_offset_in_old: Option, ) -> Result { - let markers = collect_relative_marker_tags(output); - - if markers.is_empty() { - return Err(anyhow!("no marker tags found in output")); - } - - if markers.len() == 1 { - return Err(anyhow!( - "only one marker tag found in output, expected at least two" - )); - } - + let tags = collect_relative_marker_tags(output); let marker_offsets = compute_marker_offsets(old_editable); let anchor_idx = cursor_block_index(cursor_offset_in_old, &marker_offsets); - let start_delta = markers - .first() - .map(|marker| marker.delta) - .context("missing first marker")?; - let end_delta = markers - .last() - .map(|marker| marker.delta) - .context("missing last marker")?; - - if start_delta == end_delta { - return Ok(old_editable.to_string()); - } - - let start_idx_isize = anchor_idx as isize + start_delta; - let end_idx_isize = anchor_idx as isize + end_delta; - if start_idx_isize < 0 || end_idx_isize < 0 { - return Err(anyhow!("relative marker maps before first marker")); - } - - let start_idx = usize::try_from(start_idx_isize).context("invalid start marker index")?; - let end_idx = usize::try_from(end_idx_isize).context("invalid end marker index")?; - - let start_byte = *marker_offsets - .get(start_idx) - .context("start marker number out of range")?; - let end_byte = *marker_offsets - .get(end_idx) - .context("end marker number out of range")?; - - if start_byte > end_byte { - return Err(anyhow!("start marker must come before end marker")); - } + apply_marker_span_impl(old_editable, &tags, output, |start_delta, end_delta| { + let start_idx_signed = anchor_idx as isize + start_delta; + let end_idx_signed = anchor_idx as isize + end_delta; + if start_idx_signed < 0 || end_idx_signed < 0 { + return Err(anyhow!("relative marker maps before first marker")); + } + let start_idx = usize::try_from(start_idx_signed).context("invalid start marker index")?; + let end_idx = usize::try_from(end_idx_signed).context("invalid end marker index")?; + let start_byte = *marker_offsets + .get(start_idx) + .context("start marker number out of range")?; + let end_byte = *marker_offsets + .get(end_idx) + .context("end marker number out of range")?; + Ok((start_byte, end_byte)) + }) +} - let mut new_content = String::new(); - for i in 0..markers.len() - 1 { - let content_start = markers[i].tag_end; - let content_end = markers[i + 1].tag_start; - if content_start <= content_end { - new_content.push_str(&output[content_start..content_end]); +pub fn apply_marker_span_v0318(old_editable: &str, output: &str) -> Result { + let tags = collect_marker_tags(output); + + if tags.len() >= 2 { + let start_num = tags[0].value; + let end_num = tags[tags.len() - 1].value; + if start_num != end_num { + let expected: Vec = (start_num..=end_num).collect(); + let actual: Vec = tags.iter().map(|t| t.value).collect(); + if actual != expected { + eprintln!( + "V0318 marker sequence validation failed: expected {:?}, got {:?}. Attempting best-effort parse.", + expected, actual + ); + } } } - let mut result = String::new(); - result.push_str(&old_editable[..start_byte]); - result.push_str(&new_content); - result.push_str(&old_editable[end_byte..]); - - Ok(result) + let marker_offsets = compute_marker_offsets_v0318(old_editable); + apply_marker_span_impl(old_editable, &tags, output, |start_val, end_val| { + let start_idx = (start_val as usize) + .checked_sub(1) + .context("marker numbers are 1-indexed")?; + let end_idx = (end_val as usize) + .checked_sub(1) + .context("marker numbers are 1-indexed")?; + let start_byte = *marker_offsets + .get(start_idx) + .context("start marker number out of range")?; + let end_byte = *marker_offsets + .get(end_idx) + .context("end marker number out of range")?; + Ok((start_byte, end_byte)) + }) } -/// Encode the V0316 training target from old and new editable text. +/// Encode the training target from old and new editable text. /// -/// V0316 differences from V0306: -/// - No-edit signal: `<|marker_C|><|marker_C|>{end_marker}` where C is nearest -/// to cursor. -/// - All intermediate markers are emitted with byte-exact content. -/// - No newline padding around marker tags. -pub fn encode_from_old_and_new_v0316( +/// Shared implementation for V0316, V0317, and V0318. The `tag_for_block_idx` +/// closure maps a block index to the appropriate marker tag string. +/// `no_edit_tag` is the marker tag to repeat when there are no edits. +fn encode_from_old_and_new_impl( old_editable: &str, new_editable: &str, cursor_offset_in_new: Option, cursor_marker: &str, end_marker: &str, + no_edit_tag: &str, + marker_offsets: &[usize], + tag_for_block_idx: impl Fn(usize) -> String, ) -> Result { - let marker_offsets = compute_marker_offsets(old_editable); - if old_editable == new_editable { - let marker_num = nearest_marker_number(cursor_offset_in_new, &marker_offsets); - let tag = marker_tag(marker_num); - return Ok(format!("{tag}{tag}{end_marker}")); + return Ok(format!("{no_edit_tag}{no_edit_tag}{end_marker}")); } - let common_prefix = old_editable - .bytes() - .zip(new_editable.bytes()) - .take_while(|(a, b)| a == b) - .count(); - - let old_remaining = old_editable.len() - common_prefix; - let new_remaining = new_editable.len() - common_prefix; - let max_suffix = old_remaining.min(new_remaining); - let common_suffix = old_editable.as_bytes()[old_editable.len() - max_suffix..] - .iter() - .rev() - .zip( - new_editable.as_bytes()[new_editable.len() - max_suffix..] - .iter() - .rev(), - ) - .take_while(|(a, b)| a == b) - .count(); - + let (common_prefix, common_suffix) = + common_prefix_suffix(old_editable.as_bytes(), new_editable.as_bytes()); let change_end_in_old = old_editable.len() - common_suffix; let start_marker_idx = marker_offsets @@ -749,40 +812,19 @@ pub fn encode_from_old_and_new_v0316( let new_span = &new_editable[new_start..new_end]; let old_span = &old_editable[old_start..old_end]; - // Compute common prefix/suffix within the span for accurate boundary mapping - let span_common_prefix = old_span - .bytes() - .zip(new_span.bytes()) - .take_while(|(a, b)| a == b) - .count(); - - let span_old_remaining = old_span.len() - span_common_prefix; - let span_new_remaining = new_span.len() - span_common_prefix; - let span_max_suffix = span_old_remaining.min(span_new_remaining); - let span_common_suffix = old_span.as_bytes()[old_span.len() - span_max_suffix..] - .iter() - .rev() - .zip( - new_span.as_bytes()[new_span.len() - span_max_suffix..] - .iter() - .rev(), - ) - .take_while(|(a, b)| a == b) - .count(); + let (span_common_prefix, span_common_suffix) = + common_prefix_suffix(old_span.as_bytes(), new_span.as_bytes()); let mut result = String::new(); let mut prev_new_rel = 0usize; let mut cursor_placed = false; for block_idx in start_marker_idx..end_marker_idx { - let marker_num = block_idx + 1; - result.push_str(&marker_tag(marker_num)); + result.push_str(&tag_for_block_idx(block_idx)); let new_rel_end = if block_idx + 1 == end_marker_idx { - // Last block: extends to end of new span new_span.len() } else { - // Map the intermediate boundary from old to new coordinates let old_rel = marker_offsets[block_idx + 1] - old_start; let mapped = map_boundary_offset( old_rel, @@ -791,13 +833,10 @@ pub fn encode_from_old_and_new_v0316( span_common_prefix, span_common_suffix, ); - // Ensure char boundary safety and monotonicity - new_span.floor_char_boundary(mapped) + snap_to_line_start(new_span, mapped) }; - // Ensure monotonicity (each block gets at least zero content) let new_rel_end = new_rel_end.max(prev_new_rel); - let block_content = &new_span[prev_new_rel..new_rel_end]; if !cursor_placed { @@ -821,19 +860,33 @@ pub fn encode_from_old_and_new_v0316( prev_new_rel = new_rel_end; } - // Final closing marker - let end_marker_num = end_marker_idx + 1; - result.push_str(&marker_tag(end_marker_num)); + result.push_str(&tag_for_block_idx(end_marker_idx)); result.push_str(end_marker); Ok(result) } -/// Encode the V0317 training target from old and new editable text. -/// -/// V0317 differences from V0316: -/// - Marker ids are relative to cursor block (..., -2, -1, 0, +1, +2, ...). -/// - No-edit signal: repeated cursor-relative marker. +pub fn encode_from_old_and_new_v0316( + old_editable: &str, + new_editable: &str, + cursor_offset_in_new: Option, + cursor_marker: &str, + end_marker: &str, +) -> Result { + let marker_offsets = compute_marker_offsets(old_editable); + let no_edit_tag = marker_tag(nearest_marker_number(cursor_offset_in_new, &marker_offsets)); + encode_from_old_and_new_impl( + old_editable, + new_editable, + cursor_offset_in_new, + cursor_marker, + end_marker, + &no_edit_tag, + &marker_offsets, + |block_idx| marker_tag(block_idx + 1), + ) +} + pub fn encode_from_old_and_new_v0317( old_editable: &str, new_editable: &str, @@ -843,157 +896,38 @@ pub fn encode_from_old_and_new_v0317( ) -> Result { let marker_offsets = compute_marker_offsets(old_editable); let anchor_idx = cursor_block_index(cursor_offset_in_new, &marker_offsets); - - if old_editable == new_editable { - let tag = marker_tag_relative(0); - return Ok(format!("{tag}{tag}{end_marker}")); - } - - let common_prefix = old_editable - .bytes() - .zip(new_editable.bytes()) - .take_while(|(a, b)| a == b) - .count(); - - let old_remaining = old_editable.len() - common_prefix; - let new_remaining = new_editable.len() - common_prefix; - let max_suffix = old_remaining.min(new_remaining); - let common_suffix = old_editable.as_bytes()[old_editable.len() - max_suffix..] - .iter() - .rev() - .zip( - new_editable.as_bytes()[new_editable.len() - max_suffix..] - .iter() - .rev(), - ) - .take_while(|(a, b)| a == b) - .count(); - - let change_end_in_old = old_editable.len() - common_suffix; - - let start_marker_idx = marker_offsets - .iter() - .rposition(|&offset| offset <= common_prefix) - .unwrap_or(0); - let end_marker_idx = marker_offsets - .iter() - .position(|&offset| offset >= change_end_in_old) - .unwrap_or(marker_offsets.len() - 1); - - let old_start = marker_offsets[start_marker_idx]; - let old_end = marker_offsets[end_marker_idx]; - - let new_start = old_start; - let new_end = new_editable - .len() - .saturating_sub(old_editable.len().saturating_sub(old_end)); - - let new_span = &new_editable[new_start..new_end]; - let old_span = &old_editable[old_start..old_end]; - - let span_common_prefix = old_span - .bytes() - .zip(new_span.bytes()) - .take_while(|(a, b)| a == b) - .count(); - - let span_old_remaining = old_span.len() - span_common_prefix; - let span_new_remaining = new_span.len() - span_common_prefix; - let span_max_suffix = span_old_remaining.min(span_new_remaining); - let span_common_suffix = old_span.as_bytes()[old_span.len() - span_max_suffix..] - .iter() - .rev() - .zip( - new_span.as_bytes()[new_span.len() - span_max_suffix..] - .iter() - .rev(), - ) - .take_while(|(a, b)| a == b) - .count(); - - let mut result = String::new(); - let mut prev_new_rel = 0usize; - let mut cursor_placed = false; - - for block_idx in start_marker_idx..end_marker_idx { - let marker_delta = block_idx as isize - anchor_idx as isize; - result.push_str(&marker_tag_relative(marker_delta)); - - let new_rel_end = if block_idx + 1 == end_marker_idx { - new_span.len() - } else { - let old_rel = marker_offsets[block_idx + 1] - old_start; - let mapped = map_boundary_offset( - old_rel, - old_span.len(), - new_span.len(), - span_common_prefix, - span_common_suffix, - ); - new_span.floor_char_boundary(mapped) - }; - - let new_rel_end = new_rel_end.max(prev_new_rel); - let block_content = &new_span[prev_new_rel..new_rel_end]; - - if !cursor_placed { - if let Some(cursor_offset) = cursor_offset_in_new { - let abs_start = new_start + prev_new_rel; - let abs_end = new_start + new_rel_end; - if cursor_offset >= abs_start && cursor_offset <= abs_end { - cursor_placed = true; - let cursor_in_block = cursor_offset - abs_start; - let bounded = cursor_in_block.min(block_content.len()); - result.push_str(&block_content[..bounded]); - result.push_str(cursor_marker); - result.push_str(&block_content[bounded..]); - prev_new_rel = new_rel_end; - continue; - } - } - } - - result.push_str(block_content); - prev_new_rel = new_rel_end; - } - - let end_marker_delta = end_marker_idx as isize - anchor_idx as isize; - result.push_str(&marker_tag_relative(end_marker_delta)); - result.push_str(end_marker); - - Ok(result) + let no_edit_tag = marker_tag_relative(0); + encode_from_old_and_new_impl( + old_editable, + new_editable, + cursor_offset_in_new, + cursor_marker, + end_marker, + &no_edit_tag, + &marker_offsets, + |block_idx| marker_tag_relative(block_idx as isize - anchor_idx as isize), + ) } -/// Map a byte offset from old span coordinates to new span coordinates, -/// using common prefix/suffix within the span for accuracy. -fn map_boundary_offset( - old_rel: usize, - old_span_len: usize, - new_span_len: usize, - span_common_prefix: usize, - span_common_suffix: usize, -) -> usize { - if old_rel <= span_common_prefix { - old_rel - } else if old_rel >= old_span_len - span_common_suffix { - new_span_len - (old_span_len - old_rel) - } else { - // Within the changed region: proportional mapping - let old_changed_start = span_common_prefix; - let old_changed_len = old_span_len - .saturating_sub(span_common_prefix) - .saturating_sub(span_common_suffix); - let new_changed_start = span_common_prefix; - let new_changed_len = new_span_len - .saturating_sub(span_common_prefix) - .saturating_sub(span_common_suffix); - - if old_changed_len == 0 { - new_changed_start - } else { - new_changed_start + ((old_rel - old_changed_start) * new_changed_len / old_changed_len) - } - } +pub fn encode_from_old_and_new_v0318( + old_editable: &str, + new_editable: &str, + cursor_offset_in_new: Option, + cursor_marker: &str, + end_marker: &str, +) -> Result { + let marker_offsets = compute_marker_offsets_v0318(old_editable); + let no_edit_tag = marker_tag(nearest_marker_number(cursor_offset_in_new, &marker_offsets)); + encode_from_old_and_new_impl( + old_editable, + new_editable, + cursor_offset_in_new, + cursor_marker, + end_marker, + &no_edit_tag, + &marker_offsets, + |block_idx| marker_tag(block_idx + 1), + ) } #[cfg(test)] @@ -1016,6 +950,88 @@ mod tests { assert_eq!(*offsets.last().unwrap(), text.len()); } + #[test] + fn test_compute_marker_offsets_blank_line_split_overrides_pending_hard_cap_boundary() { + let text = "\ +class OCRDataframe(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + + df: pl.DataFrame + + def page(self, page_number: int = 0) -> \"OCRDataframe\": + # Filter dataframe on specific page + df_page = self.df.filter(pl.col(\"page\") == page_number) + return OCRDataframe(df=df_page) + + def get_text_cell( + self, + cell: Cell, + margin: int = 0, + page_number: Optional[int] = None, + min_confidence: int = 50, + ) -> Optional[str]: + \"\"\" + Get text corresponding to cell +"; + let offsets = compute_marker_offsets(text); + + let def_start = text + .find(" def get_text_cell(") + .expect("def line exists"); + let self_start = text.find(" self,").expect("self line exists"); + + assert!( + offsets.contains(&def_start), + "expected boundary at def line start ({def_start}), got {offsets:?}" + ); + assert!( + !offsets.contains(&self_start), + "did not expect boundary at self line start ({self_start}), got {offsets:?}" + ); + } + + #[test] + fn test_compute_marker_offsets_blank_line_split_skips_closer_line() { + let text = "\ +impl Plugin for AhoySchedulePlugin { + fn build(&self, app: &mut App) { + app.configure_sets( + self.schedule, + ( + AhoySystems::MoveCharacters, + AhoySystems::ApplyForcesToDynamicRigidBodies, + ) + .chain() + .before(PhysicsSystems::First), + ); + + } +} + +/// System set used by all systems of `bevy_ahoy`. +#[derive(SystemSet, Debug, Clone, Copy, Hash, PartialEq, Eq)] +pub enum AhoySystems { + MoveCharacters, + ApplyForcesToDynamicRigidBodies, +} +"; + let offsets = compute_marker_offsets(text); + + let closer_start = text.find(" }\n").expect("closer line exists"); + let doc_start = text + .find("/// System set used by all systems of `bevy_ahoy`.") + .expect("doc line exists"); + + assert!( + !offsets.contains(&closer_start), + "did not expect boundary at closer line start ({closer_start}), got {offsets:?}" + ); + assert!( + offsets.contains(&doc_start), + "expected boundary at doc line start ({doc_start}), got {offsets:?}" + ); + } + #[test] fn test_compute_marker_offsets_max_lines_split() { let text = "1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n"; @@ -1023,12 +1039,152 @@ mod tests { assert!(offsets.len() >= 3, "offsets: {:?}", offsets); } + #[test] + fn test_compute_marker_offsets_hard_cap_nudges_past_closer_to_case_line() { + let text = "a1\na2\na3\na4\na5\na6\na7\na8\n}\ncase 'x': {\nbody\n"; + let offsets = compute_marker_offsets(text); + + let expected = text.find("case 'x': {").expect("case line exists"); + assert!( + offsets.contains(&expected), + "expected nudged boundary at case line start ({expected}), got {offsets:?}" + ); + } + + #[test] + fn test_compute_marker_offsets_hard_cap_nudge_respects_max_forward_lines() { + let text = "a1\na2\na3\na4\na5\na6\na7\na8\n}\n}\n}\n}\n}\ncase 'x': {\nbody\n"; + let offsets = compute_marker_offsets(text); + + let case_start = text.find("case 'x': {").expect("case line exists"); + assert!( + !offsets.contains(&case_start), + "boundary should not nudge beyond max forward lines; offsets: {offsets:?}" + ); + } + + #[test] + fn test_compute_marker_offsets_stay_sorted_when_hard_cap_boundary_nudges_forward() { + let text = "\ +aaaaaaaaaa = 1; +bbbbbbbbbb = 2; +cccccccccc = 3; +dddddddddd = 4; +eeeeeeeeee = 5; +ffffffffff = 6; +gggggggggg = 7; +hhhhhhhhhh = 8; + }; + }; + + grafanaDashboards = { + cluster-overview.spec = { + inherit instanceSelector; + folderRef = \"infrastructure\"; + json = builtins.readFile ./grafana/dashboards/cluster-overview.json; + }; + }; +"; + let offsets = compute_marker_offsets(text); + + assert_eq!(offsets.first().copied(), Some(0), "offsets: {offsets:?}"); + assert_eq!( + offsets.last().copied(), + Some(text.len()), + "offsets: {offsets:?}" + ); + assert!( + offsets.windows(2).all(|window| window[0] <= window[1]), + "offsets must be sorted: {offsets:?}" + ); + } + #[test] fn test_compute_marker_offsets_empty() { let offsets = compute_marker_offsets(""); assert_eq!(offsets, vec![0, 0]); } + #[test] + fn test_compute_marker_offsets_avoid_short_markdown_blocks() { + let text = "\ +# Spree Posts + +This is a Posts extension for [Spree Commerce](https://spreecommerce.org), built with Ruby on Rails. + +## Installation + +1. Add this extension to your Gemfile with this line: + + ```ruby + bundle add spree_posts + ``` + +2. Run the install generator + + ```ruby + bundle exec rails g spree_posts:install + ``` + +3. Restart your server + + If your server was running, restart it so that it can find the assets properly. + +## Developing + +1. Create a dummy app + + ```bash + bundle update + bundle exec rake test_app + ``` + +2. Add your new code +3. Run tests + + ```bash + bundle exec rspec + ``` + +When testing your applications integration with this extension you may use it's factories. +Simply add this require statement to your spec_helper: + +```ruby +require 'spree_posts/factories' +``` + +## Releasing a new version + +```shell +bundle exec gem bump -p -t +bundle exec gem release +``` + +For more options please see [gem-release README](https://github.com/svenfuchs/gem-release) + +## Contributing + +If you'd like to contribute, please take a look at the contributing guide. +"; + let offsets = compute_marker_offsets(text); + + assert_eq!(offsets.first().copied(), Some(0), "offsets: {offsets:?}"); + assert_eq!( + offsets.last().copied(), + Some(text.len()), + "offsets: {offsets:?}" + ); + + for window in offsets.windows(2) { + let block = &text[window[0]..window[1]]; + let line_count = block.lines().count(); + assert!( + line_count >= V0316_MIN_BLOCK_LINES, + "block too short: {line_count} lines in block {block:?} with offsets {offsets:?}" + ); + } + } + #[test] fn test_extract_marker_span() { let text = "<|marker_2|>\n new content\n<|marker_3|>\n"; @@ -1189,10 +1345,8 @@ mod tests { let editable = "aaa\nbbb\nccc\n"; let mut output = String::new(); write_editable_with_markers_v0316(&mut output, editable, 4, "<|user_cursor|>"); - // Should have marker tags with no extra newlines assert!(output.starts_with("<|marker_1|>")); assert!(output.contains("<|user_cursor|>")); - // Content should be byte-exact - no extra newlines added by markers let stripped = output.replace("<|user_cursor|>", ""); let stripped = strip_marker_tags(&stripped); assert_eq!(stripped, editable); @@ -1232,11 +1386,9 @@ mod tests { marker_offsets ); - // Build output spanning all blocks with new content let new_content = "LINE1\nLINE2\nLINE3\n\nLINE5\nLINE6\nLINE7\nLINE8\n"; let mut output = String::new(); output.push_str("<|marker_1|>"); - // Split new_content at old block boundaries for i in 0..marker_offsets.len() - 1 { if i > 0 { output.push_str(&marker_tag(i + 1)); @@ -1244,7 +1396,6 @@ mod tests { let start = marker_offsets[i]; let end = marker_offsets[i + 1]; let block_len = end - start; - // Use same length blocks from new content (they happen to be same length) output.push_str(&new_content[start..start + block_len]); } let last_marker_num = marker_offsets.len(); @@ -1256,10 +1407,8 @@ mod tests { #[test] fn test_apply_marker_span_v0316_byte_exact_no_normalization() { let old = "aaa\nbbb\nccc\n"; - // Content doesn't end with \n - should NOT be normalized let output = "<|marker_1|>aaa\nBBB\nccc<|marker_2|>"; let result = apply_marker_span_v0316(old, output).unwrap(); - // V0316 is byte-exact: the missing trailing \n is NOT added assert_eq!(result, "aaa\nBBB\nccc"); } @@ -1268,9 +1417,7 @@ mod tests { let old = "aaa\nbbb\nccc\n"; let result = encode_from_old_and_new_v0316(old, old, Some(5), "<|user_cursor|>", "<|end|>").unwrap(); - // Should be <|marker_K|><|marker_K|><|end|> where K is nearest to cursor assert!(result.ends_with("<|end|>")); - // Parse it and verify it's a no-edit let stripped = result.strip_suffix("<|end|>").unwrap(); let result_parsed = apply_marker_span_v0316(old, stripped).unwrap(); assert_eq!(result_parsed, old); @@ -1412,4 +1559,95 @@ mod tests { assert!(result.contains("<|user_cursor|>"), "result: {result}"); assert!(result.contains("<|marker-0|>"), "result: {result}"); } + + #[test] + fn test_compute_marker_offsets_v0318_uses_larger_block_sizes() { + let text = "l1\nl2\nl3\n\nl5\nl6\nl7\nl8\nl9\nl10\nl11\nl12\nl13\n"; + let v0316_offsets = compute_marker_offsets(text); + let v0318_offsets = compute_marker_offsets_v0318(text); + + assert!(v0318_offsets.len() < v0316_offsets.len()); + assert_eq!(v0316_offsets.first().copied(), Some(0)); + assert_eq!(v0318_offsets.first().copied(), Some(0)); + assert_eq!(v0316_offsets.last().copied(), Some(text.len())); + assert_eq!(v0318_offsets.last().copied(), Some(text.len())); + } + + #[test] + fn test_roundtrip_v0318() { + let old = "line1\nline2\nline3\n\nline5\nline6\nline7\nline8\nline9\nline10\n"; + let new = "line1\nline2\nline3\n\nline5\nLINE6\nline7\nline8\nline9\nline10\n"; + let encoded = + encode_from_old_and_new_v0318(old, new, None, "<|user_cursor|>", "<|end|>").unwrap(); + let stripped = encoded + .strip_suffix("<|end|>") + .expect("should have end marker"); + let reconstructed = apply_marker_span_v0318(old, stripped).unwrap(); + assert_eq!(reconstructed, new); + } + + #[test] + fn test_encode_v0317_markers_stay_on_line_boundaries() { + let old = "\ +\t\t\t\tcontinue outer; +\t\t\t} +\t\t} +\t} + +\tconst intersectionObserver = new IntersectionObserver((entries) => { +\t\tfor (const entry of entries) { +\t\t\tif (entry.isIntersecting) { +\t\t\t\tintersectionObserver.unobserve(entry.target); +\t\t\t\tanchorPreload(/** @type {HTMLAnchorElement} */ (entry.target)); +\t\t\t} +\t\t} +\t}); + +\tconst observer = new MutationObserver(() => { +\t\tconst links = /** @type {NodeListOf} */ ( +\t\t\tdocument.querySelectorAll('a[data-preload]') +\t\t); + +\t\tfor (const link of links) { +\t\t\tif (linkSet.has(link)) continue; +\t\t\tlinkSet.add(link); + +\t\t\tswitch (link.dataset.preload) { +\t\t\t\tcase '': +\t\t\t\tcase 'true': +\t\t\t\tcase 'hover': { +\t\t\t\t\tlink.addEventListener('mouseenter', function callback() { +\t\t\t\t\t\tlink.removeEventListener('mouseenter', callback); +\t\t\t\t\t\tanchorPreload(link); +\t\t\t\t\t}); +"; + let new = old.replacen( + "\t\t\t\tcase 'true':\n", + "\t\t\t\tcase 'TRUE':<|user_cursor|>\n", + 1, + ); + + let cursor_offset = new.find("<|user_cursor|>").expect("cursor marker in new"); + let new_without_cursor = new.replace("<|user_cursor|>", ""); + + let encoded = encode_from_old_and_new_v0317( + old, + &new_without_cursor, + Some(cursor_offset), + "<|user_cursor|>", + "<|end|>", + ) + .unwrap(); + + let core = encoded.strip_suffix("<|end|>").unwrap_or(&encoded); + for marker in collect_relative_marker_tags(core) { + let tag_start = marker.tag_start; + assert!( + tag_start == 0 || core.as_bytes()[tag_start - 1] == b'\n', + "marker not at line boundary: {} in output:\n{}", + marker_tag_relative(marker.value), + core + ); + } + } } diff --git a/crates/zeta_prompt/src/zeta_prompt.rs b/crates/zeta_prompt/src/zeta_prompt.rs index 2cc5322db0ce5b2e11002f06c036832357199d97..3ec90baf6e7d7781b5ddedb0af3dbdb0994cb3ad 100644 --- a/crates/zeta_prompt/src/zeta_prompt.rs +++ b/crates/zeta_prompt/src/zeta_prompt.rs @@ -91,6 +91,8 @@ pub enum ZetaFormat { V0306SeedMultiRegions, /// Byte-exact marker spans; all intermediate markers emitted; repeated marker means no-edit. V0316SeedMultiRegions, + /// V0316 with larger block sizes. + V0318SeedMultiRegions, /// V0316, but marker numbers are relative to the cursor block (e.g. -1, -0, +1). V0317SeedMultiRegions, } @@ -242,6 +244,18 @@ pub fn special_tokens_for_format(format: ZetaFormat) -> &'static [&'static str] ]; TOKENS } + ZetaFormat::V0318SeedMultiRegions => { + static TOKENS: &[&str] = &[ + seed_coder::FIM_SUFFIX, + seed_coder::FIM_PREFIX, + seed_coder::FIM_MIDDLE, + seed_coder::FILE_MARKER, + multi_region::V0318_END_MARKER, + CURSOR_MARKER, + multi_region::MARKER_TAG_PREFIX, + ]; + TOKENS + } ZetaFormat::V0317SeedMultiRegions => { static TOKENS: &[&str] = &[ seed_coder::FIM_SUFFIX, @@ -283,6 +297,7 @@ pub fn token_limits_for_format(format: ZetaFormat) -> (usize, usize) { | ZetaFormat::v0226Hashline | ZetaFormat::V0306SeedMultiRegions | ZetaFormat::V0316SeedMultiRegions + | ZetaFormat::V0318SeedMultiRegions | ZetaFormat::V0317SeedMultiRegions | ZetaFormat::V0304SeedNoEdits => (350, 150), ZetaFormat::V0304VariableEdit => (1024, 0), @@ -303,6 +318,7 @@ pub fn stop_tokens_for_format(format: ZetaFormat) -> &'static [&'static str] { | ZetaFormat::V0306SeedMultiRegions | ZetaFormat::V0304SeedNoEdits => &[], ZetaFormat::V0316SeedMultiRegions => &[multi_region::V0316_END_MARKER], + ZetaFormat::V0318SeedMultiRegions => &[multi_region::V0318_END_MARKER], ZetaFormat::V0317SeedMultiRegions => &[multi_region::V0317_END_MARKER], } } @@ -328,6 +344,7 @@ pub fn excerpt_ranges_for_format( | ZetaFormat::V0304SeedNoEdits | ZetaFormat::V0306SeedMultiRegions | ZetaFormat::V0316SeedMultiRegions + | ZetaFormat::V0318SeedMultiRegions | ZetaFormat::V0317SeedMultiRegions => ( ranges.editable_350.clone(), ranges.editable_350_context_150.clone(), @@ -419,6 +436,14 @@ pub fn write_cursor_excerpt_section_for_format( cursor_offset, )); } + ZetaFormat::V0318SeedMultiRegions => { + prompt.push_str(&build_v0318_cursor_prefix( + path, + context, + editable_range, + cursor_offset, + )); + } ZetaFormat::V0317SeedMultiRegions => { prompt.push_str(&build_v0317_cursor_prefix( path, @@ -486,6 +511,33 @@ fn build_v0316_cursor_prefix( section } +fn build_v0318_cursor_prefix( + path: &Path, + context: &str, + editable_range: &Range, + cursor_offset: usize, +) -> String { + let mut section = String::new(); + let path_str = path.to_string_lossy(); + write!(section, "{}{}\n", seed_coder::FILE_MARKER, path_str).ok(); + + section.push_str(&context[..editable_range.start]); + + let editable_text = &context[editable_range.clone()]; + let cursor_in_editable = cursor_offset - editable_range.start; + multi_region::write_editable_with_markers_v0318( + &mut section, + editable_text, + cursor_in_editable, + CURSOR_MARKER, + ); + + if !section.ends_with('\n') { + section.push('\n'); + } + section +} + fn build_v0317_cursor_prefix( path: &Path, context: &str, @@ -551,6 +603,7 @@ pub fn format_prompt_with_budget_for_format( | ZetaFormat::V0304SeedNoEdits | ZetaFormat::V0306SeedMultiRegions | ZetaFormat::V0316SeedMultiRegions + | ZetaFormat::V0318SeedMultiRegions | ZetaFormat::V0317SeedMultiRegions => { let mut cursor_section = String::new(); write_cursor_excerpt_section_for_format( @@ -649,6 +702,7 @@ pub fn max_edit_event_count_for_format(format: &ZetaFormat) -> usize { | ZetaFormat::V0304VariableEdit | ZetaFormat::V0306SeedMultiRegions | ZetaFormat::V0316SeedMultiRegions + | ZetaFormat::V0318SeedMultiRegions | ZetaFormat::V0317SeedMultiRegions => 6, } } @@ -671,6 +725,7 @@ pub fn get_prefill_for_format( ZetaFormat::V0304SeedNoEdits | ZetaFormat::V0306SeedMultiRegions | ZetaFormat::V0316SeedMultiRegions + | ZetaFormat::V0318SeedMultiRegions | ZetaFormat::V0317SeedMultiRegions => String::new(), } } @@ -684,6 +739,7 @@ pub fn output_end_marker_for_format(format: ZetaFormat) -> Option<&'static str> | ZetaFormat::V0304SeedNoEdits | ZetaFormat::V0306SeedMultiRegions => Some(seed_coder::END_MARKER), ZetaFormat::V0316SeedMultiRegions => Some(multi_region::V0316_END_MARKER), + ZetaFormat::V0318SeedMultiRegions => Some(multi_region::V0318_END_MARKER), ZetaFormat::V0317SeedMultiRegions => Some(multi_region::V0317_END_MARKER), ZetaFormat::V0112MiddleAtEnd | ZetaFormat::V0113Ordered @@ -727,6 +783,22 @@ pub fn encode_patch_as_output_for_format( Ok(None) } } + ZetaFormat::V0318SeedMultiRegions => { + let empty_patch = patch.lines().count() <= 3; + if empty_patch { + let marker_offsets = + multi_region::compute_marker_offsets_v0318(old_editable_region); + let marker_num = + multi_region::nearest_marker_number(cursor_offset, &marker_offsets); + let tag = multi_region::marker_tag(marker_num); + Ok(Some(format!( + "{tag}{tag}{}", + multi_region::V0318_END_MARKER + ))) + } else { + Ok(None) + } + } ZetaFormat::V0317SeedMultiRegions => { let empty_patch = patch.lines().count() <= 3; if empty_patch { @@ -797,6 +869,10 @@ pub fn parse_zeta2_model_output( editable_range_in_context, multi_region::apply_marker_span_v0316(old_editable_region, output)?, ), + ZetaFormat::V0318SeedMultiRegions => ( + editable_range_in_context, + multi_region::apply_marker_span_v0318(old_editable_region, output)?, + ), ZetaFormat::V0317SeedMultiRegions => ( editable_range_in_context, multi_region::apply_marker_span_v0317(