diff --git a/crates/edit_prediction/src/example_spec.rs b/crates/edit_prediction/src/example_spec.rs index 77cbb92b9eb6bbeb54cc76f888d6266fb2d5fafa..4486cde22c3429568bf29f152d0f5f2ded59e8f4 100644 --- a/crates/edit_prediction/src/example_spec.rs +++ b/crates/edit_prediction/src/example_spec.rs @@ -26,6 +26,14 @@ pub fn encode_cursor_in_patch(patch: &str, cursor_offset: Option) -> Stri let mut line_start_offset = 0usize; for line in patch.lines() { + if matches!( + DiffLine::parse(line), + DiffLine::Garbage(content) + if content.starts_with('#') && content.contains(CURSOR_POSITION_MARKER) + ) { + continue; + } + if !result.is_empty() { result.push('\n'); } @@ -846,6 +854,31 @@ mod tests { assert_eq!(results, vec![(clean_patch, None)]); } + #[test] + fn test_encode_cursor_in_patch_is_idempotent() { + let patch = indoc! {r#" + --- a/test.rs + +++ b/test.rs + @@ -1,2 +1,2 @@ + -fn old() {} + +fn new_name() {} + # ^[CURSOR_POSITION] + "#}; + + let cursor_offset = "fn new_name() {}".find("name").unwrap(); + let encoded_once = encode_cursor_in_patch(patch, Some(cursor_offset)); + let encoded_twice = encode_cursor_in_patch(&encoded_once, Some(cursor_offset)); + + assert_eq!(encoded_once, encoded_twice); + assert_eq!( + encoded_once + .lines() + .filter(|line| line.contains(CURSOR_POSITION_MARKER)) + .count(), + 1 + ); + } + #[test] fn test_from_markdown_accepted_prediction_marker() { let markdown = indoc! 
{r#" diff --git a/crates/edit_prediction_cli/src/format_prompt.rs b/crates/edit_prediction_cli/src/format_prompt.rs index 3a20fe0e9a5f89fa3325c1972721a836d60f7156..b0cfbd77ee543d4271cda0fb952f5ba48fc4a998 100644 --- a/crates/edit_prediction_cli/src/format_prompt.rs +++ b/crates/edit_prediction_cli/src/format_prompt.rs @@ -136,6 +136,34 @@ pub fn zeta2_output_for_patch( }, )?; + if version == ZetaFormat::V0317SeedMultiRegions { + let cursor_in_new = cursor_offset.map(|cursor_offset| { + let hunk_start = first_hunk_offset.unwrap_or(0); + result.floor_char_boundary((hunk_start + cursor_offset).min(result.len())) + }); + return multi_region::encode_from_old_and_new_v0317( + &old_editable_region, + &result, + cursor_in_new, + zeta_prompt::CURSOR_MARKER, + multi_region::V0317_END_MARKER, + ); + } + + if version == ZetaFormat::V0316SeedMultiRegions { + let cursor_in_new = cursor_offset.map(|cursor_offset| { + let hunk_start = first_hunk_offset.unwrap_or(0); + result.floor_char_boundary((hunk_start + cursor_offset).min(result.len())) + }); + return multi_region::encode_from_old_and_new_v0316( + &old_editable_region, + &result, + cursor_in_new, + zeta_prompt::CURSOR_MARKER, + multi_region::V0316_END_MARKER, + ); + } + if version == ZetaFormat::V0306SeedMultiRegions { let cursor_in_new = cursor_offset.map(|cursor_offset| { let hunk_start = first_hunk_offset.unwrap_or(0); diff --git a/crates/zeta_prompt/src/multi_region.rs b/crates/zeta_prompt/src/multi_region.rs index 1bac794b1d71fdf5ca8e086b748b8aa426bad1bd..a27a7245ae74824a086c9a39cc6d48d89f00d8b2 100644 --- a/crates/zeta_prompt/src/multi_region.rs +++ b/crates/zeta_prompt/src/multi_region.rs @@ -2,13 +2,26 @@ use anyhow::{Context as _, Result, anyhow}; pub const MARKER_TAG_PREFIX: &str = "<|marker_"; pub const MARKER_TAG_SUFFIX: &str = "|>"; +pub const RELATIVE_MARKER_TAG_PREFIX: &str = "<|marker"; const MIN_BLOCK_LINES: usize = 3; const MAX_BLOCK_LINES: usize = 8; +pub const V0316_END_MARKER: &str = 
"<[end▁of▁sentence]>"; +pub const V0317_END_MARKER: &str = "<[end▁of▁sentence]>"; pub fn marker_tag(number: usize) -> String { format!("{MARKER_TAG_PREFIX}{number}{MARKER_TAG_SUFFIX}") } +pub fn marker_tag_relative(delta: isize) -> String { + if delta > 0 { + format!("<|marker+{delta}|>") + } else if delta == 0 { + String::from("<|marker-0|>") + } else { + format!("<|marker{delta}|>") + } +} + /// Compute byte offsets within `editable_text` where marker boundaries should /// be placed. /// @@ -367,6 +380,622 @@ pub fn extract_editable_region_from_markers(text: &str) -> Option { Some(result) } +struct MarkerTag { + number: usize, + tag_start: usize, + tag_end: usize, +} + +struct RelativeMarkerTag { + delta: isize, + tag_start: usize, + tag_end: usize, +} + +fn collect_marker_tags(text: &str) -> Vec { + let mut markers = Vec::new(); + let mut search_from = 0; + while let Some(rel_pos) = text[search_from..].find(MARKER_TAG_PREFIX) { + let tag_start = search_from + rel_pos; + let num_start = tag_start + MARKER_TAG_PREFIX.len(); + if let Some(suffix_rel) = text[num_start..].find(MARKER_TAG_SUFFIX) { + let num_end = num_start + suffix_rel; + if let Ok(number) = text[num_start..num_end].parse::() { + let tag_end = num_end + MARKER_TAG_SUFFIX.len(); + markers.push(MarkerTag { + number, + tag_start, + tag_end, + }); + search_from = tag_end; + continue; + } + } + search_from = tag_start + MARKER_TAG_PREFIX.len(); + } + markers +} + +fn collect_relative_marker_tags(text: &str) -> Vec { + let mut markers = Vec::new(); + let mut search_from = 0; + while let Some(rel_pos) = text[search_from..].find(RELATIVE_MARKER_TAG_PREFIX) { + let tag_start = search_from + rel_pos; + let payload_start = tag_start + RELATIVE_MARKER_TAG_PREFIX.len(); + if let Some(suffix_rel) = text[payload_start..].find(MARKER_TAG_SUFFIX) { + let payload_end = payload_start + suffix_rel; + let payload = &text[payload_start..payload_end]; + if let Ok(delta) = payload.parse::() { + let tag_end = payload_end + 
/// Returns the 1-based number of the marker whose byte offset lies closest to
/// the cursor.
///
/// * `cursor_offset` - cursor byte offset; `None` is treated as offset 0.
/// * `marker_offsets` - byte offsets of the markers, in marker order.
///
/// When two markers are equally close the earlier one wins. An empty
/// `marker_offsets` yields 1.
pub fn nearest_marker_number(cursor_offset: Option<usize>, marker_offsets: &[usize]) -> usize {
    let cursor = cursor_offset.unwrap_or(0);
    marker_offsets
        .iter()
        .enumerate()
        // `abs_diff` stays in `usize`, so offsets above `isize::MAX` cannot
        // overflow the way a signed subtraction would.
        .min_by_key(|&(_, &offset)| offset.abs_diff(cursor))
        .map_or(1, |(idx, _)| idx + 1)
}
byte-exact marker tags. +/// +/// Unlike the V0306 version, markers are pure delimiters with no newline +/// padding. The content between markers is the exact bytes from the editable +/// text. +pub fn write_editable_with_markers_v0316( + output: &mut String, + editable_text: &str, + cursor_offset_in_editable: usize, + cursor_marker: &str, +) { + let marker_offsets = compute_marker_offsets(editable_text); + let mut cursor_placed = false; + for (i, &offset) in marker_offsets.iter().enumerate() { + let marker_num = i + 1; + output.push_str(&marker_tag(marker_num)); + + if let Some(&next_offset) = marker_offsets.get(i + 1) { + let block = &editable_text[offset..next_offset]; + if !cursor_placed + && cursor_offset_in_editable >= offset + && cursor_offset_in_editable <= next_offset + { + cursor_placed = true; + let cursor_in_block = cursor_offset_in_editable - offset; + output.push_str(&block[..cursor_in_block]); + output.push_str(cursor_marker); + output.push_str(&block[cursor_in_block..]); + } else { + output.push_str(block); + } + } + } +} + +/// Parse V0316 model output and reconstruct the full new editable region. +/// +/// V0316 differences from V0306: +/// - No newline stripping or normalization (byte-exact content). +/// - The no-edit signal is `start_num == end_num` (any repeated marker). +/// - Intermediate marker tags are used for block-level extraction. 
+pub fn apply_marker_span_v0316(old_editable: &str, output: &str) -> Result { + let markers = collect_marker_tags(output); + + if markers.is_empty() { + return Err(anyhow!("no marker tags found in output")); + } + + if markers.len() == 1 { + return Err(anyhow!( + "only one marker tag found in output, expected at least two" + )); + } + + let start_num = markers + .first() + .map(|marker| marker.number) + .context("missing first marker")?; + let end_num = markers + .last() + .map(|marker| marker.number) + .context("missing last marker")?; + + // No-edit signal: start_num == end_num + if start_num == end_num { + return Ok(old_editable.to_string()); + } + + // Validate monotonically increasing with no gaps + let expected_nums: Vec = (start_num..=end_num).collect(); + let actual_nums: Vec = markers.iter().map(|m| m.number).collect(); + if actual_nums != expected_nums { + eprintln!( + "V0316 marker sequence validation failed: expected {:?}, got {:?}. Attempting best-effort parse.", + expected_nums, actual_nums + ); + } + + let marker_offsets = compute_marker_offsets(old_editable); + + let start_idx = start_num + .checked_sub(1) + .context("marker numbers are 1-indexed")?; + let end_idx = end_num + .checked_sub(1) + .context("marker numbers are 1-indexed")?; + + let start_byte = *marker_offsets + .get(start_idx) + .context("start marker number out of range")?; + let end_byte = *marker_offsets + .get(end_idx) + .context("end marker number out of range")?; + + if start_byte > end_byte { + return Err(anyhow!("start marker must come before end marker")); + } + + // Extract byte-exact content between consecutive markers + let mut new_content = String::new(); + for i in 0..markers.len() - 1 { + let content_start = markers[i].tag_end; + let content_end = markers[i + 1].tag_start; + if content_start <= content_end { + new_content.push_str(&output[content_start..content_end]); + } + } + + // Splice into old_editable + let mut result = String::new(); + 
result.push_str(&old_editable[..start_byte]); + result.push_str(&new_content); + result.push_str(&old_editable[end_byte..]); + + Ok(result) +} + +/// Parse V0317 model output and reconstruct the full new editable region. +/// +/// V0317 differences from V0316: +/// - Marker ids are relative to the cursor block (e.g. -2, -1, 0, +1, +2). +/// - No-edit signal is any repeated relative marker tag. +pub fn apply_marker_span_v0317( + old_editable: &str, + output: &str, + cursor_offset_in_old: Option, +) -> Result { + let markers = collect_relative_marker_tags(output); + + if markers.is_empty() { + return Err(anyhow!("no marker tags found in output")); + } + + if markers.len() == 1 { + return Err(anyhow!( + "only one marker tag found in output, expected at least two" + )); + } + + let marker_offsets = compute_marker_offsets(old_editable); + let anchor_idx = cursor_block_index(cursor_offset_in_old, &marker_offsets); + + let start_delta = markers + .first() + .map(|marker| marker.delta) + .context("missing first marker")?; + let end_delta = markers + .last() + .map(|marker| marker.delta) + .context("missing last marker")?; + + if start_delta == end_delta { + return Ok(old_editable.to_string()); + } + + let start_idx_isize = anchor_idx as isize + start_delta; + let end_idx_isize = anchor_idx as isize + end_delta; + if start_idx_isize < 0 || end_idx_isize < 0 { + return Err(anyhow!("relative marker maps before first marker")); + } + + let start_idx = usize::try_from(start_idx_isize).context("invalid start marker index")?; + let end_idx = usize::try_from(end_idx_isize).context("invalid end marker index")?; + + let start_byte = *marker_offsets + .get(start_idx) + .context("start marker number out of range")?; + let end_byte = *marker_offsets + .get(end_idx) + .context("end marker number out of range")?; + + if start_byte > end_byte { + return Err(anyhow!("start marker must come before end marker")); + } + + let mut new_content = String::new(); + for i in 0..markers.len() - 1 { 
+ let content_start = markers[i].tag_end; + let content_end = markers[i + 1].tag_start; + if content_start <= content_end { + new_content.push_str(&output[content_start..content_end]); + } + } + + let mut result = String::new(); + result.push_str(&old_editable[..start_byte]); + result.push_str(&new_content); + result.push_str(&old_editable[end_byte..]); + + Ok(result) +} + +/// Encode the V0316 training target from old and new editable text. +/// +/// V0316 differences from V0306: +/// - No-edit signal: `<|marker_C|><|marker_C|>{end_marker}` where C is nearest +/// to cursor. +/// - All intermediate markers are emitted with byte-exact content. +/// - No newline padding around marker tags. +pub fn encode_from_old_and_new_v0316( + old_editable: &str, + new_editable: &str, + cursor_offset_in_new: Option, + cursor_marker: &str, + end_marker: &str, +) -> Result { + let marker_offsets = compute_marker_offsets(old_editable); + + if old_editable == new_editable { + let marker_num = nearest_marker_number(cursor_offset_in_new, &marker_offsets); + let tag = marker_tag(marker_num); + return Ok(format!("{tag}{tag}{end_marker}")); + } + + let common_prefix = old_editable + .bytes() + .zip(new_editable.bytes()) + .take_while(|(a, b)| a == b) + .count(); + + let old_remaining = old_editable.len() - common_prefix; + let new_remaining = new_editable.len() - common_prefix; + let max_suffix = old_remaining.min(new_remaining); + let common_suffix = old_editable.as_bytes()[old_editable.len() - max_suffix..] + .iter() + .rev() + .zip( + new_editable.as_bytes()[new_editable.len() - max_suffix..] 
+ .iter() + .rev(), + ) + .take_while(|(a, b)| a == b) + .count(); + + let change_end_in_old = old_editable.len() - common_suffix; + + let start_marker_idx = marker_offsets + .iter() + .rposition(|&offset| offset <= common_prefix) + .unwrap_or(0); + let end_marker_idx = marker_offsets + .iter() + .position(|&offset| offset >= change_end_in_old) + .unwrap_or(marker_offsets.len() - 1); + + let old_start = marker_offsets[start_marker_idx]; + let old_end = marker_offsets[end_marker_idx]; + + let new_start = old_start; + let new_end = new_editable + .len() + .saturating_sub(old_editable.len().saturating_sub(old_end)); + + let new_span = &new_editable[new_start..new_end]; + let old_span = &old_editable[old_start..old_end]; + + // Compute common prefix/suffix within the span for accurate boundary mapping + let span_common_prefix = old_span + .bytes() + .zip(new_span.bytes()) + .take_while(|(a, b)| a == b) + .count(); + + let span_old_remaining = old_span.len() - span_common_prefix; + let span_new_remaining = new_span.len() - span_common_prefix; + let span_max_suffix = span_old_remaining.min(span_new_remaining); + let span_common_suffix = old_span.as_bytes()[old_span.len() - span_max_suffix..] + .iter() + .rev() + .zip( + new_span.as_bytes()[new_span.len() - span_max_suffix..] 
+ .iter() + .rev(), + ) + .take_while(|(a, b)| a == b) + .count(); + + let mut result = String::new(); + let mut prev_new_rel = 0usize; + let mut cursor_placed = false; + + for block_idx in start_marker_idx..end_marker_idx { + let marker_num = block_idx + 1; + result.push_str(&marker_tag(marker_num)); + + let new_rel_end = if block_idx + 1 == end_marker_idx { + // Last block: extends to end of new span + new_span.len() + } else { + // Map the intermediate boundary from old to new coordinates + let old_rel = marker_offsets[block_idx + 1] - old_start; + let mapped = map_boundary_offset( + old_rel, + old_span.len(), + new_span.len(), + span_common_prefix, + span_common_suffix, + ); + // Ensure char boundary safety and monotonicity + new_span.floor_char_boundary(mapped) + }; + + // Ensure monotonicity (each block gets at least zero content) + let new_rel_end = new_rel_end.max(prev_new_rel); + + let block_content = &new_span[prev_new_rel..new_rel_end]; + + if !cursor_placed { + if let Some(cursor_offset) = cursor_offset_in_new { + let abs_start = new_start + prev_new_rel; + let abs_end = new_start + new_rel_end; + if cursor_offset >= abs_start && cursor_offset <= abs_end { + cursor_placed = true; + let cursor_in_block = cursor_offset - abs_start; + let bounded = cursor_in_block.min(block_content.len()); + result.push_str(&block_content[..bounded]); + result.push_str(cursor_marker); + result.push_str(&block_content[bounded..]); + prev_new_rel = new_rel_end; + continue; + } + } + } + + result.push_str(block_content); + prev_new_rel = new_rel_end; + } + + // Final closing marker + let end_marker_num = end_marker_idx + 1; + result.push_str(&marker_tag(end_marker_num)); + result.push_str(end_marker); + + Ok(result) +} + +/// Encode the V0317 training target from old and new editable text. +/// +/// V0317 differences from V0316: +/// - Marker ids are relative to cursor block (..., -2, -1, 0, +1, +2, ...). +/// - No-edit signal: repeated cursor-relative marker. 
+pub fn encode_from_old_and_new_v0317( + old_editable: &str, + new_editable: &str, + cursor_offset_in_new: Option, + cursor_marker: &str, + end_marker: &str, +) -> Result { + let marker_offsets = compute_marker_offsets(old_editable); + let anchor_idx = cursor_block_index(cursor_offset_in_new, &marker_offsets); + + if old_editable == new_editable { + let tag = marker_tag_relative(0); + return Ok(format!("{tag}{tag}{end_marker}")); + } + + let common_prefix = old_editable + .bytes() + .zip(new_editable.bytes()) + .take_while(|(a, b)| a == b) + .count(); + + let old_remaining = old_editable.len() - common_prefix; + let new_remaining = new_editable.len() - common_prefix; + let max_suffix = old_remaining.min(new_remaining); + let common_suffix = old_editable.as_bytes()[old_editable.len() - max_suffix..] + .iter() + .rev() + .zip( + new_editable.as_bytes()[new_editable.len() - max_suffix..] + .iter() + .rev(), + ) + .take_while(|(a, b)| a == b) + .count(); + + let change_end_in_old = old_editable.len() - common_suffix; + + let start_marker_idx = marker_offsets + .iter() + .rposition(|&offset| offset <= common_prefix) + .unwrap_or(0); + let end_marker_idx = marker_offsets + .iter() + .position(|&offset| offset >= change_end_in_old) + .unwrap_or(marker_offsets.len() - 1); + + let old_start = marker_offsets[start_marker_idx]; + let old_end = marker_offsets[end_marker_idx]; + + let new_start = old_start; + let new_end = new_editable + .len() + .saturating_sub(old_editable.len().saturating_sub(old_end)); + + let new_span = &new_editable[new_start..new_end]; + let old_span = &old_editable[old_start..old_end]; + + let span_common_prefix = old_span + .bytes() + .zip(new_span.bytes()) + .take_while(|(a, b)| a == b) + .count(); + + let span_old_remaining = old_span.len() - span_common_prefix; + let span_new_remaining = new_span.len() - span_common_prefix; + let span_max_suffix = span_old_remaining.min(span_new_remaining); + let span_common_suffix = 
old_span.as_bytes()[old_span.len() - span_max_suffix..] + .iter() + .rev() + .zip( + new_span.as_bytes()[new_span.len() - span_max_suffix..] + .iter() + .rev(), + ) + .take_while(|(a, b)| a == b) + .count(); + + let mut result = String::new(); + let mut prev_new_rel = 0usize; + let mut cursor_placed = false; + + for block_idx in start_marker_idx..end_marker_idx { + let marker_delta = block_idx as isize - anchor_idx as isize; + result.push_str(&marker_tag_relative(marker_delta)); + + let new_rel_end = if block_idx + 1 == end_marker_idx { + new_span.len() + } else { + let old_rel = marker_offsets[block_idx + 1] - old_start; + let mapped = map_boundary_offset( + old_rel, + old_span.len(), + new_span.len(), + span_common_prefix, + span_common_suffix, + ); + new_span.floor_char_boundary(mapped) + }; + + let new_rel_end = new_rel_end.max(prev_new_rel); + let block_content = &new_span[prev_new_rel..new_rel_end]; + + if !cursor_placed { + if let Some(cursor_offset) = cursor_offset_in_new { + let abs_start = new_start + prev_new_rel; + let abs_end = new_start + new_rel_end; + if cursor_offset >= abs_start && cursor_offset <= abs_end { + cursor_placed = true; + let cursor_in_block = cursor_offset - abs_start; + let bounded = cursor_in_block.min(block_content.len()); + result.push_str(&block_content[..bounded]); + result.push_str(cursor_marker); + result.push_str(&block_content[bounded..]); + prev_new_rel = new_rel_end; + continue; + } + } + } + + result.push_str(block_content); + prev_new_rel = new_rel_end; + } + + let end_marker_delta = end_marker_idx as isize - anchor_idx as isize; + result.push_str(&marker_tag_relative(end_marker_delta)); + result.push_str(end_marker); + + Ok(result) +} + +/// Map a byte offset from old span coordinates to new span coordinates, +/// using common prefix/suffix within the span for accuracy. 
/// Translates `old_rel`, a byte offset inside the old span, into the matching
/// byte offset inside the new span.
///
/// Offsets within the shared prefix map to themselves, offsets within the
/// shared suffix keep their distance from the end of the span, and offsets in
/// the changed middle are scaled by the ratio of the new to the old
/// changed-region length (integer division, rounding down).
fn map_boundary_offset(
    old_rel: usize,
    old_span_len: usize,
    new_span_len: usize,
    span_common_prefix: usize,
    span_common_suffix: usize,
) -> usize {
    // Shared prefix: both spans agree byte for byte up to here.
    if old_rel <= span_common_prefix {
        return old_rel;
    }

    // Shared suffix: preserve the distance to the end of the span.
    if old_rel >= old_span_len - span_common_suffix {
        let distance_from_end = old_span_len - old_rel;
        return new_span_len - distance_from_end;
    }

    // Changed middle: what is left of a span once prefix and suffix are removed.
    let changed_len = |span_len: usize| {
        span_len
            .saturating_sub(span_common_prefix)
            .saturating_sub(span_common_suffix)
    };
    let old_changed_len = changed_len(old_span_len);
    let new_changed_len = changed_len(new_span_len);

    match old_changed_len {
        0 => span_common_prefix,
        _ => {
            let into_changed = old_rel - span_common_prefix;
            span_common_prefix + into_changed * new_changed_len / old_changed_len
        }
    }
}
old); + } + + #[test] + fn test_apply_marker_span_v0316_no_edit_any_marker() { + let old = "aaa\nbbb\nccc\n"; + let output = "<|marker_2|>ignored content<|marker_2|>"; + let result = apply_marker_span_v0316(old, output).unwrap(); + assert_eq!(result, old); + } + + #[test] + fn test_apply_marker_span_v0316_multi_block() { + let old = "line1\nline2\nline3\n\nline5\nline6\nline7\nline8\n"; + let marker_offsets = compute_marker_offsets(old); + assert!( + marker_offsets.len() >= 3, + "expected at least 3 offsets, got {:?}", + marker_offsets + ); + + // Build output spanning all blocks with new content + let new_content = "LINE1\nLINE2\nLINE3\n\nLINE5\nLINE6\nLINE7\nLINE8\n"; + let mut output = String::new(); + output.push_str("<|marker_1|>"); + // Split new_content at old block boundaries + for i in 0..marker_offsets.len() - 1 { + if i > 0 { + output.push_str(&marker_tag(i + 1)); + } + let start = marker_offsets[i]; + let end = marker_offsets[i + 1]; + let block_len = end - start; + // Use same length blocks from new content (they happen to be same length) + output.push_str(&new_content[start..start + block_len]); + } + let last_marker_num = marker_offsets.len(); + output.push_str(&marker_tag(last_marker_num)); + let result = apply_marker_span_v0316(old, &output).unwrap(); + assert_eq!(result, new_content); + } + + #[test] + fn test_apply_marker_span_v0316_byte_exact_no_normalization() { + let old = "aaa\nbbb\nccc\n"; + // Content doesn't end with \n - should NOT be normalized + let output = "<|marker_1|>aaa\nBBB\nccc<|marker_2|>"; + let result = apply_marker_span_v0316(old, output).unwrap(); + // V0316 is byte-exact: the missing trailing \n is NOT added + assert_eq!(result, "aaa\nBBB\nccc"); + } + + #[test] + fn test_encode_v0316_no_edits() { + let old = "aaa\nbbb\nccc\n"; + let result = + encode_from_old_and_new_v0316(old, old, Some(5), "<|user_cursor|>", "<|end|>").unwrap(); + // Should be <|marker_K|><|marker_K|><|end|> where K is nearest to cursor + 
assert!(result.ends_with("<|end|>")); + // Parse it and verify it's a no-edit + let stripped = result.strip_suffix("<|end|>").unwrap(); + let result_parsed = apply_marker_span_v0316(old, stripped).unwrap(); + assert_eq!(result_parsed, old); + } + + #[test] + fn test_encode_v0316_with_change() { + let old = "aaa\nbbb\nccc\n"; + let new = "aaa\nBBB\nccc\n"; + let result = + encode_from_old_and_new_v0316(old, new, None, "<|user_cursor|>", "<|end|>").unwrap(); + assert!(result.contains("<|marker_1|>")); + assert!(result.contains("<|marker_2|>")); + assert!(result.ends_with("<|end|>")); + } + + #[test] + fn test_roundtrip_v0316() { + let old = "line1\nline2\nline3\n\nline5\nline6\nline7\nline8\nline9\nline10\n"; + let new = "line1\nline2\nline3\n\nline5\nLINE6\nline7\nline8\nline9\nline10\n"; + let encoded = + encode_from_old_and_new_v0316(old, new, None, "<|user_cursor|>", "<|end|>").unwrap(); + let stripped = encoded + .strip_suffix("<|end|>") + .expect("should have end marker"); + let reconstructed = apply_marker_span_v0316(old, stripped).unwrap(); + assert_eq!(reconstructed, new); + } + + #[test] + fn test_roundtrip_v0316_with_cursor() { + let old = "aaa\nbbb\nccc\n"; + let new = "aaa\nBBB\nccc\n"; + let result = + encode_from_old_and_new_v0316(old, new, Some(5), "<|user_cursor|>", "<|end|>").unwrap(); + assert!(result.contains("<|user_cursor|>"), "result: {result}"); + assert!(result.contains("B<|user_cursor|>BB"), "result: {result}"); + } + + #[test] + fn test_roundtrip_v0316_multi_block_change() { + let old = "line1\nline2\nline3\n\nline5\nline6\nline7\nline8\n"; + let new = "line1\nLINE2\nline3\n\nline5\nLINE6\nline7\nline8\n"; + let encoded = + encode_from_old_and_new_v0316(old, new, None, "<|user_cursor|>", "<|end|>").unwrap(); + let stripped = encoded + .strip_suffix("<|end|>") + .expect("should have end marker"); + let reconstructed = apply_marker_span_v0316(old, stripped).unwrap(); + assert_eq!(reconstructed, new); + } + + #[test] + fn 
test_nearest_marker_number() { + let offsets = vec![0, 10, 20, 30]; + assert_eq!(nearest_marker_number(Some(0), &offsets), 1); + assert_eq!(nearest_marker_number(Some(9), &offsets), 2); + assert_eq!(nearest_marker_number(Some(15), &offsets), 2); + assert_eq!(nearest_marker_number(Some(25), &offsets), 3); + assert_eq!(nearest_marker_number(Some(30), &offsets), 4); + assert_eq!(nearest_marker_number(None, &offsets), 1); + } + + #[test] + fn test_marker_tag_relative_formats_as_expected() { + assert_eq!(marker_tag_relative(-2), "<|marker-2|>"); + assert_eq!(marker_tag_relative(-1), "<|marker-1|>"); + assert_eq!(marker_tag_relative(0), "<|marker-0|>"); + assert_eq!(marker_tag_relative(1), "<|marker+1|>"); + assert_eq!(marker_tag_relative(2), "<|marker+2|>"); + } + + #[test] + fn test_write_editable_with_markers_v0317_includes_relative_markers_and_cursor() { + let editable = "aaa\nbbb\nccc\n"; + let mut output = String::new(); + write_editable_with_markers_v0317(&mut output, editable, 4, "<|user_cursor|>"); + + assert!(output.contains("<|marker-0|>")); + assert!(output.contains("<|user_cursor|>")); + + let stripped = output.replace("<|user_cursor|>", ""); + let stripped = + collect_relative_marker_tags(&stripped) + .iter() + .fold(stripped.clone(), |acc, marker| { + let tag = &stripped[marker.tag_start..marker.tag_end]; + acc.replace(tag, "") + }); + assert_eq!(stripped, editable); + } + + #[test] + fn test_apply_marker_span_v0317_basic() { + let old = "aaa\nbbb\nccc\n"; + let output = "<|marker-0|>aaa\nBBB\nccc\n<|marker+1|>"; + let result = apply_marker_span_v0317(old, output, Some(0)).unwrap(); + assert_eq!(result, "aaa\nBBB\nccc\n"); + } + + #[test] + fn test_apply_marker_span_v0317_no_edit() { + let old = "aaa\nbbb\nccc\n"; + let output = "<|marker-0|><|marker-0|>"; + let result = apply_marker_span_v0317(old, output, Some(0)).unwrap(); + assert_eq!(result, old); + } + + #[test] + fn test_encode_v0317_no_edits() { + let old = "aaa\nbbb\nccc\n"; + let result = + 
encode_from_old_and_new_v0317(old, old, Some(5), "<|user_cursor|>", "<|end|>").unwrap(); + assert_eq!(result, "<|marker-0|><|marker-0|><|end|>"); + } + + #[test] + fn test_roundtrip_v0317() { + let old = "line1\nline2\nline3\n\nline5\nline6\nline7\nline8\n"; + let new = "line1\nLINE2\nline3\n\nline5\nLINE6\nline7\nline8\n"; + let cursor = Some(6); + + let encoded = + encode_from_old_and_new_v0317(old, new, cursor, "<|user_cursor|>", "<|end|>").unwrap(); + let stripped = encoded + .strip_suffix("<|end|>") + .expect("should have end marker"); + let stripped = stripped.replace("<|user_cursor|>", ""); + let reconstructed = apply_marker_span_v0317(old, &stripped, cursor).unwrap(); + assert_eq!(reconstructed, new); + } + + #[test] + fn test_roundtrip_v0317_with_cursor_marker() { + let old = "aaa\nbbb\nccc\n"; + let new = "aaa\nBBB\nccc\n"; + let result = + encode_from_old_and_new_v0317(old, new, Some(5), "<|user_cursor|>", "<|end|>").unwrap(); + assert!(result.contains("<|user_cursor|>"), "result: {result}"); + assert!(result.contains("<|marker-0|>"), "result: {result}"); + } } diff --git a/crates/zeta_prompt/src/zeta_prompt.rs b/crates/zeta_prompt/src/zeta_prompt.rs index 0dce7764e7b9c451b4360fb2177d9d3e0eb7315b..f2bf994ae551b930aba873597dcdd210a6ad7c92 100644 --- a/crates/zeta_prompt/src/zeta_prompt.rs +++ b/crates/zeta_prompt/src/zeta_prompt.rs @@ -82,7 +82,12 @@ pub enum ZetaFormat { v0226Hashline, V0304VariableEdit, V0304SeedNoEdits, + /// Multi-block marker spans with NO_EDITS sentinel. V0306SeedMultiRegions, + /// Byte-exact marker spans; all intermediate markers emitted; repeated marker means no-edit. + V0316SeedMultiRegions, + /// V0316, but marker numbers are relative to the cursor block (e.g. -1, -0, +1). 
+ V0317SeedMultiRegions, } impl std::fmt::Display for ZetaFormat { @@ -220,6 +225,30 @@ pub fn special_tokens_for_format(format: ZetaFormat) -> &'static [&'static str] ZetaFormat::v0226Hashline => hashline::special_tokens(), ZetaFormat::V0304VariableEdit => v0304_variable_edit::special_tokens(), ZetaFormat::V0304SeedNoEdits => seed_coder::special_tokens(), + ZetaFormat::V0316SeedMultiRegions => { + static TOKENS: &[&str] = &[ + seed_coder::FIM_SUFFIX, + seed_coder::FIM_PREFIX, + seed_coder::FIM_MIDDLE, + seed_coder::FILE_MARKER, + multi_region::V0316_END_MARKER, + CURSOR_MARKER, + multi_region::MARKER_TAG_PREFIX, + ]; + TOKENS + } + ZetaFormat::V0317SeedMultiRegions => { + static TOKENS: &[&str] = &[ + seed_coder::FIM_SUFFIX, + seed_coder::FIM_PREFIX, + seed_coder::FIM_MIDDLE, + seed_coder::FILE_MARKER, + multi_region::V0317_END_MARKER, + CURSOR_MARKER, + multi_region::RELATIVE_MARKER_TAG_PREFIX, + ]; + TOKENS + } ZetaFormat::V0306SeedMultiRegions => { static TOKENS: &[&str] = &[ seed_coder::FIM_SUFFIX, @@ -248,6 +277,8 @@ pub fn token_limits_for_format(format: ZetaFormat) -> (usize, usize) { | ZetaFormat::V0211SeedCoder | ZetaFormat::v0226Hashline | ZetaFormat::V0306SeedMultiRegions + | ZetaFormat::V0316SeedMultiRegions + | ZetaFormat::V0317SeedMultiRegions | ZetaFormat::V0304SeedNoEdits => (350, 150), ZetaFormat::V0304VariableEdit => (1024, 0), } @@ -266,6 +297,8 @@ pub fn stop_tokens_for_format(format: ZetaFormat) -> &'static [&'static str] { | ZetaFormat::V0304VariableEdit | ZetaFormat::V0306SeedMultiRegions | ZetaFormat::V0304SeedNoEdits => &[], + ZetaFormat::V0316SeedMultiRegions => &[multi_region::V0316_END_MARKER], + ZetaFormat::V0317SeedMultiRegions => &[multi_region::V0317_END_MARKER], } } @@ -288,7 +321,9 @@ pub fn excerpt_ranges_for_format( | ZetaFormat::V0211SeedCoder | ZetaFormat::v0226Hashline | ZetaFormat::V0304SeedNoEdits - | ZetaFormat::V0306SeedMultiRegions => ( + | ZetaFormat::V0306SeedMultiRegions + | ZetaFormat::V0316SeedMultiRegions + | 
ZetaFormat::V0317SeedMultiRegions => ( ranges.editable_350.clone(), ranges.editable_350_context_150.clone(), ), @@ -371,6 +406,22 @@ pub fn write_cursor_excerpt_section_for_format( cursor_offset, )); } + ZetaFormat::V0316SeedMultiRegions => { + prompt.push_str(&build_v0316_cursor_prefix( + path, + context, + editable_range, + cursor_offset, + )); + } + ZetaFormat::V0317SeedMultiRegions => { + prompt.push_str(&build_v0317_cursor_prefix( + path, + context, + editable_range, + cursor_offset, + )); + } } } @@ -403,6 +454,60 @@ fn build_v0306_cursor_prefix( section } +fn build_v0316_cursor_prefix( + path: &Path, + context: &str, + editable_range: &Range, + cursor_offset: usize, +) -> String { + let mut section = String::new(); + let path_str = path.to_string_lossy(); + write!(section, "{}{}\n", seed_coder::FILE_MARKER, path_str).ok(); + + section.push_str(&context[..editable_range.start]); + + let editable_text = &context[editable_range.clone()]; + let cursor_in_editable = cursor_offset - editable_range.start; + multi_region::write_editable_with_markers_v0316( + &mut section, + editable_text, + cursor_in_editable, + CURSOR_MARKER, + ); + + if !section.ends_with('\n') { + section.push('\n'); + } + section +} + +fn build_v0317_cursor_prefix( + path: &Path, + context: &str, + editable_range: &Range, + cursor_offset: usize, +) -> String { + let mut section = String::new(); + let path_str = path.to_string_lossy(); + write!(section, "{}{}\n", seed_coder::FILE_MARKER, path_str).ok(); + + section.push_str(&context[..editable_range.start]); + + let editable_text = &context[editable_range.clone()]; + let cursor_in_editable = cursor_offset - editable_range.start; + multi_region::write_editable_with_markers_v0317( + &mut section, + editable_text, + cursor_in_editable, + CURSOR_MARKER, + ); + + if !section.ends_with('\n') { + section.push('\n'); + } + section +} + fn offset_range_to_row_range(text: &str, range: Range) -> Range { let start_row = 
text[0..range.start].matches('\n').count() as u32; let mut end_row = start_row + text[range.clone()].matches('\n').count() as u32; @@ -439,7 +544,9 @@ pub fn format_prompt_with_budget_for_format( let prompt = match format { ZetaFormat::V0211SeedCoder | ZetaFormat::V0304SeedNoEdits - | ZetaFormat::V0306SeedMultiRegions => { + | ZetaFormat::V0306SeedMultiRegions + | ZetaFormat::V0316SeedMultiRegions + | ZetaFormat::V0317SeedMultiRegions => { let mut cursor_section = String::new(); write_cursor_excerpt_section_for_format( format, @@ -533,7 +640,9 @@ pub fn max_edit_event_count_for_format(format: &ZetaFormat) -> usize { | ZetaFormat::v0226Hashline | ZetaFormat::V0304SeedNoEdits | ZetaFormat::V0304VariableEdit - | ZetaFormat::V0306SeedMultiRegions => 6, + | ZetaFormat::V0306SeedMultiRegions + | ZetaFormat::V0316SeedMultiRegions + | ZetaFormat::V0317SeedMultiRegions => 6, } } @@ -552,7 +661,10 @@ pub fn get_prefill_for_format( | ZetaFormat::V0211SeedCoder | ZetaFormat::v0226Hashline | ZetaFormat::V0304VariableEdit => String::new(), - ZetaFormat::V0304SeedNoEdits | ZetaFormat::V0306SeedMultiRegions => String::new(), + ZetaFormat::V0304SeedNoEdits + | ZetaFormat::V0306SeedMultiRegions + | ZetaFormat::V0316SeedMultiRegions + | ZetaFormat::V0317SeedMultiRegions => String::new(), } } @@ -564,6 +676,8 @@ pub fn output_end_marker_for_format(format: ZetaFormat) -> Option<&'static str> ZetaFormat::V0211SeedCoder | ZetaFormat::V0304SeedNoEdits | ZetaFormat::V0306SeedMultiRegions => Some(seed_coder::END_MARKER), + ZetaFormat::V0316SeedMultiRegions => Some(multi_region::V0316_END_MARKER), + ZetaFormat::V0317SeedMultiRegions => Some(multi_region::V0317_END_MARKER), ZetaFormat::V0112MiddleAtEnd | ZetaFormat::V0113Ordered | ZetaFormat::V0114180EditableRegion @@ -591,6 +705,33 @@ pub fn encode_patch_as_output_for_format( ZetaFormat::V0304SeedNoEdits | ZetaFormat::V0306SeedMultiRegions => { Ok(seed_coder::no_edits(patch)) } + ZetaFormat::V0316SeedMultiRegions => { + let empty_patch = 
patch.lines().count() <= 3; + if empty_patch { + let marker_offsets = multi_region::compute_marker_offsets(old_editable_region); + let marker_num = + multi_region::nearest_marker_number(cursor_offset, &marker_offsets); + let tag = multi_region::marker_tag(marker_num); + Ok(Some(format!( + "{tag}{tag}{}", + multi_region::V0316_END_MARKER + ))) + } else { + Ok(None) + } + } + ZetaFormat::V0317SeedMultiRegions => { + let empty_patch = patch.lines().count() <= 3; + if empty_patch { + let tag = multi_region::marker_tag_relative(0); + Ok(Some(format!( + "{tag}{tag}{}", + multi_region::V0317_END_MARKER + ))) + } else { + Ok(None) + } + } _ => Ok(None), } } @@ -613,10 +754,11 @@ pub fn parse_zeta2_model_output( None => output, }; - let (context, editable_range_in_context, context_range, _) = + let (context, editable_range_in_context, context_range, cursor_offset) = resolve_cursor_region(prompt_inputs, format); let context_start = context_range.start; let old_editable_region = &context[editable_range_in_context.clone()]; + let cursor_offset_in_editable = cursor_offset.saturating_sub(editable_range_in_context.start); let (range_in_context, output) = match format { ZetaFormat::v0226Hashline => ( @@ -644,6 +786,18 @@ pub fn parse_zeta2_model_output( multi_region::apply_marker_span(old_editable_region, output)? }, ), + ZetaFormat::V0316SeedMultiRegions => ( + editable_range_in_context, + multi_region::apply_marker_span_v0316(old_editable_region, output)?, + ), + ZetaFormat::V0317SeedMultiRegions => ( + editable_range_in_context, + multi_region::apply_marker_span_v0317( + old_editable_region, + output, + Some(cursor_offset_in_editable), + )?, + ), _ => (editable_range_in_context, output.to_string()), };