diff --git a/crates/edit_prediction/src/zeta2.rs b/crates/edit_prediction/src/zeta2.rs index bc24b4efa4bbbfeec73471b42eff518e0b9d3b2e..36f70c6d9a85a0e2ac840f3655e48fdab9166252 100644 --- a/crates/edit_prediction/src/zeta2.rs +++ b/crates/edit_prediction/src/zeta2.rs @@ -25,6 +25,7 @@ pub fn max_editable_tokens(format: ZetaFormat) -> usize { ZetaFormat::V0120GitMergeMarkers => 180, ZetaFormat::V0131GitMergeMarkersPrefix => 180, ZetaFormat::V0211Prefill => 180, + ZetaFormat::V0211SeedCoder => 180, } } diff --git a/crates/edit_prediction_cli/src/format_prompt.rs b/crates/edit_prediction_cli/src/format_prompt.rs index e2414f49c7b7228aded210e181c895a6956b2614..aaa5b2307f7f6df9a3e5a2c584d7d815ffb5cb53 100644 --- a/crates/edit_prediction_cli/src/format_prompt.rs +++ b/crates/edit_prediction_cli/src/format_prompt.rs @@ -158,7 +158,9 @@ pub fn zeta2_output_for_patch( } match version { - ZetaFormat::V0120GitMergeMarkers | ZetaFormat::V0131GitMergeMarkersPrefix => { + ZetaFormat::V0120GitMergeMarkers + | ZetaFormat::V0131GitMergeMarkersPrefix + | ZetaFormat::V0211SeedCoder => { if !result.ends_with('\n') { result.push('\n'); } diff --git a/crates/edit_prediction_cli/src/parse_output.rs b/crates/edit_prediction_cli/src/parse_output.rs index e5dc12d76aea2d1b9be9467515bc7ef178f71166..1eda4c94d6f78499eb185002a197107e373d5bb8 100644 --- a/crates/edit_prediction_cli/src/parse_output.rs +++ b/crates/edit_prediction_cli/src/parse_output.rs @@ -61,6 +61,10 @@ fn extract_zeta2_current_region(prompt: &str, format: ZetaFormat) -> Result ( + zeta_prompt::seed_coder::START_MARKER, + zeta_prompt::seed_coder::SEPARATOR, + ), }; let start = prompt.find(current_marker).with_context(|| { @@ -110,6 +114,7 @@ fn parse_zeta2_output( ZetaFormat::V0112MiddleAtEnd | ZetaFormat::V0113Ordered | ZetaFormat::V0114180EditableRegion => "", + ZetaFormat::V0211SeedCoder => zeta_prompt::seed_coder::END_MARKER, }; if !suffix.is_empty() { new_text = new_text diff --git a/crates/zeta_prompt/src/zeta_prompt.rs b/crates/zeta_prompt/src/zeta_prompt.rs index 24aa8dcbd9b96df4b40c334d26f99dc38b93a06f..407ed5f561080065fe5737e0a8b4b7c578284184 100644 --- a/crates/zeta_prompt/src/zeta_prompt.rs +++ b/crates/zeta_prompt/src/zeta_prompt.rs @@ -52,6 +52,7 @@ pub enum ZetaFormat { V0120GitMergeMarkers, V0131GitMergeMarkersPrefix, V0211Prefill, + V0211SeedCoder, } impl std::fmt::Display for ZetaFormat { @@ -156,6 +157,9 @@ pub fn clean_zeta2_model_output(output: &str, format: ZetaFormat) -> &str { ZetaFormat::V0131GitMergeMarkersPrefix => output .strip_suffix(v0131_git_merge_markers_prefix::END_MARKER) .unwrap_or(output), + ZetaFormat::V0211SeedCoder => output + .strip_suffix(seed_coder::END_MARKER) + .unwrap_or(output), _ => output, } } @@ -179,18 +183,28 @@ fn format_zeta_prompt_with_budget( ZetaFormat::V0131GitMergeMarkersPrefix | ZetaFormat::V0211Prefill => { v0131_git_merge_markers_prefix::write_cursor_excerpt_section(&mut cursor_section, input) } + ZetaFormat::V0211SeedCoder => { + return seed_coder::format_prompt_with_budget(input, max_tokens); + } } let cursor_tokens = estimate_tokens(cursor_section.len()); let budget_after_cursor = max_tokens.saturating_sub(cursor_tokens); - let edit_history_section = - format_edit_history_within_budget(&input.events, budget_after_cursor); + let edit_history_section = format_edit_history_within_budget( + &input.events, + "<|file_sep|>", + "edit history", + budget_after_cursor, + ); let edit_history_tokens = estimate_tokens(edit_history_section.len()); let budget_after_edit_history = budget_after_cursor.saturating_sub(edit_history_tokens); - let related_files_section = - format_related_files_within_budget(&input.related_files, budget_after_edit_history); + let related_files_section = format_related_files_within_budget( + &input.related_files, + "<|file_sep|>", + budget_after_edit_history, + ); let mut prompt = String::new(); prompt.push_str(&related_files_section); @@ -205,13 +219,19 @@ pub fn get_prefill(input: &ZetaPromptInput, format: ZetaFormat) -> String { | ZetaFormat::V0113Ordered | ZetaFormat::V0114180EditableRegion | ZetaFormat::V0120GitMergeMarkers - | ZetaFormat::V0131GitMergeMarkersPrefix => String::new(), + | ZetaFormat::V0131GitMergeMarkersPrefix + | ZetaFormat::V0211SeedCoder => String::new(), ZetaFormat::V0211Prefill => v0211_prefill::get_prefill(input), } } -fn format_edit_history_within_budget(events: &[Arc], max_tokens: usize) -> String { - let header = "<|file_sep|>edit history\n"; +fn format_edit_history_within_budget( + events: &[Arc], + file_marker: &str, + edit_history_name: &str, + max_tokens: usize, +) -> String { + let header = format!("{}{}\n", file_marker, edit_history_name); let header_tokens = estimate_tokens(header.len()); if header_tokens >= max_tokens { return String::new(); @@ -236,21 +256,25 @@ fn format_edit_history_within_budget(events: &[Arc], max_tokens: usize) - return String::new(); } - let mut result = String::from(header); + let mut result = header; for event_str in event_strings.iter().rev() { - result.push_str(&event_str); + result.push_str(event_str); } result } -fn format_related_files_within_budget(related_files: &[RelatedFile], max_tokens: usize) -> String { +fn format_related_files_within_budget( + related_files: &[RelatedFile], + file_marker: &str, + max_tokens: usize, +) -> String { let mut result = String::new(); let mut total_tokens = 0; for file in related_files { let path_str = file.path.to_string_lossy(); - let header_len = "<|file_sep|>".len() + path_str.len() + 1; - let header_tokens = estimate_tokens(header_len); + let header = format!("{}{}\n", file_marker, path_str); + let header_tokens = estimate_tokens(header.len()); if total_tokens + header_tokens > max_tokens { break; @@ -263,12 +287,8 @@ fn format_related_files_within_budget(related_files: &[RelatedFile], max_tokens: let needs_newline = !excerpt.text.ends_with('\n'); let needs_ellipsis = excerpt.row_range.end < file.max_row; let excerpt_len = excerpt.text.len() - + if needs_newline { "\n".len() } else { "".len() } - + if needs_ellipsis { - "...\n".len() - } else { - "".len() - }; + + if needs_newline { "\n".len() } else { 0 } + + if needs_ellipsis { "...\n".len() } else { 0 }; let excerpt_tokens = estimate_tokens(excerpt_len); if total_tokens + file_tokens + excerpt_tokens > max_tokens { @@ -280,7 +300,7 @@ fn format_related_files_within_budget(related_files: &[RelatedFile], max_tokens: if excerpts_to_include > 0 { total_tokens += file_tokens; - write!(result, "<|file_sep|>{}\n", path_str).ok(); + result.push_str(&header); for excerpt in file.excerpts.iter().take(excerpts_to_include) { result.push_str(&excerpt.text); if !result.ends_with('\n') { @@ -548,6 +568,130 @@ pub mod v0211_prefill { } } +pub mod seed_coder { + //! Seed-Coder prompt format using SPM (Suffix-Prefix-Middle) FIM mode. + //! + //! Seed-Coder uses different FIM tokens and order than Qwen: + //! - SPM order: suffix comes FIRST, then prefix, then middle + //! - Tokens: `<[fim-suffix]>`, `<[fim-prefix]>`, `<[fim-middle]>` + //! - File markers: StarCoder-style `path` (single token + path) + //! + //! All context (related files, edit history) goes in the PREFIX section. + //! The suffix contains only code after the editable region. + //! + //! Example prompt: + //! + //! <[fim-suffix]> + //! code after editable region + //! <[fim-prefix]>related/file.py + //! related file content + //! + //! edit_history + //! --- a/some_file.py + //! +++ b/some_file.py + //! -old + //! +new + //! + //! path/to/target_file.py + //! code before editable region + //! <<<<<<< CURRENT + //! code that + //! needs to<|user_cursor|> + //! be rewritten + //! ======= + //! <[fim-middle]> + //! + //! Expected output (model generates): + //! + //! updated + //! code with + //! changes applied + //! >>>>>>> UPDATED + + use super::*; + + pub const FIM_SUFFIX: &str = "<[fim-suffix]>"; + pub const FIM_PREFIX: &str = "<[fim-prefix]>"; + pub const FIM_MIDDLE: &str = "<[fim-middle]>"; + pub const FILE_MARKER: &str = ""; + + pub const START_MARKER: &str = "<<<<<<< CURRENT\n"; + pub const SEPARATOR: &str = "=======\n"; + pub const END_MARKER: &str = ">>>>>>> UPDATED\n"; + + pub fn format_prompt_with_budget(input: &ZetaPromptInput, max_tokens: usize) -> String { + let suffix_section = build_suffix_section(input); + let cursor_prefix_section = build_cursor_prefix_section(input); + + let suffix_tokens = estimate_tokens(suffix_section.len()); + let cursor_prefix_tokens = estimate_tokens(cursor_prefix_section.len()); + let budget_after_cursor = max_tokens.saturating_sub(suffix_tokens + cursor_prefix_tokens); + + let edit_history_section = super::format_edit_history_within_budget( + &input.events, + FILE_MARKER, + "edit_history", + budget_after_cursor, + ); + let edit_history_tokens = estimate_tokens(edit_history_section.len()); + let budget_after_edit_history = budget_after_cursor.saturating_sub(edit_history_tokens); + + let related_files_section = super::format_related_files_within_budget( + &input.related_files, + FILE_MARKER, + budget_after_edit_history, + ); + + let mut prompt = String::new(); + prompt.push_str(&suffix_section); + prompt.push_str(FIM_PREFIX); + prompt.push_str(&related_files_section); + if !related_files_section.is_empty() { + prompt.push('\n'); + } + prompt.push_str(&edit_history_section); + if !edit_history_section.is_empty() { + prompt.push('\n'); + } + prompt.push_str(&cursor_prefix_section); + prompt.push_str(FIM_MIDDLE); + prompt + } + + fn build_suffix_section(input: &ZetaPromptInput) -> String { + let mut section = String::new(); + section.push_str(FIM_SUFFIX); + section.push_str(&input.cursor_excerpt[input.editable_range_in_excerpt.end..]); + if !section.ends_with('\n') { + section.push('\n'); + } + section + } + + fn build_cursor_prefix_section(input: &ZetaPromptInput) -> String { + let mut section = String::new(); + let path_str = input.cursor_path.to_string_lossy(); + write!(section, "{}{}\n", FILE_MARKER, path_str).ok(); + + section.push_str(&input.cursor_excerpt[..input.editable_range_in_excerpt.start]); + section.push_str(START_MARKER); + section.push_str( + &input.cursor_excerpt + [input.editable_range_in_excerpt.start..input.cursor_offset_in_excerpt], + ); + section.push_str(CURSOR_MARKER); + section.push_str( + &input.cursor_excerpt + [input.cursor_offset_in_excerpt..input.editable_range_in_excerpt.end], + ); + if !section.ends_with('\n') { + section.push('\n'); + } + section.push_str(SEPARATOR); + section + } +} + /// The zeta1 prompt format pub mod zeta1 { pub const CURSOR_MARKER: &str = "<|user_cursor_is_here|>"; @@ -844,4 +988,122 @@ mod tests { "#} ); } + + fn format_seed_coder(input: &ZetaPromptInput) -> String { + format_zeta_prompt_with_budget(input, ZetaFormat::V0211SeedCoder, 10000) + } + + fn format_seed_coder_with_budget(input: &ZetaPromptInput, max_tokens: usize) -> String { + format_zeta_prompt_with_budget(input, ZetaFormat::V0211SeedCoder, max_tokens) + } + + #[test] + fn test_seed_coder_basic_format() { + let input = make_input( + "prefix\neditable\nsuffix", + 7..15, + 10, + vec![make_event("a.rs", "-old\n+new\n")], + vec![make_related_file("related.rs", "fn helper() {}\n")], + ); + + assert_eq!( + format_seed_coder(&input), + indoc! {r#" + <[fim-suffix]> + suffix + <[fim-prefix]>related.rs + fn helper() {} + + edit_history + --- a/a.rs + +++ b/a.rs + -old + +new + + test.rs + prefix + <<<<<<< CURRENT + edi<|user_cursor|>table + ======= + <[fim-middle]>"#} + ); + } + + #[test] + fn test_seed_coder_no_context() { + let input = make_input("before\nmiddle\nafter", 7..13, 10, vec![], vec![]); + + assert_eq!( + format_seed_coder(&input), + indoc! {r#" + <[fim-suffix]> + after + <[fim-prefix]>test.rs + before + <<<<<<< CURRENT + mid<|user_cursor|>dle + ======= + <[fim-middle]>"#} + ); + } + + #[test] + fn test_seed_coder_truncation_drops_context() { + let input = make_input( + "code", + 0..4, + 2, + vec![make_event("a.rs", "-x\n+y\n")], + vec![make_related_file("r1.rs", "content\n")], + ); + + // With large budget, everything is included + assert_eq!( + format_seed_coder(&input), + indoc! {r#" + <[fim-suffix]> + <[fim-prefix]>r1.rs + content + + edit_history + --- a/a.rs + +++ b/a.rs + -x + +y + + test.rs + <<<<<<< CURRENT + co<|user_cursor|>de + ======= + <[fim-middle]>"#} + ); + + // With tight budget, context is dropped but cursor section remains + assert_eq!( + format_seed_coder_with_budget(&input, 30), + indoc! {r#" + <[fim-suffix]> + <[fim-prefix]>test.rs + <<<<<<< CURRENT + co<|user_cursor|>de + ======= + <[fim-middle]>"#} + ); + } + + #[test] + fn test_seed_coder_clean_output() { + let output_with_marker = "new code\n>>>>>>> UPDATED\n"; + let output_without_marker = "new code\n"; + + assert_eq!( + clean_zeta2_model_output(output_with_marker, ZetaFormat::V0211SeedCoder), + "new code\n" + ); + assert_eq!( + clean_zeta2_model_output(output_without_marker, ZetaFormat::V0211SeedCoder), + "new code\n" + ); + } }