1pub mod excerpt_ranges;
2pub mod multi_region;
3pub mod udiff;
4
5use anyhow::{Result, anyhow};
6use serde::{Deserialize, Serialize};
7use std::fmt::Write;
8use std::ops::Range;
9use std::path::Path;
10use std::sync::Arc;
11use strum::{EnumIter, IntoEnumIterator as _, IntoStaticStr};
12
13pub use crate::excerpt_ranges::{
14 ExcerptRanges, compute_editable_and_context_ranges, compute_legacy_excerpt_ranges,
15};
16
/// Marker text for the cursor position. It is passed to the multi-region
/// writers/encoders, inserted at the cursor offset by `format_expected_output`,
/// and listed among the special tokens of the multi-region formats.
pub const CURSOR_MARKER: &str = "<|user_cursor|>";
/// Token budget that `format_zeta_prompt` passes to
/// `format_prompt_with_budget_for_format`.
pub const MAX_PROMPT_TOKENS: usize = 4096;

/// Use up to this amount of the editable region for prefill.
/// Larger values may result in more robust generation, but
/// this region becomes non-editable.
pub const PREFILL_RATIO: f64 = 0.1; // 10%
24
/// Rough token estimate for `bytes` bytes of text: one token per three bytes,
/// rounded down by integer division.
fn estimate_tokens(bytes: usize) -> usize {
    const BYTES_PER_TOKEN: usize = 3;
    bytes / BYTES_PER_TOKEN
}
28
/// Leave some slack to avoid overflow: returns 90% of `max_tokens`,
/// rounded down to a whole number of tokens.
fn apply_prompt_budget_margin(max_tokens: usize) -> usize {
    const USABLE_FRACTION: f64 = 0.9;
    let usable = max_tokens as f64 * USABLE_FRACTION;
    usable.floor() as usize
}
33
/// Everything needed to build a prompt (and to interpret the model output)
/// for a prediction at a cursor position.
#[derive(Clone, Debug, PartialEq, Hash, Serialize, Deserialize)]
pub struct ZetaPromptInput {
    /// Path of the file containing the cursor. It is written into the cursor
    /// section and compared against related-file paths when filtering
    /// redundant excerpts.
    pub cursor_path: Arc<Path>,
    /// Text around the cursor; the context and editable ranges index into it.
    pub cursor_excerpt: Arc<str>,
    /// Byte offset of the cursor within `cursor_excerpt`.
    pub cursor_offset_in_excerpt: usize,
    /// When present, this row is added to row ranges computed relative to
    /// `cursor_excerpt` before they are compared with related-file excerpts.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub excerpt_start_row: Option<u32>,
    /// Edit history events; the most recent ones are rendered into the prompt,
    /// subject to the per-format event limit and the token budget.
    pub events: Vec<Arc<Event>>,
    /// Related files to include in the prompt. `None` is treated as an empty list.
    #[serde(default)]
    pub related_files: Option<Vec<RelatedFile>>,
    // NOTE(review): not read by any code visible in this part of the file.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub active_buffer_diagnostics: Vec<ActiveBufferDiagnostic>,
    /// These ranges let the server select model-appropriate subsets.
    pub excerpt_ranges: ExcerptRanges,
    /// Byte offset ranges within `cursor_excerpt` for all syntax nodes that
    /// contain `cursor_offset_in_excerpt`, ordered from innermost to outermost.
    /// When present, the server uses these to compute editable/context ranges
    /// instead of `excerpt_ranges`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub syntax_ranges: Option<Vec<Range<usize>>>,
    /// The name of the edit prediction model experiment to use.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub experiment: Option<String>,
    // NOTE(review): the three fields below are not read by any code visible
    // in this part of the file; their meaning should be confirmed against
    // the producer/consumer of this struct.
    #[serde(default)]
    pub in_open_source_repo: bool,
    #[serde(default)]
    pub can_collect_data: bool,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub repo_url: Option<String>,
}
64
/// Identifies a prompt/output format version.
///
/// The variant name doubles as the format's string name (via `IntoStaticStr`),
/// which is what `Display` prints and what `ZetaFormat::parse` matches against.
/// The default is `V0131GitMergeMarkersPrefix`.
#[derive(
    Default,
    Clone,
    Copy,
    Debug,
    PartialEq,
    Eq,
    Hash,
    EnumIter,
    IntoStaticStr,
    Serialize,
    Deserialize,
)]
// Needed because the variant `v0226Hashline` starts with a lowercase letter.
#[allow(non_camel_case_types)]
pub enum ZetaFormat {
    V0112MiddleAtEnd,
    V0113Ordered,
    V0114180EditableRegion,
    V0120GitMergeMarkers,
    #[default]
    V0131GitMergeMarkersPrefix,
    V0211Prefill,
    V0211SeedCoder,
    v0226Hashline,
    V0304VariableEdit,
    V0304SeedNoEdits,
    /// Multi-block marker spans with NO_EDITS sentinel.
    V0306SeedMultiRegions,
    /// Byte-exact marker spans; all intermediate markers emitted; repeated marker means no-edit.
    V0316SeedMultiRegions,
    /// V0316 with larger block sizes.
    V0318SeedMultiRegions,
    /// V0316, but marker numbers are relative to the cursor block (e.g. -1, -0, +1).
    V0317SeedMultiRegions,
}
100
101impl std::fmt::Display for ZetaFormat {
102 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
103 write!(f, "{}", <&'static str>::from(self))
104 }
105}
106
107impl ZetaFormat {
108 pub fn parse(format_name: &str) -> Result<Self> {
109 let mut results = ZetaFormat::iter().filter(|version| {
110 <&'static str>::from(version)
111 .to_lowercase()
112 .contains(&format_name.to_lowercase())
113 });
114 let Some(result) = results.next() else {
115 anyhow::bail!(
116 "`{format_name}` did not match any of:\n{}",
117 Self::options_as_string()
118 );
119 };
120 if results.next().is_some() {
121 anyhow::bail!(
122 "`{format_name}` matched more than one of:\n{}",
123 Self::options_as_string()
124 );
125 }
126 Ok(result)
127 }
128
129 pub fn options_as_string() -> String {
130 ZetaFormat::iter()
131 .map(|format| format!("- {}\n", <&'static str>::from(format)))
132 .collect::<Vec<_>>()
133 .concat()
134 }
135}
136
/// An edit-history event. Serialized with an `"event"` tag field naming the variant.
#[derive(Clone, Debug, PartialEq, Hash, Serialize, Deserialize)]
#[serde(tag = "event")]
pub enum Event {
    /// A change to a buffer, rendered by `write_event` as a `--- a…`/`+++ b…`
    /// header followed by `diff`.
    BufferChange {
        // Written after `+++ b` by `write_event`.
        path: Arc<Path>,
        // Written after `--- a` by `write_event`.
        old_path: Arc<Path>,
        // Appended verbatim after the header lines.
        diff: String,
        // When true, `write_event` first emits a "// User accepted prediction:" line.
        predicted: bool,
        // Ignored by `write_event`; exposed through `Event::in_open_source_repo`.
        in_open_source_repo: bool,
    },
}
148
149impl Event {
150 pub fn in_open_source_repo(&self) -> bool {
151 match self {
152 Event::BufferChange {
153 in_open_source_repo,
154 ..
155 } => *in_open_source_repo,
156 }
157 }
158}
159
160pub fn write_event(prompt: &mut String, event: &Event) {
161 fn write_path_as_unix_str(prompt: &mut String, path: &Path) {
162 for component in path.components() {
163 prompt.push('/');
164 write!(prompt, "{}", component.as_os_str().display()).ok();
165 }
166 }
167 match event {
168 Event::BufferChange {
169 path,
170 old_path,
171 diff,
172 predicted,
173 in_open_source_repo: _,
174 } => {
175 if *predicted {
176 prompt.push_str("// User accepted prediction:\n");
177 }
178 prompt.push_str("--- a");
179 write_path_as_unix_str(prompt, old_path.as_ref());
180 prompt.push_str("\n+++ b");
181 write_path_as_unix_str(prompt, path.as_ref());
182 prompt.push('\n');
183 prompt.push_str(diff);
184 }
185 }
186}
187
/// Element type of `ZetaPromptInput::active_buffer_diagnostics`.
#[derive(Clone, Debug, PartialEq, Hash, Serialize, Deserialize)]
pub struct ActiveBufferDiagnostic {
    // NOTE(review): the numeric encoding of the severity is not visible in
    // this file — confirm against the producer.
    pub severity: Option<i32>,
    pub message: String,
    pub snippet: String,
    // presumably the buffer rows that `snippet` covers — verify against the caller
    pub snippet_buffer_row_range: Range<u32>,
    // presumably the diagnostic's offsets within `snippet` — verify against the caller
    pub diagnostic_range_in_snippet: Range<usize>,
}
196
/// A file related to the cursor location, with the excerpts to show from it.
#[derive(Clone, Debug, PartialEq, Hash, Serialize, Deserialize)]
pub struct RelatedFile {
    /// Path of the file; `filter_redundant_excerpts` compares it with the cursor path.
    pub path: Arc<Path>,
    // presumably the last row of the file, used to decide whether an excerpt
    // is followed by an ellipsis — verify against the caller of
    // `excerpt_rendered_tokens`
    pub max_row: u32,
    /// Excerpts from this file. `filter_redundant_excerpts` drops files whose
    /// excerpt list is empty.
    pub excerpts: Vec<RelatedExcerpt>,
    /// Defaults to `false` when missing from the serialized form.
    #[serde(default)]
    pub in_open_source_repo: bool,
}
205
/// One excerpt of a `RelatedFile`.
#[derive(Clone, Debug, PartialEq, Hash, Serialize, Deserialize)]
pub struct RelatedExcerpt {
    /// Rows covered by the excerpt. Compared against the cursor's row range in
    /// `filter_redundant_excerpts`, and its end against the file's max row in
    /// `excerpt_rendered_tokens`.
    pub row_range: Range<u32>,
    /// The excerpt text.
    pub text: Arc<str>,
    /// Copied into the excerpt candidates built by
    /// `format_related_files_within_budget`. Defaults to `0` when missing from
    /// the serialized form.
    #[serde(default)]
    pub order: usize,
}
213
214pub fn prompt_input_contains_special_tokens(input: &ZetaPromptInput, format: ZetaFormat) -> bool {
215 special_tokens_for_format(format).iter().any(|token| {
216 if let Some(line_token) = token.strip_suffix('\n') {
217 input.cursor_excerpt.lines().any(|line| line == line_token)
218 } else {
219 input.cursor_excerpt.contains(token)
220 }
221 })
222}
223
224pub fn format_zeta_prompt(input: &ZetaPromptInput, format: ZetaFormat) -> Option<String> {
225 format_prompt_with_budget_for_format(input, format, MAX_PROMPT_TOKENS)
226}
227
228pub fn special_tokens_for_format(format: ZetaFormat) -> &'static [&'static str] {
229 match format {
230 ZetaFormat::V0112MiddleAtEnd => v0112_middle_at_end::special_tokens(),
231 ZetaFormat::V0113Ordered => v0113_ordered::special_tokens(),
232 ZetaFormat::V0114180EditableRegion => v0114180_editable_region::special_tokens(),
233 ZetaFormat::V0120GitMergeMarkers => v0120_git_merge_markers::special_tokens(),
234 ZetaFormat::V0131GitMergeMarkersPrefix => v0131_git_merge_markers_prefix::special_tokens(),
235 ZetaFormat::V0211Prefill => v0211_prefill::special_tokens(),
236 ZetaFormat::V0211SeedCoder => seed_coder::special_tokens(),
237 ZetaFormat::v0226Hashline => hashline::special_tokens(),
238 ZetaFormat::V0304VariableEdit => v0304_variable_edit::special_tokens(),
239 ZetaFormat::V0304SeedNoEdits => seed_coder::special_tokens(),
240 ZetaFormat::V0316SeedMultiRegions => {
241 static TOKENS: &[&str] = &[
242 seed_coder::FIM_SUFFIX,
243 seed_coder::FIM_PREFIX,
244 seed_coder::FIM_MIDDLE,
245 seed_coder::FILE_MARKER,
246 multi_region::V0316_END_MARKER,
247 CURSOR_MARKER,
248 multi_region::MARKER_TAG_PREFIX,
249 ];
250 TOKENS
251 }
252 ZetaFormat::V0318SeedMultiRegions => {
253 static TOKENS: &[&str] = &[
254 seed_coder::FIM_SUFFIX,
255 seed_coder::FIM_PREFIX,
256 seed_coder::FIM_MIDDLE,
257 seed_coder::FILE_MARKER,
258 multi_region::V0318_END_MARKER,
259 CURSOR_MARKER,
260 multi_region::MARKER_TAG_PREFIX,
261 ];
262 TOKENS
263 }
264 ZetaFormat::V0317SeedMultiRegions => {
265 static TOKENS: &[&str] = &[
266 seed_coder::FIM_SUFFIX,
267 seed_coder::FIM_PREFIX,
268 seed_coder::FIM_MIDDLE,
269 seed_coder::FILE_MARKER,
270 multi_region::V0317_END_MARKER,
271 CURSOR_MARKER,
272 multi_region::RELATIVE_MARKER_TAG_PREFIX,
273 ];
274 TOKENS
275 }
276 ZetaFormat::V0306SeedMultiRegions => {
277 static TOKENS: &[&str] = &[
278 seed_coder::FIM_SUFFIX,
279 seed_coder::FIM_PREFIX,
280 seed_coder::FIM_MIDDLE,
281 seed_coder::FILE_MARKER,
282 seed_coder::START_MARKER,
283 seed_coder::SEPARATOR,
284 seed_coder::END_MARKER,
285 CURSOR_MARKER,
286 multi_region::MARKER_TAG_PREFIX,
287 ];
288 TOKENS
289 }
290 }
291}
292
293/// Returns the (editable_token_limit, context_token_limit) for a given format.
294pub fn token_limits_for_format(format: ZetaFormat) -> (usize, usize) {
295 match format {
296 ZetaFormat::V0112MiddleAtEnd | ZetaFormat::V0113Ordered => (150, 350),
297 ZetaFormat::V0114180EditableRegion => (180, 350),
298 ZetaFormat::V0120GitMergeMarkers
299 | ZetaFormat::V0131GitMergeMarkersPrefix
300 | ZetaFormat::V0211Prefill
301 | ZetaFormat::V0211SeedCoder
302 | ZetaFormat::v0226Hashline
303 | ZetaFormat::V0306SeedMultiRegions
304 | ZetaFormat::V0316SeedMultiRegions
305 | ZetaFormat::V0318SeedMultiRegions
306 | ZetaFormat::V0317SeedMultiRegions
307 | ZetaFormat::V0304SeedNoEdits => (350, 150),
308 ZetaFormat::V0304VariableEdit => (1024, 0),
309 }
310}
311
312pub fn stop_tokens_for_format(format: ZetaFormat) -> &'static [&'static str] {
313 match format {
314 ZetaFormat::v0226Hashline => &[hashline::NO_EDITS_COMMAND_MARKER],
315 ZetaFormat::V0112MiddleAtEnd
316 | ZetaFormat::V0113Ordered
317 | ZetaFormat::V0114180EditableRegion
318 | ZetaFormat::V0120GitMergeMarkers
319 | ZetaFormat::V0131GitMergeMarkersPrefix
320 | ZetaFormat::V0211Prefill
321 | ZetaFormat::V0211SeedCoder
322 | ZetaFormat::V0304VariableEdit
323 | ZetaFormat::V0306SeedMultiRegions
324 | ZetaFormat::V0304SeedNoEdits => &[],
325 ZetaFormat::V0316SeedMultiRegions => &[multi_region::V0316_END_MARKER],
326 ZetaFormat::V0318SeedMultiRegions => &[multi_region::V0318_END_MARKER],
327 ZetaFormat::V0317SeedMultiRegions => &[multi_region::V0317_END_MARKER],
328 }
329}
330
331pub fn excerpt_ranges_for_format(
332 format: ZetaFormat,
333 ranges: &ExcerptRanges,
334) -> (Range<usize>, Range<usize>) {
335 match format {
336 ZetaFormat::V0112MiddleAtEnd | ZetaFormat::V0113Ordered => (
337 ranges.editable_150.clone(),
338 ranges.editable_150_context_350.clone(),
339 ),
340 ZetaFormat::V0114180EditableRegion => (
341 ranges.editable_180.clone(),
342 ranges.editable_180_context_350.clone(),
343 ),
344 ZetaFormat::V0120GitMergeMarkers
345 | ZetaFormat::V0131GitMergeMarkersPrefix
346 | ZetaFormat::V0211Prefill
347 | ZetaFormat::V0211SeedCoder
348 | ZetaFormat::v0226Hashline
349 | ZetaFormat::V0304SeedNoEdits
350 | ZetaFormat::V0306SeedMultiRegions
351 | ZetaFormat::V0316SeedMultiRegions
352 | ZetaFormat::V0318SeedMultiRegions
353 | ZetaFormat::V0317SeedMultiRegions => (
354 ranges.editable_350.clone(),
355 ranges.editable_350_context_150.clone(),
356 ),
357 ZetaFormat::V0304VariableEdit => {
358 let context = ranges
359 .editable_350_context_1024
360 .clone()
361 .or(ranges.editable_350_context_512.clone())
362 .unwrap_or_else(|| ranges.editable_350_context_150.clone());
363 (context.clone(), context)
364 }
365 }
366}
367
368pub fn write_cursor_excerpt_section_for_format(
369 format: ZetaFormat,
370 prompt: &mut String,
371 path: &Path,
372 context: &str,
373 editable_range: &Range<usize>,
374 cursor_offset: usize,
375) {
376 match format {
377 ZetaFormat::V0112MiddleAtEnd => v0112_middle_at_end::write_cursor_excerpt_section(
378 prompt,
379 path,
380 context,
381 editable_range,
382 cursor_offset,
383 ),
384 ZetaFormat::V0113Ordered | ZetaFormat::V0114180EditableRegion => {
385 v0113_ordered::write_cursor_excerpt_section(
386 prompt,
387 path,
388 context,
389 editable_range,
390 cursor_offset,
391 )
392 }
393 ZetaFormat::V0120GitMergeMarkers => v0120_git_merge_markers::write_cursor_excerpt_section(
394 prompt,
395 path,
396 context,
397 editable_range,
398 cursor_offset,
399 ),
400 ZetaFormat::V0131GitMergeMarkersPrefix | ZetaFormat::V0211Prefill => {
401 v0131_git_merge_markers_prefix::write_cursor_excerpt_section(
402 prompt,
403 path,
404 context,
405 editable_range,
406 cursor_offset,
407 )
408 }
409 ZetaFormat::V0211SeedCoder | ZetaFormat::V0304SeedNoEdits => {
410 seed_coder::write_cursor_excerpt_section(
411 prompt,
412 path,
413 context,
414 editable_range,
415 cursor_offset,
416 )
417 }
418 ZetaFormat::v0226Hashline => hashline::write_cursor_excerpt_section(
419 prompt,
420 path,
421 context,
422 editable_range,
423 cursor_offset,
424 ),
425 ZetaFormat::V0304VariableEdit => {
426 v0304_variable_edit::write_cursor_excerpt_section(prompt, path, context, cursor_offset)
427 }
428 ZetaFormat::V0306SeedMultiRegions => {
429 prompt.push_str(&build_v0306_cursor_prefix(
430 path,
431 context,
432 editable_range,
433 cursor_offset,
434 ));
435 }
436 ZetaFormat::V0316SeedMultiRegions => {
437 prompt.push_str(&build_v0316_cursor_prefix(
438 path,
439 context,
440 editable_range,
441 cursor_offset,
442 ));
443 }
444 ZetaFormat::V0318SeedMultiRegions => {
445 prompt.push_str(&build_v0318_cursor_prefix(
446 path,
447 context,
448 editable_range,
449 cursor_offset,
450 ));
451 }
452 ZetaFormat::V0317SeedMultiRegions => {
453 prompt.push_str(&build_v0317_cursor_prefix(
454 path,
455 context,
456 editable_range,
457 cursor_offset,
458 ));
459 }
460 }
461}
462
463fn build_v0306_cursor_prefix(
464 path: &Path,
465 context: &str,
466 editable_range: &Range<usize>,
467 cursor_offset: usize,
468) -> String {
469 let mut section = String::new();
470 let path_str = path.to_string_lossy();
471 write!(section, "{}{}\n", seed_coder::FILE_MARKER, path_str).ok();
472
473 section.push_str(&context[..editable_range.start]);
474 section.push_str(seed_coder::START_MARKER);
475
476 let editable_text = &context[editable_range.clone()];
477 let cursor_in_editable = cursor_offset - editable_range.start;
478 multi_region::write_editable_with_markers(
479 &mut section,
480 editable_text,
481 cursor_in_editable,
482 CURSOR_MARKER,
483 );
484
485 if !section.ends_with('\n') {
486 section.push('\n');
487 }
488 section.push_str(seed_coder::SEPARATOR);
489 section
490}
491
492fn build_v0316_cursor_prefix(
493 path: &Path,
494 context: &str,
495 editable_range: &Range<usize>,
496 cursor_offset: usize,
497) -> String {
498 let mut section = String::new();
499 let path_str = path.to_string_lossy();
500 write!(section, "{}{}\n", seed_coder::FILE_MARKER, path_str).ok();
501
502 section.push_str(&context[..editable_range.start]);
503
504 let editable_text = &context[editable_range.clone()];
505 let cursor_in_editable = cursor_offset - editable_range.start;
506 multi_region::write_editable_with_markers_v0316(
507 &mut section,
508 editable_text,
509 cursor_in_editable,
510 CURSOR_MARKER,
511 );
512
513 if !section.ends_with('\n') {
514 section.push('\n');
515 }
516 section
517}
518
519fn build_v0318_cursor_prefix(
520 path: &Path,
521 context: &str,
522 editable_range: &Range<usize>,
523 cursor_offset: usize,
524) -> String {
525 let mut section = String::new();
526 let path_str = path.to_string_lossy();
527 write!(section, "{}{}\n", seed_coder::FILE_MARKER, path_str).ok();
528
529 section.push_str(&context[..editable_range.start]);
530
531 let editable_text = &context[editable_range.clone()];
532 let cursor_in_editable = cursor_offset - editable_range.start;
533 multi_region::write_editable_with_markers_v0318(
534 &mut section,
535 editable_text,
536 cursor_in_editable,
537 CURSOR_MARKER,
538 );
539
540 if !section.ends_with('\n') {
541 section.push('\n');
542 }
543 section
544}
545
546fn build_v0317_cursor_prefix(
547 path: &Path,
548 context: &str,
549 editable_range: &Range<usize>,
550 cursor_offset: usize,
551) -> String {
552 let mut section = String::new();
553 let path_str = path.to_string_lossy();
554 write!(section, "{}{}\n", seed_coder::FILE_MARKER, path_str).ok();
555
556 section.push_str(&context[..editable_range.start]);
557
558 let editable_text = &context[editable_range.clone()];
559 let cursor_in_editable = cursor_offset - editable_range.start;
560 multi_region::write_editable_with_markers_v0317(
561 &mut section,
562 editable_text,
563 cursor_in_editable,
564 CURSOR_MARKER,
565 );
566
567 if !section.ends_with('\n') {
568 section.push('\n');
569 }
570 section
571}
572
/// Converts a byte-offset range within `text` into a row range.
///
/// The start row is the number of newlines before `range.start`. The end row
/// is exclusive: it adds the newlines inside the range, plus one more when the
/// text up to `range.end` does not end with a newline (the range stops
/// mid-line, so that line is included).
fn offset_range_to_row_range(text: &str, range: Range<usize>) -> Range<u32> {
    let newline_count =
        |slice: &str| slice.bytes().filter(|&byte| byte == b'\n').count() as u32;

    let start_row = newline_count(&text[..range.start]);
    let rows_spanned = newline_count(&text[range.start..range.end]);
    let ends_mid_line = !text[..range.end].ends_with('\n');

    start_row..start_row + rows_spanned + u32::from(ends_mid_line)
}
581
582pub fn format_prompt_with_budget_for_format(
583 input: &ZetaPromptInput,
584 format: ZetaFormat,
585 max_tokens: usize,
586) -> Option<String> {
587 let (context, editable_range, context_range, cursor_offset) =
588 resolve_cursor_region(input, format);
589 let path = &*input.cursor_path;
590
591 let empty_files = Vec::new();
592 let input_related_files = input.related_files.as_deref().unwrap_or(&empty_files);
593 let related_files = if let Some(cursor_excerpt_start_row) = input.excerpt_start_row {
594 let relative_row_range = offset_range_to_row_range(&input.cursor_excerpt, context_range);
595 let row_range = relative_row_range.start + cursor_excerpt_start_row
596 ..relative_row_range.end + cursor_excerpt_start_row;
597 &filter_redundant_excerpts(
598 input_related_files.to_vec(),
599 input.cursor_path.as_ref(),
600 row_range,
601 )
602 } else {
603 input_related_files
604 };
605
606 let prompt = match format {
607 ZetaFormat::V0211SeedCoder
608 | ZetaFormat::V0304SeedNoEdits
609 | ZetaFormat::V0306SeedMultiRegions
610 | ZetaFormat::V0316SeedMultiRegions
611 | ZetaFormat::V0318SeedMultiRegions
612 | ZetaFormat::V0317SeedMultiRegions => {
613 let mut cursor_section = String::new();
614 write_cursor_excerpt_section_for_format(
615 format,
616 &mut cursor_section,
617 path,
618 context,
619 &editable_range,
620 cursor_offset,
621 );
622
623 let budget_with_margin = apply_prompt_budget_margin(max_tokens);
624 seed_coder::assemble_fim_prompt(
625 context,
626 &editable_range,
627 &cursor_section,
628 &input.events,
629 related_files,
630 budget_with_margin,
631 )
632 }
633 _ => {
634 let mut cursor_section = String::new();
635 write_cursor_excerpt_section_for_format(
636 format,
637 &mut cursor_section,
638 path,
639 context,
640 &editable_range,
641 cursor_offset,
642 );
643
644 let mut remaining_budget = apply_prompt_budget_margin(max_tokens);
645 let cursor_tokens = estimate_tokens(cursor_section.len());
646 remaining_budget = remaining_budget.saturating_sub(cursor_tokens);
647
648 let edit_history_section = format_edit_history_within_budget(
649 &input.events,
650 "<|file_sep|>",
651 "edit history",
652 remaining_budget,
653 max_edit_event_count_for_format(&format),
654 );
655 let edit_history_tokens = estimate_tokens(edit_history_section.len());
656 remaining_budget = remaining_budget.saturating_sub(edit_history_tokens);
657
658 let related_files_section = format_related_files_within_budget(
659 &related_files,
660 "<|file_sep|>",
661 "",
662 remaining_budget,
663 );
664
665 let mut prompt = String::new();
666 prompt.push_str(&related_files_section);
667 prompt.push_str(&edit_history_section);
668 prompt.push_str(&cursor_section);
669 prompt
670 }
671 };
672 let prompt_tokens = estimate_tokens(prompt.len());
673 if prompt_tokens > max_tokens {
674 return None;
675 }
676 return Some(prompt);
677}
678
679pub fn filter_redundant_excerpts(
680 mut related_files: Vec<RelatedFile>,
681 cursor_path: &Path,
682 cursor_row_range: Range<u32>,
683) -> Vec<RelatedFile> {
684 for file in &mut related_files {
685 if file.path.as_ref() == cursor_path {
686 file.excerpts.retain(|excerpt| {
687 excerpt.row_range.start < cursor_row_range.start
688 || excerpt.row_range.end > cursor_row_range.end
689 });
690 }
691 }
692 related_files.retain(|file| !file.excerpts.is_empty());
693 related_files
694}
695
696pub fn max_edit_event_count_for_format(format: &ZetaFormat) -> usize {
697 match format {
698 ZetaFormat::V0112MiddleAtEnd
699 | ZetaFormat::V0113Ordered
700 | ZetaFormat::V0114180EditableRegion
701 | ZetaFormat::V0120GitMergeMarkers
702 | ZetaFormat::V0131GitMergeMarkersPrefix
703 | ZetaFormat::V0211Prefill
704 | ZetaFormat::V0211SeedCoder
705 | ZetaFormat::v0226Hashline
706 | ZetaFormat::V0304SeedNoEdits
707 | ZetaFormat::V0304VariableEdit
708 | ZetaFormat::V0306SeedMultiRegions
709 | ZetaFormat::V0316SeedMultiRegions
710 | ZetaFormat::V0318SeedMultiRegions
711 | ZetaFormat::V0317SeedMultiRegions => 6,
712 }
713}
714
715pub fn get_prefill_for_format(
716 format: ZetaFormat,
717 context: &str,
718 editable_range: &Range<usize>,
719) -> String {
720 match format {
721 ZetaFormat::V0211Prefill => v0211_prefill::get_prefill(context, editable_range),
722 ZetaFormat::V0112MiddleAtEnd
723 | ZetaFormat::V0113Ordered
724 | ZetaFormat::V0114180EditableRegion
725 | ZetaFormat::V0120GitMergeMarkers
726 | ZetaFormat::V0131GitMergeMarkersPrefix
727 | ZetaFormat::V0211SeedCoder
728 | ZetaFormat::v0226Hashline
729 | ZetaFormat::V0304VariableEdit => String::new(),
730 ZetaFormat::V0304SeedNoEdits
731 | ZetaFormat::V0306SeedMultiRegions
732 | ZetaFormat::V0316SeedMultiRegions
733 | ZetaFormat::V0318SeedMultiRegions
734 | ZetaFormat::V0317SeedMultiRegions => String::new(),
735 }
736}
737
738pub fn output_end_marker_for_format(format: ZetaFormat) -> Option<&'static str> {
739 match format {
740 ZetaFormat::V0120GitMergeMarkers => Some(v0120_git_merge_markers::END_MARKER),
741 ZetaFormat::V0131GitMergeMarkersPrefix => Some(v0131_git_merge_markers_prefix::END_MARKER),
742 ZetaFormat::V0211Prefill => Some(v0131_git_merge_markers_prefix::END_MARKER),
743 ZetaFormat::V0211SeedCoder
744 | ZetaFormat::V0304SeedNoEdits
745 | ZetaFormat::V0306SeedMultiRegions => Some(seed_coder::END_MARKER),
746 ZetaFormat::V0316SeedMultiRegions => Some(multi_region::V0316_END_MARKER),
747 ZetaFormat::V0318SeedMultiRegions => Some(multi_region::V0318_END_MARKER),
748 ZetaFormat::V0317SeedMultiRegions => Some(multi_region::V0317_END_MARKER),
749 ZetaFormat::V0112MiddleAtEnd
750 | ZetaFormat::V0113Ordered
751 | ZetaFormat::V0114180EditableRegion
752 | ZetaFormat::v0226Hashline
753 | ZetaFormat::V0304VariableEdit => None,
754 }
755}
756
757pub fn encode_patch_as_output_for_format(
758 format: ZetaFormat,
759 old_editable_region: &str,
760 patch: &str,
761 cursor_offset: Option<usize>,
762) -> Result<Option<String>> {
763 match format {
764 ZetaFormat::v0226Hashline => {
765 hashline::patch_to_edit_commands(old_editable_region, patch, cursor_offset).map(Some)
766 }
767 ZetaFormat::V0304VariableEdit => v0304_variable_edit::patch_to_variable_edit_output(
768 old_editable_region,
769 patch,
770 cursor_offset,
771 )
772 .map(Some),
773 ZetaFormat::V0304SeedNoEdits | ZetaFormat::V0306SeedMultiRegions => {
774 Ok(seed_coder::no_edits(patch))
775 }
776 ZetaFormat::V0316SeedMultiRegions => {
777 let empty_patch = patch.lines().count() <= 3;
778 if empty_patch {
779 let marker_offsets = multi_region::compute_marker_offsets(old_editable_region);
780 let marker_num =
781 multi_region::nearest_marker_number(cursor_offset, &marker_offsets);
782 let tag = multi_region::marker_tag(marker_num);
783 Ok(Some(format!(
784 "{tag}{tag}{}",
785 multi_region::V0316_END_MARKER
786 )))
787 } else {
788 Ok(None)
789 }
790 }
791 ZetaFormat::V0318SeedMultiRegions => {
792 let empty_patch = patch.lines().count() <= 3;
793 if empty_patch {
794 let marker_offsets =
795 multi_region::compute_marker_offsets_v0318(old_editable_region);
796 let marker_num =
797 multi_region::nearest_marker_number(cursor_offset, &marker_offsets);
798 let tag = multi_region::marker_tag(marker_num);
799 Ok(Some(format!(
800 "{tag}{tag}{}",
801 multi_region::V0318_END_MARKER
802 )))
803 } else {
804 Ok(None)
805 }
806 }
807 ZetaFormat::V0317SeedMultiRegions => {
808 let empty_patch = patch.lines().count() <= 3;
809 if empty_patch {
810 let tag = multi_region::marker_tag_relative(0);
811 Ok(Some(format!(
812 "{tag}{tag}{}",
813 multi_region::V0317_END_MARKER
814 )))
815 } else {
816 Ok(None)
817 }
818 }
819 _ => Ok(None),
820 }
821}
822
823/// Given a `ZetaPromptInput`, a format, and a patch (with cursor already
824/// extracted), produce the expected model output string for training.
825pub fn format_expected_output(
826 input: &ZetaPromptInput,
827 format: ZetaFormat,
828 patch: &str,
829 cursor_offset: Option<usize>,
830) -> Result<String> {
831 let (context, editable_range, _, _) = resolve_cursor_region(input, format);
832 let mut old_editable = context[editable_range].to_string();
833 if !old_editable.is_empty() && !old_editable.ends_with('\n') {
834 old_editable.push('\n');
835 }
836
837 // Formats with their own output encoding (hashline, variable-edit,
838 // multi-region empty patches) are handled here.
839 if let Some(output) =
840 encode_patch_as_output_for_format(format, &old_editable, patch, cursor_offset)?
841 {
842 return Ok(output);
843 }
844
845 let empty_patch = patch.lines().count() <= 3;
846
847 match format {
848 // Multi-region formats: non-empty patches need diff application
849 // then marker-span encoding.
850 ZetaFormat::V0316SeedMultiRegions => {
851 let (new_editable, first_hunk_offset) =
852 udiff::apply_diff_to_string_with_hunk_offset(patch, &old_editable)?;
853 let cursor_in_new = cursor_in_new_text(cursor_offset, first_hunk_offset, &new_editable);
854 multi_region::encode_from_old_and_new_v0316(
855 &old_editable,
856 &new_editable,
857 cursor_in_new,
858 CURSOR_MARKER,
859 multi_region::V0316_END_MARKER,
860 )
861 }
862 ZetaFormat::V0318SeedMultiRegions => {
863 let (new_editable, first_hunk_offset) =
864 udiff::apply_diff_to_string_with_hunk_offset(patch, &old_editable)?;
865 let cursor_in_new = cursor_in_new_text(cursor_offset, first_hunk_offset, &new_editable);
866 multi_region::encode_from_old_and_new_v0318(
867 &old_editable,
868 &new_editable,
869 cursor_in_new,
870 CURSOR_MARKER,
871 multi_region::V0318_END_MARKER,
872 )
873 }
874 ZetaFormat::V0317SeedMultiRegions => {
875 let (new_editable, first_hunk_offset) =
876 udiff::apply_diff_to_string_with_hunk_offset(patch, &old_editable)?;
877 let cursor_in_new = cursor_in_new_text(cursor_offset, first_hunk_offset, &new_editable);
878 multi_region::encode_from_old_and_new_v0317(
879 &old_editable,
880 &new_editable,
881 cursor_in_new,
882 CURSOR_MARKER,
883 multi_region::V0317_END_MARKER,
884 )
885 }
886 // V0131-style formats and fallback: produce new editable text with
887 // cursor marker inserted, followed by the end marker.
888 _ => {
889 let (mut result, first_hunk_offset) = if empty_patch {
890 (old_editable.clone(), None)
891 } else {
892 udiff::apply_diff_to_string_with_hunk_offset(patch, &old_editable)?
893 };
894
895 if let Some(cursor) = cursor_offset {
896 let hunk_start = if !empty_patch {
897 first_hunk_offset.unwrap_or(0)
898 } else {
899 0
900 };
901 let offset = (hunk_start + cursor).min(result.len());
902 result.insert_str(offset, CURSOR_MARKER);
903 }
904
905 if !result.is_empty() && !result.ends_with('\n') {
906 result.push('\n');
907 }
908
909 if let Some(end_marker) = output_end_marker_for_format(format) {
910 result.push_str(end_marker);
911 }
912
913 Ok(result)
914 }
915 }
916}
917
/// Compute the cursor position within the new text after diff application.
///
/// The cursor offset is added to the first hunk's offset (treated as `0` when
/// absent), and the sum is clamped to the length of `new_text`. Returns `None`
/// when there is no cursor offset.
fn cursor_in_new_text(
    cursor_offset: Option<usize>,
    first_hunk_offset: Option<usize>,
    new_text: &str,
) -> Option<usize> {
    let cursor = cursor_offset?;
    let shifted = first_hunk_offset.unwrap_or(0) + cursor;
    Some(std::cmp::min(shifted, new_text.len()))
}
929
/// Result of `parse_zeta2_model_output`: replacement text together with the
/// range of the cursor excerpt it replaces.
pub struct ParsedOutput {
    /// Text that should replace the editable region
    pub new_editable_region: String,
    /// The byte range within `cursor_excerpt` that this replacement applies to
    pub range_in_excerpt: Range<usize>,
}
936
937/// Parse model output for the given zeta format
938pub fn parse_zeta2_model_output(
939 output: &str,
940 format: ZetaFormat,
941 prompt_inputs: &ZetaPromptInput,
942) -> Result<ParsedOutput> {
943 let output = match output_end_marker_for_format(format) {
944 Some(marker) => output.strip_suffix(marker).unwrap_or(output),
945 None => output,
946 };
947
948 let (context, editable_range_in_context, context_range, cursor_offset) =
949 resolve_cursor_region(prompt_inputs, format);
950 let context_start = context_range.start;
951 let old_editable_region = &context[editable_range_in_context.clone()];
952 let cursor_offset_in_editable = cursor_offset.saturating_sub(editable_range_in_context.start);
953
954 let (range_in_context, output) = match format {
955 ZetaFormat::v0226Hashline => (
956 editable_range_in_context,
957 if hashline::output_has_edit_commands(output) {
958 hashline::apply_edit_commands(old_editable_region, output)
959 } else {
960 output.to_string()
961 },
962 ),
963 ZetaFormat::V0304VariableEdit => v0304_variable_edit::apply_variable_edit(context, output)?,
964 ZetaFormat::V0304SeedNoEdits => (
965 editable_range_in_context,
966 if output.starts_with(seed_coder::NO_EDITS) {
967 old_editable_region.to_string()
968 } else {
969 output.to_string()
970 },
971 ),
972 ZetaFormat::V0306SeedMultiRegions => (
973 editable_range_in_context,
974 if output.starts_with(seed_coder::NO_EDITS) {
975 old_editable_region.to_string()
976 } else {
977 multi_region::apply_marker_span(old_editable_region, output)?
978 },
979 ),
980 ZetaFormat::V0316SeedMultiRegions => (
981 editable_range_in_context,
982 multi_region::apply_marker_span_v0316(old_editable_region, output)?,
983 ),
984 ZetaFormat::V0318SeedMultiRegions => (
985 editable_range_in_context,
986 multi_region::apply_marker_span_v0318(old_editable_region, output)?,
987 ),
988 ZetaFormat::V0317SeedMultiRegions => (
989 editable_range_in_context,
990 multi_region::apply_marker_span_v0317(
991 old_editable_region,
992 output,
993 Some(cursor_offset_in_editable),
994 )?,
995 ),
996 _ => (editable_range_in_context, output.to_string()),
997 };
998
999 let range_in_excerpt =
1000 range_in_context.start + context_start..range_in_context.end + context_start;
1001
1002 Ok(ParsedOutput {
1003 new_editable_region: output,
1004 range_in_excerpt,
1005 })
1006}
1007
1008pub fn excerpt_range_for_format(
1009 format: ZetaFormat,
1010 ranges: &ExcerptRanges,
1011) -> (Range<usize>, Range<usize>) {
1012 excerpt_ranges_for_format(format, ranges)
1013}
1014
1015pub fn resolve_cursor_region(
1016 input: &ZetaPromptInput,
1017 format: ZetaFormat,
1018) -> (&str, Range<usize>, Range<usize>, usize) {
1019 let (editable_range, context_range) = if let Some(syntax_ranges) = &input.syntax_ranges {
1020 let (editable_tokens, context_tokens) = token_limits_for_format(format);
1021 compute_editable_and_context_ranges(
1022 &input.cursor_excerpt,
1023 input.cursor_offset_in_excerpt,
1024 syntax_ranges,
1025 editable_tokens,
1026 context_tokens,
1027 )
1028 } else {
1029 excerpt_range_for_format(format, &input.excerpt_ranges)
1030 };
1031 let context_start = context_range.start;
1032 let context_text = &input.cursor_excerpt[context_range.clone()];
1033 let adjusted_editable =
1034 (editable_range.start - context_start)..(editable_range.end - context_start);
1035 let adjusted_cursor = input.cursor_offset_in_excerpt - context_start;
1036
1037 (
1038 context_text,
1039 adjusted_editable,
1040 context_range,
1041 adjusted_cursor,
1042 )
1043}
1044
1045pub fn get_prefill(input: &ZetaPromptInput, format: ZetaFormat) -> String {
1046 let (context, editable_range, _, _) = resolve_cursor_region(input, format);
1047 get_prefill_for_format(format, context, &editable_range)
1048}
1049
1050fn format_edit_history_within_budget(
1051 events: &[Arc<Event>],
1052 file_marker: &str,
1053 edit_history_name: &str,
1054 max_tokens: usize,
1055 max_edit_event_count: usize,
1056) -> String {
1057 let header = format!("{}{}\n", file_marker, edit_history_name);
1058 let header_tokens = estimate_tokens(header.len());
1059 if header_tokens >= max_tokens {
1060 return String::new();
1061 }
1062
1063 let mut event_strings: Vec<String> = Vec::new();
1064 let mut total_tokens = header_tokens;
1065
1066 for event in events.iter().rev().take(max_edit_event_count) {
1067 let mut event_str = String::new();
1068 write_event(&mut event_str, event);
1069 let event_tokens = estimate_tokens(event_str.len());
1070
1071 if total_tokens + event_tokens > max_tokens {
1072 break;
1073 }
1074 total_tokens += event_tokens;
1075 event_strings.push(event_str);
1076 }
1077
1078 if event_strings.is_empty() {
1079 return String::new();
1080 }
1081
1082 let mut result = header;
1083 for event_str in event_strings.iter().rev() {
1084 result.push_str(event_str);
1085 }
1086 result
1087}
1088
1089fn excerpt_rendered_tokens(excerpt: &RelatedExcerpt, file_max_row: u32) -> usize {
1090 let needs_newline = !excerpt.text.ends_with('\n');
1091 let needs_ellipsis = excerpt.row_range.end < file_max_row;
1092 let len = excerpt.text.len()
1093 + if needs_newline { "\n".len() } else { 0 }
1094 + if needs_ellipsis { "...\n".len() } else { 0 };
1095 estimate_tokens(len)
1096}
1097
/// Renders as many related-file excerpts as fit within `max_tokens`.
///
/// Selection and rendering happen in two phases:
///
/// 1. Excerpts from all files are sorted by `order` (ties broken by file index,
///    then excerpt index) and admitted one by one. Admission stops at the first
///    excerpt whose estimated cost would exceed the budget; later excerpts are
///    not considered even if they are smaller.
/// 2. The admitted excerpts are re-sorted by file index and excerpt index and
///    rendered grouped by file.
///
/// Each rendered file starts with `{file_prefix}{path}\n`. `file_suffix` is
/// written between consecutive files but not after the last one. An excerpt
/// that ends before the file's `max_row` is followed by a `...` line.
pub fn format_related_files_within_budget(
    related_files: &[RelatedFile],
    file_prefix: &str,
    file_suffix: &str,
    max_tokens: usize,
) -> String {
    // Lightweight handle identifying one excerpt and its selection priority.
    struct ExcerptCandidate {
        file_ix: usize,
        excerpt_ix: usize,
        order: usize,
    }

    let mut excerpt_candidates: Vec<ExcerptCandidate> = related_files
        .iter()
        .enumerate()
        .flat_map(|(file_ix, file)| {
            file.excerpts
                .iter()
                .enumerate()
                .map(move |(excerpt_ix, e)| ExcerptCandidate {
                    file_ix,
                    excerpt_ix,
                    order: e.order,
                })
        })
        .collect();

    // Pre-compute file header strings and their token costs.
    let file_headers: Vec<String> = related_files
        .iter()
        .map(|file| {
            let path_str = file.path.to_string_lossy();
            format!("{}{}\n", file_prefix, path_str)
        })
        .collect();

    // Sort the excerpts by their order and determine how many fit within the budget.
    let mut total_tokens = 0;
    let mut included_excerpt_count = 0_usize;
    let mut included_file_indices = vec![false; related_files.len()];
    excerpt_candidates.sort_by_key(|e| (e.order, e.file_ix, e.excerpt_ix));
    for candidate in &excerpt_candidates {
        let file = &related_files[candidate.file_ix];
        let excerpt = &file.excerpts[candidate.excerpt_ix];
        let file_already_included = included_file_indices[candidate.file_ix];
        // The header (plus suffix) is charged once, with the first excerpt admitted
        // for a file. The suffix is charged for every file even though rendering only
        // emits it between files, so the estimate errs on the high side.
        let header_cost = if file_already_included {
            0
        } else {
            estimate_tokens(file_headers[candidate.file_ix].len() + file_suffix.len())
        };
        let excerpt_cost = excerpt_rendered_tokens(excerpt, file.max_row);
        if total_tokens + header_cost + excerpt_cost > max_tokens {
            break;
        }
        total_tokens += header_cost + excerpt_cost;
        if !file_already_included {
            included_file_indices[candidate.file_ix] = true;
        }
        included_excerpt_count += 1;
    }

    // Keep only the admitted prefix, then restore file/excerpt order for rendering.
    excerpt_candidates.truncate(included_excerpt_count);
    excerpt_candidates.sort_unstable_by_key(|c| (c.file_ix, c.excerpt_ix));

    // Render all of the files that fit within the token budget, in the original order.
    let mut result = String::new();
    let mut last_file_ix = None;
    for candidate in &excerpt_candidates {
        if last_file_ix != Some(candidate.file_ix) {
            // Starting a new file: close the previous one (if any) and write the header.
            if last_file_ix.is_some() {
                result.push_str(file_suffix);
            }
            result.push_str(&file_headers[candidate.file_ix]);
            last_file_ix = Some(candidate.file_ix);
        }
        let file = &related_files[candidate.file_ix];
        let excerpt = &file.excerpts[candidate.excerpt_ix];
        result.push_str(&excerpt.text);
        if !result.ends_with('\n') {
            result.push('\n');
        }
        if excerpt.row_range.end < file.max_row {
            result.push_str("...\n");
        }
    }

    result
}
1186
1187pub fn write_related_files(
1188 prompt: &mut String,
1189 related_files: &[RelatedFile],
1190) -> Vec<Range<usize>> {
1191 let mut ranges = Vec::new();
1192 for file in related_files {
1193 let start = prompt.len();
1194 let path_str = file.path.to_string_lossy();
1195 write!(prompt, "<|file_sep|>{}\n", path_str).ok();
1196 for excerpt in &file.excerpts {
1197 prompt.push_str(&excerpt.text);
1198 if !prompt.ends_with('\n') {
1199 prompt.push('\n');
1200 }
1201 if excerpt.row_range.end < file.max_row {
1202 prompt.push_str("...\n");
1203 }
1204 }
1205 let end = prompt.len();
1206 ranges.push(start..end);
1207 }
1208 ranges
1209}
1210
1211mod v0112_middle_at_end {
1212 use super::*;
1213
1214 pub fn special_tokens() -> &'static [&'static str] {
1215 &[
1216 "<|fim_prefix|>",
1217 "<|fim_suffix|>",
1218 "<|fim_middle|>",
1219 "<|file_sep|>",
1220 CURSOR_MARKER,
1221 ]
1222 }
1223
1224 pub fn write_cursor_excerpt_section(
1225 prompt: &mut String,
1226 path: &Path,
1227 context: &str,
1228 editable_range: &Range<usize>,
1229 cursor_offset: usize,
1230 ) {
1231 let path_str = path.to_string_lossy();
1232 write!(prompt, "<|file_sep|>{}\n", path_str).ok();
1233
1234 prompt.push_str("<|fim_prefix|>\n");
1235 prompt.push_str(&context[..editable_range.start]);
1236
1237 prompt.push_str("<|fim_suffix|>\n");
1238 prompt.push_str(&context[editable_range.end..]);
1239 if !prompt.ends_with('\n') {
1240 prompt.push('\n');
1241 }
1242
1243 prompt.push_str("<|fim_middle|>current\n");
1244 prompt.push_str(&context[editable_range.start..cursor_offset]);
1245 prompt.push_str(CURSOR_MARKER);
1246 prompt.push_str(&context[cursor_offset..editable_range.end]);
1247 if !prompt.ends_with('\n') {
1248 prompt.push('\n');
1249 }
1250
1251 prompt.push_str("<|fim_middle|>updated\n");
1252 }
1253}
1254
1255mod v0113_ordered {
1256 use super::*;
1257
1258 pub fn special_tokens() -> &'static [&'static str] {
1259 &[
1260 "<|fim_prefix|>",
1261 "<|fim_suffix|>",
1262 "<|fim_middle|>",
1263 "<|file_sep|>",
1264 CURSOR_MARKER,
1265 ]
1266 }
1267
1268 pub fn write_cursor_excerpt_section(
1269 prompt: &mut String,
1270 path: &Path,
1271 context: &str,
1272 editable_range: &Range<usize>,
1273 cursor_offset: usize,
1274 ) {
1275 let path_str = path.to_string_lossy();
1276 write!(prompt, "<|file_sep|>{}\n", path_str).ok();
1277
1278 prompt.push_str("<|fim_prefix|>\n");
1279 prompt.push_str(&context[..editable_range.start]);
1280 if !prompt.ends_with('\n') {
1281 prompt.push('\n');
1282 }
1283
1284 prompt.push_str("<|fim_middle|>current\n");
1285 prompt.push_str(&context[editable_range.start..cursor_offset]);
1286 prompt.push_str(CURSOR_MARKER);
1287 prompt.push_str(&context[cursor_offset..editable_range.end]);
1288 if !prompt.ends_with('\n') {
1289 prompt.push('\n');
1290 }
1291
1292 prompt.push_str("<|fim_suffix|>\n");
1293 prompt.push_str(&context[editable_range.end..]);
1294 if !prompt.ends_with('\n') {
1295 prompt.push('\n');
1296 }
1297
1298 prompt.push_str("<|fim_middle|>updated\n");
1299 }
1300}
1301
mod v0114180_editable_region {
    use super::*;

    /// Special tokens for this format; identical to those of `v0113_ordered`.
    pub fn special_tokens() -> &'static [&'static str] {
        v0113_ordered::special_tokens()
    }
}
1309
1310pub mod v0120_git_merge_markers {
1311 //! A prompt that uses git-style merge conflict markers to represent the editable region.
1312 //!
1313 //! Example prompt:
1314 //!
1315 //! <|file_sep|>path/to/target_file.py
1316 //! <|fim_prefix|>
1317 //! code before editable region
1318 //! <|fim_suffix|>
1319 //! code after editable region
1320 //! <|fim_middle|>
1321 //! <<<<<<< CURRENT
1322 //! code that
1323 //! needs to<|user_cursor|>
1324 //! be rewritten
1325 //! =======
1326 //!
1327 //! Expected output (should be generated by the model):
1328 //!
1329 //! updated
1330 //! code with
1331 //! changes applied
1332 //! >>>>>>> UPDATED
1333
1334 use super::*;
1335
1336 pub const START_MARKER: &str = "<<<<<<< CURRENT\n";
1337 pub const SEPARATOR: &str = "=======\n";
1338 pub const END_MARKER: &str = ">>>>>>> UPDATED\n";
1339
1340 pub fn special_tokens() -> &'static [&'static str] {
1341 &[
1342 "<|fim_prefix|>",
1343 "<|fim_suffix|>",
1344 "<|fim_middle|>",
1345 "<|file_sep|>",
1346 START_MARKER,
1347 SEPARATOR,
1348 END_MARKER,
1349 CURSOR_MARKER,
1350 ]
1351 }
1352
1353 pub fn write_cursor_excerpt_section(
1354 prompt: &mut String,
1355 path: &Path,
1356 context: &str,
1357 editable_range: &Range<usize>,
1358 cursor_offset: usize,
1359 ) {
1360 let path_str = path.to_string_lossy();
1361 write!(prompt, "<|file_sep|>{}\n", path_str).ok();
1362
1363 prompt.push_str("<|fim_prefix|>");
1364 prompt.push_str(&context[..editable_range.start]);
1365
1366 prompt.push_str("<|fim_suffix|>");
1367 prompt.push_str(&context[editable_range.end..]);
1368 if !prompt.ends_with('\n') {
1369 prompt.push('\n');
1370 }
1371
1372 prompt.push_str("<|fim_middle|>");
1373 prompt.push_str(START_MARKER);
1374 prompt.push_str(&context[editable_range.start..cursor_offset]);
1375 prompt.push_str(CURSOR_MARKER);
1376 prompt.push_str(&context[cursor_offset..editable_range.end]);
1377 if !prompt.ends_with('\n') {
1378 prompt.push('\n');
1379 }
1380 prompt.push_str(SEPARATOR);
1381 }
1382}
1383
1384pub mod v0131_git_merge_markers_prefix {
1385 //! A prompt that uses git-style merge conflict markers to represent the editable region.
1386 //!
1387 //! Example prompt:
1388 //!
1389 //! <|file_sep|>path/to/target_file.py
1390 //! <|fim_prefix|>
1391 //! code before editable region
1392 //! <<<<<<< CURRENT
1393 //! code that
1394 //! needs to<|user_cursor|>
1395 //! be rewritten
1396 //! =======
1397 //! <|fim_suffix|>
1398 //! code after editable region
1399 //! <|fim_middle|>
1400 //!
1401 //! Expected output (should be generated by the model):
1402 //!
1403 //! updated
1404 //! code with
1405 //! changes applied
1406 //! >>>>>>> UPDATED
1407
1408 use super::*;
1409
1410 pub const START_MARKER: &str = "<<<<<<< CURRENT\n";
1411 pub const SEPARATOR: &str = "=======\n";
1412 pub const END_MARKER: &str = ">>>>>>> UPDATED\n";
1413
1414 pub fn special_tokens() -> &'static [&'static str] {
1415 &[
1416 "<|fim_prefix|>",
1417 "<|fim_suffix|>",
1418 "<|fim_middle|>",
1419 "<|file_sep|>",
1420 START_MARKER,
1421 SEPARATOR,
1422 END_MARKER,
1423 CURSOR_MARKER,
1424 ]
1425 }
1426
1427 pub fn write_cursor_excerpt_section(
1428 prompt: &mut String,
1429 path: &Path,
1430 context: &str,
1431 editable_range: &Range<usize>,
1432 cursor_offset: usize,
1433 ) {
1434 let path_str = path.to_string_lossy();
1435 write!(prompt, "<|file_sep|>{}\n", path_str).ok();
1436
1437 prompt.push_str("<|fim_prefix|>");
1438 prompt.push_str(&context[..editable_range.start]);
1439 prompt.push_str(START_MARKER);
1440 prompt.push_str(&context[editable_range.start..cursor_offset]);
1441 prompt.push_str(CURSOR_MARKER);
1442 prompt.push_str(&context[cursor_offset..editable_range.end]);
1443 if !prompt.ends_with('\n') {
1444 prompt.push('\n');
1445 }
1446 prompt.push_str(SEPARATOR);
1447
1448 prompt.push_str("<|fim_suffix|>");
1449 prompt.push_str(&context[editable_range.end..]);
1450 if !prompt.ends_with('\n') {
1451 prompt.push('\n');
1452 }
1453
1454 prompt.push_str("<|fim_middle|>");
1455 }
1456}
1457
1458pub mod v0211_prefill {
1459 use super::*;
1460
1461 pub fn special_tokens() -> &'static [&'static str] {
1462 v0131_git_merge_markers_prefix::special_tokens()
1463 }
1464
1465 pub fn get_prefill(context: &str, editable_range: &Range<usize>) -> String {
1466 let editable_region = &context[editable_range.start..editable_range.end];
1467
1468 let prefill_len = (editable_region.len() as f64 * PREFILL_RATIO) as usize;
1469 let prefill_len = editable_region.floor_char_boundary(prefill_len);
1470
1471 // Find a token boundary to avoid splitting tokens in the prefill.
1472 // In Qwen2.5-Coder, \n is always the END of a token (e.g. `;\n`,
1473 // ` {\n`), and \n\n / \n\n\n are single tokens, so we must include
1474 // the \n and consume any consecutive \n characters after it.
1475 let prefill = &editable_region[..prefill_len];
1476 match prefill.rfind('\n') {
1477 Some(pos) => {
1478 let mut end = pos + 1;
1479 while end < editable_region.len()
1480 && editable_region.as_bytes().get(end) == Some(&b'\n')
1481 {
1482 end += 1;
1483 }
1484 editable_region[..end].to_string()
1485 }
1486 // No newline found. Fall back to splitting before the last space
1487 // (word-level boundary)
1488 None => match prefill.rfind(' ') {
1489 Some(pos) => prefill[..pos].to_string(),
1490 None => prefill.to_string(),
1491 },
1492 }
1493 }
1494}
1495
1496pub mod hashline {
1497
1498 use std::fmt::Display;
1499
    /// Marker written (followed by a newline) at the very end of the cursor excerpt section.
    pub const END_MARKER: &str = "<|fim_middle|>updated";
    /// Marker written immediately before the hashline-encoded editable region.
    pub const START_MARKER: &str = "<|fim_middle|>current";

    use super::*;

    /// Prefix of an edit command that replaces a line or an inclusive line range.
    const SET_COMMAND_MARKER: &str = "<|set|>";
    /// Prefix of an edit command that inserts content after a line (or before the first line).
    const INSERT_COMMAND_MARKER: &str = "<|insert|>";
    /// Model output marker meaning the editable region should be left unchanged.
    pub const NO_EDITS_COMMAND_MARKER: &str = "<|no_edits|>";
1508
1509 pub fn special_tokens() -> &'static [&'static str] {
1510 return &[
1511 SET_COMMAND_MARKER,
1512 "<|set_range|>",
1513 INSERT_COMMAND_MARKER,
1514 NO_EDITS_COMMAND_MARKER,
1515 CURSOR_MARKER,
1516 "<|file_sep|>",
1517 "<|fim_prefix|>",
1518 "<|fim_suffix|>",
1519 "<|fim_middle|>",
1520 ];
1521 }
1522
1523 /// A parsed line reference like `3:c3` (line index 3 with hash 0xc3).
1524 #[derive(Debug, Clone, PartialEq, Eq)]
1525 struct LineRef {
1526 index: usize,
1527 hash: u8,
1528 }
1529
1530 impl Display for LineRef {
1531 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1532 write!(f, "{}:{:02x}", self.index, self.hash)
1533 }
1534 }
1535
1536 pub fn hash_line(line: &[u8]) -> u8 {
1537 let mut h: u8 = 0;
1538 for &byte in line {
1539 h = h.wrapping_add(byte);
1540 }
1541 return h;
1542 }
1543
1544 /// Write the hashline-encoded editable region into `out`. Each line of
1545 /// `editable_text` is prefixed with `{line_index}:{hash}|` and the cursor
1546 /// marker is inserted at `cursor_offset_in_editable` (byte offset relative
1547 /// to the start of `editable_text`).
1548 pub fn write_hashline_editable_region(
1549 out: &mut String,
1550 editable_text: &str,
1551 cursor_offset_in_editable: usize,
1552 ) {
1553 let mut offset = 0;
1554 for (i, line) in editable_text.lines().enumerate() {
1555 let (head, cursor, tail) = if cursor_offset_in_editable > offset
1556 && cursor_offset_in_editable < offset + line.len()
1557 {
1558 (
1559 &line[..cursor_offset_in_editable - offset],
1560 CURSOR_MARKER,
1561 &line[cursor_offset_in_editable - offset..],
1562 )
1563 } else {
1564 (line, "", "")
1565 };
1566 write!(
1567 out,
1568 "\n{}|{head}{cursor}{tail}",
1569 LineRef {
1570 index: i,
1571 hash: hash_line(line.as_bytes())
1572 }
1573 )
1574 .unwrap();
1575 offset += line.len() + 1;
1576 }
1577 }
1578
1579 pub fn write_cursor_excerpt_section(
1580 prompt: &mut String,
1581 path: &Path,
1582 context: &str,
1583 editable_range: &Range<usize>,
1584 cursor_offset: usize,
1585 ) {
1586 let path_str = path.to_string_lossy();
1587 write!(prompt, "<|file_sep|>{}\n", path_str).ok();
1588
1589 prompt.push_str("<|fim_prefix|>\n");
1590 prompt.push_str(&context[..editable_range.start]);
1591 prompt.push_str(START_MARKER);
1592
1593 let cursor_offset_in_editable = cursor_offset.saturating_sub(editable_range.start);
1594 let editable_region = &context[editable_range.clone()];
1595 write_hashline_editable_region(prompt, editable_region, cursor_offset_in_editable);
1596
1597 if !prompt.ends_with('\n') {
1598 prompt.push('\n');
1599 }
1600
1601 prompt.push_str("<|fim_suffix|>\n");
1602 prompt.push_str(&context[editable_range.end..]);
1603 if !prompt.ends_with('\n') {
1604 prompt.push('\n');
1605 }
1606
1607 prompt.push_str(END_MARKER);
1608 prompt.push('\n');
1609 }
1610
    /// A single edit command parsed from the model output.
    ///
    /// `content` borrows directly from the model output; it is the raw text
    /// between the command line and the next command line (or end of output).
    #[derive(Debug)]
    enum EditCommand<'a> {
        /// Replace a range of lines (inclusive on both ends). Single-line set is
        /// represented by `start == end`.
        Set {
            start: LineRef,
            end: LineRef,
            content: &'a str,
        },
        /// Insert new lines after the given line, or before the first line if
        /// `after` is `None`.
        Insert {
            after: Option<LineRef>,
            content: &'a str,
        },
    }
1628
1629 /// Parse a line reference like `3:c3` into a `LineRef`.
1630 fn parse_line_ref(s: &str) -> Option<LineRef> {
1631 let (idx_str, hash_str) = s.split_once(':')?;
1632 let index = idx_str.parse::<usize>().ok()?;
1633 let hash = u8::from_str_radix(hash_str, 16).ok()?;
1634 Some(LineRef { index, hash })
1635 }
1636
    /// Parse the model output into a list of `EditCommand`s.
    ///
    /// The output is scanned line by line. A line whose trimmed text starts with
    /// the set or insert marker opens a command; the rest of that trimmed line is
    /// the line-reference specifier, and all following lines up to (but not
    /// including) the next command line form the command's content. Lines that
    /// precede the first command are ignored.
    ///
    /// A set command whose specifier does not parse is dropped (its content is
    /// still consumed). An insert command whose specifier does not parse is kept
    /// with `after: None`, i.e. it becomes an insert-before-first-line.
    fn parse_edit_commands(model_output: &str) -> Vec<EditCommand<'_>> {
        let mut commands = Vec::new();
        let mut offset = 0usize;

        while offset < model_output.len() {
            // Isolate the current line and the offset where the next line begins.
            let next_nl = model_output[offset..]
                .find('\n')
                .map(|i| offset + i)
                .unwrap_or(model_output.len());
            let line = &model_output[offset..next_nl];
            let line_end = if next_nl < model_output.len() {
                next_nl + 1
            } else {
                next_nl
            };

            let trimmed = line.trim();
            let (is_set, specifier) = if let Some(spec) = trimmed.strip_prefix(SET_COMMAND_MARKER) {
                (true, spec)
            } else if let Some(spec) = trimmed.strip_prefix(INSERT_COMMAND_MARKER) {
                (false, spec)
            } else {
                // Not a command line: skip it.
                offset = line_end;
                continue;
            };

            // Extend the content line by line until the next command line or end of output.
            let mut content_end = line_end;
            let mut scan = line_end;

            while scan < model_output.len() {
                let body_nl = model_output[scan..]
                    .find('\n')
                    .map(|i| scan + i)
                    .unwrap_or(model_output.len());
                let body_line = &model_output[scan..body_nl];
                if body_line.trim().starts_with(SET_COMMAND_MARKER)
                    || body_line.trim().starts_with(INSERT_COMMAND_MARKER)
                {
                    break;
                }
                scan = if body_nl < model_output.len() {
                    body_nl + 1
                } else {
                    body_nl
                };
                content_end = scan;
            }

            let content = &model_output[line_end..content_end];

            if is_set {
                // `a-b` is an inclusive range; a bare reference is a single-line set.
                if let Some((start_str, end_str)) = specifier.split_once('-') {
                    if let (Some(start), Some(end)) =
                        (parse_line_ref(start_str), parse_line_ref(end_str))
                    {
                        commands.push(EditCommand::Set {
                            start,
                            end,
                            content,
                        });
                    }
                } else if let Some(target) = parse_line_ref(specifier) {
                    commands.push(EditCommand::Set {
                        start: target.clone(),
                        end: target,
                        content,
                    });
                }
            } else {
                let after = parse_line_ref(specifier);
                commands.push(EditCommand::Insert { after, content });
            }

            // Resume at the line that terminated the content scan.
            offset = scan;
        }

        commands
    }
1716
1717 /// Returns `true` if the model output contains `<|set|>` or `<|insert|>` commands
1718 /// (as opposed to being a plain full-replacement output).
1719 /// Strip the `{line_num}:{hash}|` prefixes from each line of a hashline-encoded
1720 /// editable region, returning the plain text content.
1721 pub fn strip_hashline_prefixes(region: &str) -> String {
1722 let mut decoded: String = region
1723 .lines()
1724 .map(|line| line.find('|').map_or(line, |pos| &line[pos + 1..]))
1725 .collect::<Vec<_>>()
1726 .join("\n");
1727 if region.ends_with('\n') {
1728 decoded.push('\n');
1729 }
1730 decoded
1731 }
1732
    /// Returns `true` if the model output contains a `<|set|>`, `<|insert|>` or
    /// `<|no_edits|>` marker anywhere in its text (as opposed to being a plain
    /// full-replacement output).
    pub fn output_has_edit_commands(model_output: &str) -> bool {
        model_output.contains(SET_COMMAND_MARKER)
            || model_output.contains(INSERT_COMMAND_MARKER)
            || model_output.contains(NO_EDITS_COMMAND_MARKER)
    }
1738
    /// Apply `<|set|>` and `<|insert|>` edit commands from the model output to the
    /// original editable region text.
    ///
    /// `editable_region` is the original text of the editable region (without hash
    /// prefixes). `model_output` is the raw model response containing edit commands.
    ///
    /// If the output (ignoring leading whitespace) starts with `<|no_edits|>`, the
    /// region is returned unchanged. Commands whose line references are out of
    /// bounds or whose hashes do not match the original lines are ignored. When
    /// several set commands start on the same line, the last one wins.
    ///
    /// Returns the full replacement text for the editable region.
    pub fn apply_edit_commands(editable_region: &str, model_output: &str) -> String {
        if model_output
            .trim_start()
            .starts_with(NO_EDITS_COMMAND_MARKER)
        {
            return editable_region.to_string();
        }

        let original_lines: Vec<&str> = editable_region.lines().collect();
        let old_hashes: Vec<u8> = original_lines
            .iter()
            .map(|line| hash_line(line.as_bytes()))
            .collect();

        let commands = parse_edit_commands(model_output);

        // For set operations: indexed by start line → Some((end line index, content))
        // For insert operations: indexed by line index → vec of content to insert after
        // Insert-before-first is tracked separately.
        let mut set_ops: Vec<Option<(usize, &str)>> = vec![None; original_lines.len()];
        let mut insert_before_first: Vec<&str> = Vec::new();
        let mut insert_after: Vec<Vec<&str>> = vec![Vec::new(); original_lines.len()];

        for command in &commands {
            match command {
                EditCommand::Set {
                    start,
                    end,
                    content,
                } => {
                    // Accept only in-bounds, well-ordered ranges whose hashes still match.
                    if start.index < old_hashes.len()
                        && end.index < old_hashes.len()
                        && start.index <= end.index
                        && old_hashes[start.index] == start.hash
                        && old_hashes[end.index] == end.hash
                    {
                        set_ops[start.index] = Some((end.index, *content));
                    }
                }
                EditCommand::Insert { after, content } => match after {
                    None => insert_before_first.push(*content),
                    Some(line_ref) => {
                        if line_ref.index < old_hashes.len()
                            && old_hashes[line_ref.index] == line_ref.hash
                        {
                            insert_after[line_ref.index].push(*content);
                        }
                    }
                },
            }
        }

        let mut result = String::new();

        // Emit any insertions before the first line
        for content in &insert_before_first {
            result.push_str(content);
            if !content.ends_with('\n') {
                result.push('\n');
            }
        }

        let mut i = 0;
        while i < original_lines.len() {
            if let Some((end_index, replacement)) = set_ops[i].as_ref() {
                // Replace lines i..=end_index with the replacement content
                result.push_str(replacement);
                // An empty replacement deletes the range, so no newline is added for it.
                if !replacement.is_empty() && !replacement.ends_with('\n') {
                    result.push('\n');
                }
                // Emit any insertions after the end of this set range
                if *end_index < insert_after.len() {
                    for content in &insert_after[*end_index] {
                        result.push_str(content);
                        if !content.ends_with('\n') {
                            result.push('\n');
                        }
                    }
                }
                // Skip the replaced lines; set/insert ops anchored strictly inside the
                // range (other than inserts after its last line) are never visited.
                i = end_index + 1;
            } else {
                // Keep the original line
                result.push_str(original_lines[i]);
                result.push('\n');
                // Emit any insertions after this line
                for content in &insert_after[i] {
                    result.push_str(content);
                    if !content.ends_with('\n') {
                        result.push('\n');
                    }
                }
                i += 1;
            }
        }

        // Preserve trailing newline behavior: if the original ended with a
        // newline the result already has one; if it didn't, trim the extra one
        // we added.
        if !editable_region.ends_with('\n') && result.ends_with('\n') {
            result.pop();
        }

        result
    }
1850
1851 /// Convert a unified diff patch into hashline edit commands.
1852 ///
1853 /// Parses the unified diff `patch` directly to determine which lines of
1854 /// `old_text` are deleted/replaced and what new lines are added, then emits
1855 /// `<|set|>` and `<|insert|>` edit commands referencing old lines by their
1856 /// `{index}:{hash}` identifiers.
1857 ///
1858 /// `cursor_offset` is an optional byte offset into the first hunk's new
1859 /// text (context + additions) where the cursor marker should be placed.
1860 pub fn patch_to_edit_commands(
1861 old_text: &str,
1862 patch: &str,
1863 cursor_offset: Option<usize>,
1864 ) -> Result<String> {
1865 let old_lines: Vec<&str> = old_text.lines().collect();
1866 let old_hashes: Vec<u8> = old_lines
1867 .iter()
1868 .map(|line| hash_line(line.as_bytes()))
1869 .collect();
1870
1871 let mut result = String::new();
1872 let mut first_hunk = true;
1873
1874 struct Hunk<'a> {
1875 line_range: Range<usize>,
1876 new_text_lines: Vec<&'a str>,
1877 cursor_line_offset_in_new_text: Option<(usize, usize)>,
1878 }
1879
1880 // Parse the patch line by line. We only care about hunk headers,
1881 // context, deletions, and additions.
1882 let mut old_line_index: usize = 0;
1883 let mut current_hunk: Option<Hunk> = None;
1884 // Byte offset tracking within the hunk's new text for cursor placement.
1885 let mut new_text_byte_offset: usize = 0;
1886 // The line index of the last old line seen before/in the current hunk
1887 // (used for insert-after reference).
1888 let mut last_old_line_before_hunk: Option<usize> = None;
1889
1890 fn flush_hunk(
1891 hunk: Hunk,
1892 last_old_line: Option<usize>,
1893 result: &mut String,
1894 old_hashes: &[u8],
1895 ) {
1896 if hunk.line_range.is_empty() {
1897 // Pure insertion — reference the old line to insert after when in bounds.
1898 if let Some(after) = last_old_line
1899 && let Some(&hash) = old_hashes.get(after)
1900 {
1901 write!(
1902 result,
1903 "{INSERT_COMMAND_MARKER}{}\n",
1904 LineRef { index: after, hash }
1905 )
1906 .unwrap();
1907 } else {
1908 result.push_str(INSERT_COMMAND_MARKER);
1909 result.push('\n');
1910 }
1911 } else {
1912 let start = hunk.line_range.start;
1913 let end_exclusive = hunk.line_range.end;
1914 let deleted_line_count = end_exclusive.saturating_sub(start);
1915
1916 if deleted_line_count == 1 {
1917 if let Some(&hash) = old_hashes.get(start) {
1918 write!(
1919 result,
1920 "{SET_COMMAND_MARKER}{}\n",
1921 LineRef { index: start, hash }
1922 )
1923 .unwrap();
1924 } else {
1925 result.push_str(SET_COMMAND_MARKER);
1926 result.push('\n');
1927 }
1928 } else {
1929 let end_inclusive = end_exclusive - 1;
1930 match (
1931 old_hashes.get(start).copied(),
1932 old_hashes.get(end_inclusive).copied(),
1933 ) {
1934 (Some(start_hash), Some(end_hash)) => {
1935 write!(
1936 result,
1937 "{SET_COMMAND_MARKER}{}-{}\n",
1938 LineRef {
1939 index: start,
1940 hash: start_hash
1941 },
1942 LineRef {
1943 index: end_inclusive,
1944 hash: end_hash
1945 }
1946 )
1947 .unwrap();
1948 }
1949 _ => {
1950 result.push_str(SET_COMMAND_MARKER);
1951 result.push('\n');
1952 }
1953 }
1954 }
1955 }
1956 for (line_offset, line) in hunk.new_text_lines.iter().enumerate() {
1957 if let Some((cursor_line_offset, char_offset)) = hunk.cursor_line_offset_in_new_text
1958 && line_offset == cursor_line_offset
1959 {
1960 result.push_str(&line[..char_offset]);
1961 result.push_str(CURSOR_MARKER);
1962 result.push_str(&line[char_offset..]);
1963 continue;
1964 }
1965
1966 result.push_str(line);
1967 }
1968 }
1969
1970 for raw_line in patch.split_inclusive('\n') {
1971 if raw_line.starts_with("@@") {
1972 // Flush any pending change hunk from a previous patch hunk.
1973 if let Some(hunk) = current_hunk.take() {
1974 flush_hunk(hunk, last_old_line_before_hunk, &mut result, &old_hashes);
1975 }
1976
1977 // Parse hunk header: @@ -old_start[,old_count] +new_start[,new_count] @@
1978 // We intentionally do not trust old_start as a direct local index into `old_text`,
1979 // because some patches are produced against a larger file region and carry
1980 // non-local line numbers. We keep indexing local by advancing from parsed patch lines.
1981 if first_hunk {
1982 new_text_byte_offset = 0;
1983 first_hunk = false;
1984 }
1985 continue;
1986 }
1987
1988 if raw_line.starts_with("---") || raw_line.starts_with("+++") {
1989 continue;
1990 }
1991 if raw_line.starts_with("\\ No newline") {
1992 continue;
1993 }
1994
1995 if raw_line.starts_with('-') {
1996 // Extend or start a change hunk with this deleted old line.
1997 match &mut current_hunk {
1998 Some(Hunk {
1999 line_range: range, ..
2000 }) => range.end = old_line_index + 1,
2001 None => {
2002 current_hunk = Some(Hunk {
2003 line_range: old_line_index..old_line_index + 1,
2004 new_text_lines: Vec::new(),
2005 cursor_line_offset_in_new_text: None,
2006 });
2007 }
2008 }
2009 old_line_index += 1;
2010 } else if let Some(added_content) = raw_line.strip_prefix('+') {
2011 // Place cursor marker if cursor_offset falls within this line.
2012 let mut cursor_line_offset = None;
2013 if let Some(cursor_off) = cursor_offset
2014 && (first_hunk
2015 || cursor_off >= new_text_byte_offset
2016 && cursor_off <= new_text_byte_offset + added_content.len())
2017 {
2018 let line_offset = added_content.floor_char_boundary(
2019 cursor_off
2020 .saturating_sub(new_text_byte_offset)
2021 .min(added_content.len()),
2022 );
2023 cursor_line_offset = Some(line_offset);
2024 }
2025
2026 new_text_byte_offset += added_content.len();
2027
2028 let hunk = current_hunk.get_or_insert(Hunk {
2029 line_range: old_line_index..old_line_index,
2030 new_text_lines: vec![],
2031 cursor_line_offset_in_new_text: None,
2032 });
2033 hunk.new_text_lines.push(added_content);
2034 hunk.cursor_line_offset_in_new_text = cursor_line_offset
2035 .map(|offset_in_line| (hunk.new_text_lines.len() - 1, offset_in_line));
2036 } else {
2037 // Context line (starts with ' ' or is empty).
2038 if let Some(hunk) = current_hunk.take() {
2039 flush_hunk(hunk, last_old_line_before_hunk, &mut result, &old_hashes);
2040 }
2041 last_old_line_before_hunk = Some(old_line_index);
2042 old_line_index += 1;
2043 let content = raw_line.strip_prefix(' ').unwrap_or(raw_line);
2044 new_text_byte_offset += content.len();
2045 }
2046 }
2047
2048 // Flush final group.
2049 if let Some(hunk) = current_hunk.take() {
2050 flush_hunk(hunk, last_old_line_before_hunk, &mut result, &old_hashes);
2051 }
2052
2053 // Trim a single trailing newline.
2054 if result.ends_with('\n') {
2055 result.pop();
2056 }
2057
2058 if result.is_empty() {
2059 return Ok(NO_EDITS_COMMAND_MARKER.to_string());
2060 }
2061
2062 Ok(result)
2063 }
2064
2065 #[cfg(test)]
2066 mod tests {
2067 use super::*;
2068 use indoc::indoc;
2069
/// Table-driven check of `hashline::write_cursor_excerpt_section`.
///
/// Every case renders `context` for the file `test.rs` with the given
/// editable range and cursor offset, then compares the whole rendered prompt
/// section against `expected`.
///
/// In the expectations each editable line appears as
/// `<row>:<two hex digits>|<text>`; the hex value is presumably a per-line
/// content hash produced by `hashline` — confirm against that module.
#[test]
fn test_format_cursor_region() {
    // One rendering scenario.
    struct Case {
        // Label reported when the assertion fails.
        name: &'static str,
        // Full excerpt text passed to the formatter.
        context: &'static str,
        // Byte range of `context` that is rendered as editable lines.
        editable_range: Range<usize>,
        // Cursor position as a byte offset into `context`.
        cursor_offset: usize,
        // Exact prompt text the formatter must produce.
        expected: &'static str,
    }

    let cases = [
        Case {
            name: "basic_cursor_placement",
            context: "hello world\n",
            editable_range: 0..12,
            cursor_offset: 5,
            expected: indoc! {"
                <|file_sep|>test.rs
                <|fim_prefix|>
                <|fim_middle|>current
                0:5c|hello<|user_cursor|> world
                <|fim_suffix|>
                <|fim_middle|>updated
            "},
        },
        Case {
            name: "multiline_cursor_on_second_line",
            context: "aaa\nbbb\nccc\n",
            editable_range: 0..12,
            cursor_offset: 5, // byte 5 → 1 byte into "bbb"
            expected: indoc! {"
                <|file_sep|>test.rs
                <|fim_prefix|>
                <|fim_middle|>current
                0:23|aaa
                1:26|b<|user_cursor|>bb
                2:29|ccc
                <|fim_suffix|>
                <|fim_middle|>updated
            "},
        },
        Case {
            name: "no_trailing_newline_in_context",
            context: "line1\nline2",
            editable_range: 0..11,
            cursor_offset: 3,
            expected: indoc! {"
                <|file_sep|>test.rs
                <|fim_prefix|>
                <|fim_middle|>current
                0:d9|lin<|user_cursor|>e1
                1:da|line2
                <|fim_suffix|>
                <|fim_middle|>updated
            "},
        },
        Case {
            name: "leading_newline_in_editable_region",
            context: "\nabc\n",
            editable_range: 0..5,
            cursor_offset: 2, // byte 2 = 'a' in "abc" (after leading \n)
            expected: indoc! {"
                <|file_sep|>test.rs
                <|fim_prefix|>
                <|fim_middle|>current
                0:00|
                1:26|a<|user_cursor|>bc
                <|fim_suffix|>
                <|fim_middle|>updated
            "},
        },
        Case {
            name: "with_suffix",
            context: "abc\ndef",
            editable_range: 0..4, // editable region = "abc\n", suffix = "def"
            cursor_offset: 2,
            expected: indoc! {"
                <|file_sep|>test.rs
                <|fim_prefix|>
                <|fim_middle|>current
                0:26|ab<|user_cursor|>c
                <|fim_suffix|>
                def
                <|fim_middle|>updated
            "},
        },
        Case {
            name: "unicode_two_byte_chars",
            context: "héllo\n",
            editable_range: 0..7,
            cursor_offset: 3, // byte 3 = after "hé" (h=1 byte, é=2 bytes), before "llo"
            expected: indoc! {"
                <|file_sep|>test.rs
                <|fim_prefix|>
                <|fim_middle|>current
                0:1b|hé<|user_cursor|>llo
                <|fim_suffix|>
                <|fim_middle|>updated
            "},
        },
        Case {
            name: "unicode_three_byte_chars",
            context: "日本語\n",
            editable_range: 0..10,
            cursor_offset: 6, // byte 6 = after "日本" (3+3 bytes), before "語"
            expected: indoc! {"
                <|file_sep|>test.rs
                <|fim_prefix|>
                <|fim_middle|>current
                0:80|日本<|user_cursor|>語
                <|fim_suffix|>
                <|fim_middle|>updated
            "},
        },
        Case {
            name: "unicode_four_byte_chars",
            context: "a🌍b\n",
            editable_range: 0..7,
            cursor_offset: 5, // byte 5 = after "a🌍" (1+4 bytes), before "b"
            expected: indoc! {"
                <|file_sep|>test.rs
                <|fim_prefix|>
                <|fim_middle|>current
                0:6b|a🌍<|user_cursor|>b
                <|fim_suffix|>
                <|fim_middle|>updated
            "},
        },
        Case {
            name: "cursor_at_start_of_region_not_placed",
            context: "abc\n",
            editable_range: 0..4,
            cursor_offset: 0, // cursor_offset(0) > offset(0) is false → cursor not placed
            expected: indoc! {"
                <|file_sep|>test.rs
                <|fim_prefix|>
                <|fim_middle|>current
                0:26|abc
                <|fim_suffix|>
                <|fim_middle|>updated
            "},
        },
        Case {
            name: "cursor_at_end_of_line_not_placed",
            context: "abc\ndef\n",
            editable_range: 0..8,
            cursor_offset: 3, // byte 3 = the \n after "abc" → falls between lines, not placed
            expected: indoc! {"
                <|file_sep|>test.rs
                <|fim_prefix|>
                <|fim_middle|>current
                0:26|abc
                1:2f|def
                <|fim_suffix|>
                <|fim_middle|>updated
            "},
        },
        Case {
            name: "cursor_offset_relative_to_context_not_editable_region",
            // cursor_offset is relative to `context`, so when editable_range.start > 0,
            // write_cursor_excerpt_section must subtract it before comparing against
            // per-line offsets within the editable region.
            context: "pre\naaa\nbbb\nsuf\n",
            editable_range: 4..12, // editable region = "aaa\nbbb\n"
            cursor_offset: 9, // byte 9 in context = second 'b' in "bbb"
            expected: indoc! {"
                <|file_sep|>test.rs
                <|fim_prefix|>
                pre
                <|fim_middle|>current
                0:23|aaa
                1:26|b<|user_cursor|>bb
                <|fim_suffix|>
                suf
                <|fim_middle|>updated
            "},
        },
    ];

    // Render every case into a fresh buffer so cases cannot influence each other.
    for case in &cases {
        let mut prompt = String::new();
        hashline::write_cursor_excerpt_section(
            &mut prompt,
            Path::new("test.rs"),
            case.context,
            &case.editable_range,
            case.cursor_offset,
        );
        assert_eq!(prompt, case.expected, "failed case: {}", case.name);
    }
}
2261
/// Table-driven check of `hashline::apply_edit_commands`.
///
/// Each case applies `model_output` (a sequence of `<|set|>` / `<|insert|>`
/// commands, or text without commands) to `original` and compares the result
/// with `expected`. Cases whose command is malformed, out of bounds, or
/// carries a non-matching line reference expect the original text back,
/// except where the case name states otherwise.
#[test]
fn test_apply_edit_commands() {
    // One application scenario.
    struct Case {
        // Label reported when the assertion fails.
        name: &'static str,
        // Text the commands are applied to.
        original: &'static str,
        // Raw model output containing the edit commands.
        model_output: &'static str,
        // Text expected after applying the commands.
        expected: &'static str,
    }

    let cases = vec![
        Case {
            name: "set_single_line",
            original: indoc! {"
                let mut total = 0;
                for product in products {
                total += ;
                }
                total
            "},
            model_output: indoc! {"
                <|set|>2:87
                total += product.price;
            "},
            expected: indoc! {"
                let mut total = 0;
                for product in products {
                total += product.price;
                }
                total
            "},
        },
        Case {
            name: "set_range",
            original: indoc! {"
                fn foo() {
                let x = 1;
                let y = 2;
                let z = 3;
                }
            "},
            model_output: indoc! {"
                <|set|>1:46-3:4a
                let sum = 6;
            "},
            expected: indoc! {"
                fn foo() {
                let sum = 6;
                }
            "},
        },
        Case {
            name: "insert_after_line",
            original: indoc! {"
                fn main() {
                let x = 1;
                }
            "},
            model_output: indoc! {"
                <|insert|>1:46
                let y = 2;
            "},
            expected: indoc! {"
                fn main() {
                let x = 1;
                let y = 2;
                }
            "},
        },
        Case {
            name: "insert_before_first",
            original: indoc! {"
                let x = 1;
                let y = 2;
            "},
            model_output: indoc! {"
                <|insert|>
                use std::io;
            "},
            expected: indoc! {"
                use std::io;
                let x = 1;
                let y = 2;
            "},
        },
        Case {
            name: "set_with_cursor_marker",
            original: indoc! {"
                fn main() {
                println!();
                }
            "},
            model_output: indoc! {"
                <|set|>1:34
                eprintln!(\"<|user_cursor|>\");
            "},
            expected: indoc! {"
                fn main() {
                eprintln!(\"<|user_cursor|>\");
                }
            "},
        },
        Case {
            name: "multiple_set_commands",
            original: indoc! {"
                aaa
                bbb
                ccc
                ddd
            "},
            model_output: indoc! {"
                <|set|>0:23
                AAA
                <|set|>2:29
                CCC
            "},
            expected: indoc! {"
                AAA
                bbb
                CCC
                ddd
            "},
        },
        Case {
            name: "set_range_multiline_replacement",
            original: indoc! {"
                fn handle_submit() {
                }

                fn handle_keystroke() {
            "},
            model_output: indoc! {"
                <|set|>0:3f-1:7d
                fn handle_submit(modal_state: &mut ModalState) {
                <|user_cursor|>
                }
            "},
            expected: indoc! {"
                fn handle_submit(modal_state: &mut ModalState) {
                <|user_cursor|>
                }

                fn handle_keystroke() {
            "},
        },
        Case {
            name: "no_edit_commands_returns_original",
            original: indoc! {"
                hello
                world
            "},
            model_output: "some random text with no commands",
            expected: indoc! {"
                hello
                world
            "},
        },
        Case {
            name: "no_edits_command_returns_original",
            original: indoc! {"
                hello
                world
            "},
            model_output: "<|no_edits|>",
            expected: indoc! {"
                hello
                world
            "},
        },
        Case {
            name: "wrong_hash_set_ignored",
            original: indoc! {"
                aaa
                bbb
            "},
            model_output: indoc! {"
                <|set|>0:ff
                ZZZ
            "},
            expected: indoc! {"
                aaa
                bbb
            "},
        },
        Case {
            name: "insert_and_set_combined",
            original: indoc! {"
                alpha
                beta
                gamma
            "},
            model_output: indoc! {"
                <|set|>0:06
                ALPHA
                <|insert|>1:9c
                beta_extra
            "},
            expected: indoc! {"
                ALPHA
                beta
                beta_extra
                gamma
            "},
        },
        Case {
            name: "no_trailing_newline_preserved",
            original: "hello\nworld",
            model_output: indoc! {"
                <|set|>0:14
                HELLO
            "},
            expected: "HELLO\nworld",
        },
        Case {
            name: "set_range_hash_mismatch_in_end_bound",
            original: indoc! {"
                one
                two
                three
            "},
            model_output: indoc! {"
                <|set|>0:42-2:ff
                ONE_TWO_THREE
            "},
            expected: indoc! {"
                one
                two
                three
            "},
        },
        Case {
            name: "set_range_start_greater_than_end_ignored",
            original: indoc! {"
                a
                b
                c
            "},
            model_output: indoc! {"
                <|set|>2:63-1:62
                X
            "},
            expected: indoc! {"
                a
                b
                c
            "},
        },
        Case {
            name: "insert_out_of_bounds_ignored",
            original: indoc! {"
                x
                y
            "},
            model_output: indoc! {"
                <|insert|>99:aa
                z
            "},
            expected: indoc! {"
                x
                y
            "},
        },
        Case {
            name: "set_out_of_bounds_ignored",
            original: indoc! {"
                x
                y
            "},
            model_output: indoc! {"
                <|set|>99:aa
                z
            "},
            expected: indoc! {"
                x
                y
            "},
        },
        Case {
            name: "malformed_set_command_ignored",
            original: indoc! {"
                alpha
                beta
            "},
            model_output: indoc! {"
                <|set|>not-a-line-ref
                UPDATED
            "},
            expected: indoc! {"
                alpha
                beta
            "},
        },
        Case {
            name: "malformed_insert_hash_treated_as_before_first",
            original: indoc! {"
                alpha
                beta
            "},
            model_output: indoc! {"
                <|insert|>1:nothex
                preamble
            "},
            expected: indoc! {"
                preamble
                alpha
                beta
            "},
        },
        Case {
            name: "set_then_insert_same_target_orders_insert_after_replacement",
            original: indoc! {"
                cat
                dog
            "},
            model_output: indoc! {"
                <|set|>0:38
                CAT
                <|insert|>0:38
                TAIL
            "},
            expected: indoc! {"
                CAT
                TAIL
                dog
            "},
        },
        Case {
            name: "overlapping_set_ranges_last_wins",
            original: indoc! {"
                a
                b
                c
                d
            "},
            model_output: indoc! {"
                <|set|>0:61-2:63
                FIRST
                <|set|>1:62-3:64
                SECOND
            "},
            expected: indoc! {"
                FIRST
                d
            "},
        },
        Case {
            name: "insert_before_first_and_after_line",
            original: indoc! {"
                a
                b
            "},
            model_output: indoc! {"
                <|insert|>
                HEAD
                <|insert|>0:61
                MID
            "},
            expected: indoc! {"
                HEAD
                a
                MID
                b
            "},
        },
    ];

    // Apply each case independently and compare the full resulting text.
    for case in &cases {
        let result = hashline::apply_edit_commands(case.original, &case.model_output);
        assert_eq!(result, case.expected, "failed case: {}", case.name);
    }
}
2632
/// Checks which model outputs `hashline::output_has_edit_commands` treats as
/// carrying edit commands.
#[test]
fn test_output_has_edit_commands() {
    // Outputs containing a set/insert marker anywhere must be detected.
    let outputs_with_markers = [
        format!("{}0:ab\nnew", SET_COMMAND_MARKER),
        format!("{}0:ab\nnew", INSERT_COMMAND_MARKER),
        format!("some text\n{}1:cd\nstuff", SET_COMMAND_MARKER),
    ];
    for output in &outputs_with_markers {
        assert!(hashline::output_has_edit_commands(output));
    }

    // Plain text, including the bare word `NO_EDITS`, must not be detected.
    for output in ["just plain text", "NO_EDITS"] {
        assert!(!hashline::output_has_edit_commands(output));
    }

    // The explicit no-edits token counts as a command.
    assert!(hashline::output_has_edit_commands("<|no_edits|>"));
}
2651
2652 // ---- hashline::patch_to_edit_commands round-trip tests ----
2653
/// Round-trip check of `hashline::patch_to_edit_commands`.
///
/// For each case the unified diff in `patch` is converted to edit commands
/// against `old`; the commands must be recognised by
/// `hashline::output_has_edit_commands`, and applying them to `old` with
/// `hashline::apply_edit_commands` must yield `expected_new`.
#[test]
fn test_patch_to_edit_commands() {
    // One round-trip scenario.
    struct Case {
        // Label reported when an assertion fails.
        name: &'static str,
        // Text the patch is written against.
        old: &'static str,
        // Unified diff to convert.
        patch: &'static str,
        // Text expected after applying the generated commands; may contain
        // the cursor marker, whose position is used as the cursor offset.
        expected_new: &'static str,
    }

    let cases = [
        Case {
            name: "single_line_replacement",
            old: indoc! {"
                let mut total = 0;
                for product in products {
                total += ;
                }
                total
            "},
            patch: indoc! {"
                @@ -1,5 +1,5 @@
                let mut total = 0;
                for product in products {
                - total += ;
                + total += product.price;
                }
                total
            "},
            expected_new: indoc! {"
                let mut total = 0;
                for product in products {
                total += product.price;
                }
                total
            "},
        },
        Case {
            name: "multiline_replacement",
            old: indoc! {"
                fn foo() {
                let x = 1;
                let y = 2;
                let z = 3;
                }
            "},
            patch: indoc! {"
                @@ -1,5 +1,3 @@
                fn foo() {
                - let x = 1;
                - let y = 2;
                - let z = 3;
                + let sum = 1 + 2 + 3;
                }
            "},
            expected_new: indoc! {"
                fn foo() {
                let sum = 1 + 2 + 3;
                }
            "},
        },
        Case {
            name: "insertion",
            old: indoc! {"
                fn main() {
                let x = 1;
                }
            "},
            patch: indoc! {"
                @@ -1,3 +1,4 @@
                fn main() {
                let x = 1;
                + let y = 2;
                }
            "},
            expected_new: indoc! {"
                fn main() {
                let x = 1;
                let y = 2;
                }
            "},
        },
        Case {
            name: "insertion_before_first",
            old: indoc! {"
                let x = 1;
                let y = 2;
            "},
            patch: indoc! {"
                @@ -1,2 +1,3 @@
                +use std::io;
                let x = 1;
                let y = 2;
            "},
            expected_new: indoc! {"
                use std::io;
                let x = 1;
                let y = 2;
            "},
        },
        Case {
            name: "deletion",
            old: indoc! {"
                aaa
                bbb
                ccc
                ddd
            "},
            patch: indoc! {"
                @@ -1,4 +1,2 @@
                aaa
                -bbb
                -ccc
                ddd
            "},
            expected_new: indoc! {"
                aaa
                ddd
            "},
        },
        Case {
            name: "multiple_changes",
            old: indoc! {"
                alpha
                beta
                gamma
                delta
                epsilon
            "},
            patch: indoc! {"
                @@ -1,5 +1,5 @@
                -alpha
                +ALPHA
                beta
                gamma
                -delta
                +DELTA
                epsilon
            "},
            expected_new: indoc! {"
                ALPHA
                beta
                gamma
                DELTA
                epsilon
            "},
        },
        Case {
            name: "replace_with_insertion",
            old: indoc! {r#"
                fn handle() {
                modal_state.close();
                modal_state.dismiss();
            "#},
            patch: indoc! {r#"
                @@ -1,3 +1,4 @@
                fn handle() {
                modal_state.close();
                + eprintln!("");
                modal_state.dismiss();
            "#},
            expected_new: indoc! {r#"
                fn handle() {
                modal_state.close();
                eprintln!("");
                modal_state.dismiss();
            "#},
        },
        Case {
            name: "complete_replacement",
            old: indoc! {"
                aaa
                bbb
                ccc
            "},
            patch: indoc! {"
                @@ -1,3 +1,3 @@
                -aaa
                -bbb
                -ccc
                +xxx
                +yyy
                +zzz
            "},
            expected_new: indoc! {"
                xxx
                yyy
                zzz
            "},
        },
        Case {
            name: "add_function_body",
            old: indoc! {"
                fn foo() {
                modal_state.dismiss();
                }

                fn

                fn handle_keystroke() {
            "},
            patch: indoc! {"
                @@ -1,6 +1,8 @@
                fn foo() {
                modal_state.dismiss();
                }

                -fn
                +fn handle_submit() {
                + todo()
                +}

                fn handle_keystroke() {
            "},
            expected_new: indoc! {"
                fn foo() {
                modal_state.dismiss();
                }

                fn handle_submit() {
                todo()
                }

                fn handle_keystroke() {
            "},
        },
        Case {
            name: "with_cursor_offset",
            old: indoc! {r#"
                fn main() {
                println!();
                }
            "#},
            patch: indoc! {r#"
                @@ -1,3 +1,3 @@
                fn main() {
                - println!();
                + eprintln!("");
                }
            "#},
            expected_new: indoc! {r#"
                fn main() {
                eprintln!("<|user_cursor|>");
                }
            "#},
        },
        Case {
            name: "non_local_hunk_header_pure_insertion_repro",
            old: indoc! {"
                aaa
                bbb
            "},
            patch: indoc! {"
                @@ -20,2 +20,3 @@
                aaa
                +xxx
                bbb
            "},
            expected_new: indoc! {"
                aaa
                xxx
                bbb
            "},
        },
        Case {
            name: "empty_patch_produces_no_edits_marker",
            old: indoc! {"
                aaa
                bbb
            "},
            patch: "@@ -20,2 +20,3 @@\n",
            expected_new: indoc! {"
                aaa
                bbb
            "},
        },
    ];

    for case in &cases {
        // The cursor_offset for patch_to_edit_commands is relative to
        // the first hunk's new text (context + additions). We compute
        // it by finding where the marker sits in the expected output
        // (which mirrors the new text of the hunk).
        let cursor_offset = case.expected_new.find(CURSOR_MARKER);

        // Conversion failures abort the test with the case name attached.
        let commands =
            hashline::patch_to_edit_commands(case.old, case.patch, cursor_offset)
                .unwrap_or_else(|e| panic!("failed case {}: {e}", case.name));

        assert!(
            hashline::output_has_edit_commands(&commands),
            "case {}: expected edit commands, got: {commands:?}",
            case.name,
        );

        // Round trip: applying the generated commands must reproduce the expected text.
        let applied = hashline::apply_edit_commands(case.old, &commands);
        assert_eq!(applied, case.expected_new, "case {}", case.name);
    }
}
2952 }
2953}
2954
2955pub mod seed_coder {
2956 //! Seed-Coder prompt format using SPM (Suffix-Prefix-Middle) FIM mode.
2957 //!
2958 //! Seed-Coder uses different FIM tokens and order than Qwen:
2959 //! - SPM order: suffix comes FIRST, then prefix, then middle
2960 //! - Tokens: `<[fim-suffix]>`, `<[fim-prefix]>`, `<[fim-middle]>`
2961 //! - File markers: StarCoder-style `<filename>path` (single token + path)
2962 //!
2963 //! All context (related files, edit history) goes in the PREFIX section.
2964 //! The suffix contains only code after the editable region.
2965 //!
2966 //! Example prompt:
2967 //!
2968 //! <[fim-suffix]>
2969 //! code after editable region
2970 //! <[fim-prefix]><filename>related/file.py
2971 //! related file content
2972 //!
2973 //! <filename>edit_history
2974 //! --- a/some_file.py
2975 //! +++ b/some_file.py
2976 //! -old
2977 //! +new
2978 //!
2979 //! <filename>path/to/target_file.py
2980 //! code before editable region
2981 //! <<<<<<< CURRENT
2982 //! code that
2983 //! needs to<|user_cursor|>
2984 //! be rewritten
2985 //! =======
2986 //! <[fim-middle]>
2987 //!
2988 //! Expected output (model generates):
2989 //!
2990 //! updated
2991 //! code with
2992 //! changes applied
2993 //! >>>>>>> UPDATED
2994
2995 use super::*;
2996
/// Token that opens the suffix section; `build_suffix_section` writes it first.
pub const FIM_SUFFIX: &str = "<[fim-suffix]>";
/// Token that opens the prefix section; written right after the suffix section.
pub const FIM_PREFIX: &str = "<[fim-prefix]>";
/// Token that ends the prompt; the model's output follows it.
pub const FIM_MIDDLE: &str = "<[fim-middle]>";
/// Marker placed directly before a file path (or pseudo-path such as `edit_history`).
pub const FILE_MARKER: &str = "<filename>";

/// Line written immediately before the editable region.
pub const START_MARKER: &str = "<<<<<<< CURRENT\n";
/// Line written immediately after the editable region.
pub const SEPARATOR: &str = "=======\n";
/// Line that terminates the expected model output.
pub const END_MARKER: &str = ">>>>>>> UPDATED\n";

/// Line used in place of replacement text when a patch contains no changes.
pub const NO_EDITS: &str = "NO_EDITS\n";
3007
3008 pub fn special_tokens() -> &'static [&'static str] {
3009 &[
3010 FIM_SUFFIX,
3011 FIM_PREFIX,
3012 FIM_MIDDLE,
3013 FILE_MARKER,
3014 START_MARKER,
3015 SEPARATOR,
3016 END_MARKER,
3017 CURSOR_MARKER,
3018 ]
3019 }
3020
3021 pub fn write_cursor_excerpt_section(
3022 prompt: &mut String,
3023 path: &Path,
3024 context: &str,
3025 editable_range: &Range<usize>,
3026 cursor_offset: usize,
3027 ) {
3028 let section = build_cursor_prefix_section(path, context, editable_range, cursor_offset);
3029 prompt.push_str(§ion);
3030 }
3031
3032 pub fn format_prompt_with_budget(
3033 path: &Path,
3034 context: &str,
3035 editable_range: &Range<usize>,
3036 cursor_offset: usize,
3037 events: &[Arc<Event>],
3038 related_files: &[RelatedFile],
3039 max_tokens: usize,
3040 ) -> String {
3041 let cursor_prefix_section =
3042 build_cursor_prefix_section(path, context, editable_range, cursor_offset);
3043 assemble_fim_prompt(
3044 context,
3045 editable_range,
3046 &cursor_prefix_section,
3047 events,
3048 related_files,
3049 max_tokens,
3050 )
3051 }
3052
3053 pub fn assemble_fim_prompt(
3054 context: &str,
3055 editable_range: &Range<usize>,
3056 cursor_prefix_section: &str,
3057 events: &[Arc<Event>],
3058 related_files: &[RelatedFile],
3059 max_tokens: usize,
3060 ) -> String {
3061 let suffix_section = build_suffix_section(context, editable_range);
3062
3063 let suffix_tokens = estimate_tokens(suffix_section.len() + FIM_PREFIX.len());
3064 let cursor_prefix_tokens = estimate_tokens(cursor_prefix_section.len() + FIM_MIDDLE.len());
3065 let budget_after_cursor = max_tokens.saturating_sub(suffix_tokens + cursor_prefix_tokens);
3066
3067 let edit_history_section = super::format_edit_history_within_budget(
3068 events,
3069 FILE_MARKER,
3070 "edit_history",
3071 budget_after_cursor,
3072 max_edit_event_count_for_format(&ZetaFormat::V0211SeedCoder),
3073 );
3074 let edit_history_tokens = estimate_tokens(edit_history_section.len() + "\n".len());
3075 let budget_after_edit_history =
3076 budget_after_cursor.saturating_sub(edit_history_tokens + "\n".len());
3077
3078 let related_files_section = super::format_related_files_within_budget(
3079 related_files,
3080 FILE_MARKER,
3081 "",
3082 budget_after_edit_history,
3083 );
3084
3085 let mut prompt = String::new();
3086 prompt.push_str(&suffix_section);
3087 prompt.push_str(FIM_PREFIX);
3088 prompt.push_str(&related_files_section);
3089 if !related_files_section.is_empty() {
3090 prompt.push('\n');
3091 }
3092 prompt.push_str(&edit_history_section);
3093 if !edit_history_section.is_empty() {
3094 prompt.push('\n');
3095 }
3096 prompt.push_str(cursor_prefix_section);
3097 prompt.push_str(FIM_MIDDLE);
3098
3099 prompt
3100 }
3101
3102 fn build_suffix_section(context: &str, editable_range: &Range<usize>) -> String {
3103 let mut section = String::new();
3104 section.push_str(FIM_SUFFIX);
3105 section.push_str(&context[editable_range.end..]);
3106 if !section.ends_with('\n') {
3107 section.push('\n');
3108 }
3109 section
3110 }
3111
3112 fn build_cursor_prefix_section(
3113 path: &Path,
3114 context: &str,
3115 editable_range: &Range<usize>,
3116 cursor_offset: usize,
3117 ) -> String {
3118 let mut section = String::new();
3119 let path_str = path.to_string_lossy();
3120 write!(section, "{}{}\n", FILE_MARKER, path_str).ok();
3121
3122 section.push_str(&context[..editable_range.start]);
3123 section.push_str(START_MARKER);
3124 section.push_str(&context[editable_range.start..cursor_offset]);
3125 section.push_str(CURSOR_MARKER);
3126 section.push_str(&context[cursor_offset..editable_range.end]);
3127 if !section.ends_with('\n') {
3128 section.push('\n');
3129 }
3130 section.push_str(SEPARATOR);
3131 section
3132 }
3133
3134 /// Format patch as containing no changes if it's empty; otherwise return None.
3135 pub(crate) fn no_edits(patch: &str) -> Option<String> {
3136 // Count lines in the patch
3137 let empty_patch = patch.lines().count() <= 3;
3138 if empty_patch {
3139 Some(format!("{NO_EDITS}{END_MARKER}"))
3140 } else {
3141 None
3142 }
3143 }
3144}
3145
3146pub mod v0304_variable_edit {
3147 //! A prompt format with no fixed editable region. The entire context is shown
3148 //! to the model, and it chooses which text to replace by outputting surrounding
3149 //! context lines with `<|fim_middle|>` and `<|fim_suffix|>` delimiting the new
3150 //! text.
3151 //!
3152 //! Example prompt:
3153 //!
3154 //! <|file_sep|>path/to/file.py
3155 //! zero
3156 //! one
3157 //! two
3158 //! three<|user_cursor|>
3159 //! four
3160 //! five
3161 //! <|fim_prefix|>
//!
3163 //! Expected output (model generates):
3164 //!
3165 //! two
3166 //! <|fim_middle|>
3167 //! THREE
3168 //! <|fim_suffix|>
3169 //! four
3170 //!
3171 //! The output means: find "two\n...\nfour" in the context, and replace
3172 //! everything between "two\n" and "four" with "THREE\n".
3173
3174 use super::*;
3175
3176 pub fn special_tokens() -> &'static [&'static str] {
3177 &[
3178 "<|fim_prefix|>",
3179 "<|fim_suffix|>",
3180 "<|fim_middle|>",
3181 "<|file_sep|>",
3182 CURSOR_MARKER,
3183 ]
3184 }
3185
3186 pub fn write_cursor_excerpt_section(
3187 prompt: &mut String,
3188 path: &Path,
3189 context: &str,
3190 cursor_offset: usize,
3191 ) {
3192 let path_str = path.to_string_lossy();
3193 write!(prompt, "<|file_sep|>{}\n", path_str).ok();
3194
3195 prompt.push_str(&context[..cursor_offset]);
3196 prompt.push_str(CURSOR_MARKER);
3197 prompt.push_str(&context[cursor_offset..]);
3198 if !prompt.ends_with('\n') {
3199 prompt.push('\n');
3200 }
3201 prompt.push_str("<|fim_prefix|>\n")
3202 }
3203
3204 /// Apply a variable-edit model output to the original context text.
3205 ///
3206 /// The model output has the form:
3207 ///
3208 /// - prefix context lines
3209 /// - `<|fim_middle|>`
3210 /// - new text
3211 /// - `<|fim_suffix|>`
3212 /// - suffix context lines
3213 ///
3214 /// We locate the prefix/suffix context lines in the original text and replace
3215 /// everything between them with the new text.
3216 pub fn apply_variable_edit(
3217 context: &str,
3218 model_output: &str,
3219 ) -> Result<(Range<usize>, String)> {
3220 let (prefix_context, rest) = model_output
3221 .split_once("<|fim_middle|>\n")
3222 .or_else(|| model_output.split_once("<|fim_middle|>"))
3223 .ok_or_else(|| anyhow::anyhow!("missing <|fim_middle|> in model output"))?;
3224
3225 let (new_text, suffix_context) = rest
3226 .split_once("<|fim_suffix|>\n")
3227 .or_else(|| rest.split_once("<|fim_suffix|>"))
3228 .unwrap_or((rest, ""));
3229
3230 let suffix_context = if prefix_context.is_empty() && !suffix_context.is_empty() {
3231 suffix_context.strip_prefix('\n').unwrap_or(suffix_context)
3232 } else {
3233 suffix_context
3234 };
3235
3236 let prefix_offset = find_substring_at_line_boundary(context, prefix_context)
3237 .ok_or_else(|| anyhow!("could not locate prefix lines"))?
3238 + prefix_context.len();
3239 let suffix_offset = if suffix_context.is_empty() {
3240 context.len()
3241 } else {
3242 find_substring_at_line_boundary(&context[prefix_offset..], suffix_context)
3243 .ok_or_else(|| anyhow!("could not locate suffix lines"))?
3244 + prefix_offset
3245 };
3246
3247 let edit_range = prefix_offset..suffix_offset;
3248 return Ok((edit_range, new_text.to_string()));
3249 }
3250
/// Finds the first occurrence of `needle` in `haystack` that begins at the
/// start of a line (offset 0 or directly after a `'\n'`).
///
/// Returns the byte offset of that occurrence, `Some(0)` for an empty
/// `needle`, and `None` when no line-start occurrence exists.
fn find_substring_at_line_boundary(haystack: &str, needle: &str) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }

    // Test every line start directly. Scanning with `match_indices` would only
    // visit non-overlapping matches, so a line-start match overlapping an
    // earlier mid-line match (e.g. "}\n}\n" in "foo }\n}\n}\n") would be missed.
    std::iter::once(0)
        .chain(haystack.match_indices('\n').map(|(newline, _)| newline + 1))
        .find(|&line_start| haystack[line_start..].starts_with(needle))
}
3261
3262 /// Convert a unified diff patch into the variable-edit output format.
3263 ///
3264 /// Parses `patch` as a unified diff against `old_text` and produces model
3265 /// output with context lines surrounding `<|fim_middle|>` / `<|fim_suffix|>`
3266 /// delimiters. The diff is resolved by content matching rather than line
3267 /// numbers.
3268 pub fn patch_to_variable_edit_output(
3269 old_text: &str,
3270 patch: &str,
3271 cursor_offset: Option<usize>,
3272 ) -> Result<String> {
3273 // Parse the unified diff into hunks. Each hunk has an `old_context`
3274 // string (context + deleted lines interleaved in order) and a list of
3275 // edits expressed as byte ranges within that context plus replacement
3276 // text.
3277 let hunks = parse_hunks(patch);
3278 if hunks.is_empty() {
3279 return Ok(String::new());
3280 }
3281
3282 // Apply each hunk by finding its old_context in the text and
3283 // performing the edits. We search forward from where the previous
3284 // hunk ended so that hunks are applied in order.
3285 let mut new_text = old_text.to_string();
3286 let mut search_from: usize = 0;
3287 let mut first_hunk_pos: Option<usize> = None;
3288
3289 for hunk in &hunks {
3290 let context_pos = new_text[search_from..]
3291 .find(&hunk.old_context)
3292 .map(|pos| pos + search_from)
3293 .ok_or_else(|| anyhow::anyhow!("could not locate hunk context in text"))?;
3294
3295 if first_hunk_pos.is_none() {
3296 first_hunk_pos = Some(context_pos);
3297 }
3298
3299 // Apply edits in reverse order so byte offsets remain valid.
3300 for edit in hunk.edits.iter().rev() {
3301 let abs_start = context_pos + edit.range.start;
3302 let abs_end = context_pos + edit.range.end;
3303 new_text.replace_range(abs_start..abs_end, &edit.text);
3304 }
3305
3306 // Advance past this hunk's region in the (now modified) text.
3307 let new_region_len: usize =
3308 hunk.edits.iter().fold(hunk.old_context.len(), |len, edit| {
3309 len + edit.text.len() - (edit.range.end - edit.range.start)
3310 });
3311 search_from = context_pos + new_region_len;
3312 }
3313
3314 // Now we have old_text and new_text. Find the changed line range by
3315 // comparing them.
3316 let old_lines: Vec<&str> = old_text.lines().collect();
3317 let new_lines: Vec<&str> = new_text.lines().collect();
3318
3319 // Find first differing line.
3320 let first_changed_row = old_lines
3321 .iter()
3322 .zip(new_lines.iter())
3323 .position(|(a, b)| a != b)
3324 .unwrap_or_else(|| old_lines.len().min(new_lines.len()));
3325
3326 // Find last differing line (from the end).
3327 let max_suffix = old_lines.len().min(new_lines.len()) - first_changed_row;
3328 let common_suffix = old_lines
3329 .iter()
3330 .rev()
3331 .zip(new_lines.iter().rev())
3332 .take(max_suffix)
3333 .take_while(|(a, b)| a == b)
3334 .count();
3335
3336 let old_end = old_lines.len() - common_suffix;
3337 let new_end = new_lines.len() - common_suffix;
3338
3339 if first_changed_row == old_end && first_changed_row == new_end {
3340 return Ok(String::new());
3341 }
3342
3343 // Build the replacement text from new_lines[first_diff..new_end].
3344 let mut merged_new_text = String::new();
3345 for line in &new_lines[first_changed_row..new_end] {
3346 merged_new_text.push_str(line);
3347 merged_new_text.push('\n');
3348 }
3349
3350 // cursor_offset is relative to the first hunk's new content in
3351 // new_text. Translate it to an offset within merged_new_text, which
3352 // only contains lines first_diff..new_end of new_text.
3353 if let Some(hunk_offset) = cursor_offset {
3354 let hunk_start = first_hunk_pos.unwrap_or(0);
3355 let absolute_pos = hunk_start + hunk_offset;
3356
3357 // Byte offset where first_diff starts in new_text.
3358 let merged_start: usize = new_lines[..first_changed_row]
3359 .iter()
3360 .map(|line| line.len() + 1)
3361 .sum();
3362
3363 if absolute_pos >= merged_start {
3364 let relative_offset = absolute_pos - merged_start;
3365 if relative_offset <= merged_new_text.len() {
3366 merged_new_text.insert_str(relative_offset, CURSOR_MARKER);
3367 }
3368 }
3369 }
3370
3371 // Build output with 2 lines of context above and below.
3372 let context_lines_count = 2;
3373 let mut prefix_start = first_changed_row.saturating_sub(context_lines_count);
3374 let mut suffix_end = (old_end + context_lines_count).min(old_lines.len());
3375
3376 fn count_matches(line_range: Range<usize>, lines: &[&str]) -> usize {
3377 let pattern = &lines[line_range];
3378 let pattern_len = pattern.len();
3379
3380 let mut count = 0;
3381 for offset in 0..=lines.len() - pattern_len {
3382 if &lines[offset..offset + pattern_len] == pattern {
3383 count += 1;
3384 }
3385 }
3386 count
3387 }
3388
3389 // Expand prefix and suffix until they are unique
3390 while prefix_start > 0 {
3391 if count_matches(prefix_start..first_changed_row, &old_lines) > 1 {
3392 prefix_start -= 1;
3393 } else {
3394 break;
3395 }
3396 }
3397 while suffix_end < old_lines.len() {
3398 if count_matches(old_end..suffix_end, &old_lines) > 1 {
3399 suffix_end += 1;
3400 } else {
3401 break;
3402 }
3403 }
3404
3405 let mut output = String::new();
3406 for line in &old_lines[prefix_start..first_changed_row] {
3407 output.push_str(line);
3408 output.push('\n');
3409 }
3410 output.push_str("<|fim_middle|>\n");
3411 output.push_str(&merged_new_text);
3412 output.push_str("<|fim_suffix|>\n");
3413 for line in &old_lines[old_end..suffix_end] {
3414 output.push_str(line);
3415 output.push('\n');
3416 }
3417
3418 Ok(output)
3419 }
3420
/// One hunk of a unified diff, expressed by content rather than line numbers.
struct ParsedHunk {
    /// The hunk's context lines and deleted lines in patch order, each
    /// terminated by `'\n'`; this is the text searched for in the original.
    old_context: String,
    /// Edits to apply to `old_context`, in the order they appear in the patch.
    edits: Vec<ParsedEdit>,
}
3425
/// A single replacement inside a hunk.
struct ParsedEdit {
    /// Byte range within the hunk's `old_context` that is replaced; empty for
    /// a pure insertion.
    range: Range<usize>,
    /// Replacement text built from the added lines; empty for a pure deletion.
    text: String,
}
3430
3431 /// Parse a unified diff into content-based hunks. Each hunk contains an
3432 /// `old_context` string (context lines + deleted lines, which together
3433 /// form the text that should be found in the original) and a list of edits
3434 /// expressed as byte ranges within that context.
3435 fn parse_hunks(patch: &str) -> Vec<ParsedHunk> {
3436 let mut hunks = Vec::new();
3437 let mut current: Option<ParsedHunk> = None;
3438
3439 for line in patch.lines() {
3440 if line.starts_with("@@") {
3441 if let Some(hunk) = current.take() {
3442 if !hunk.old_context.is_empty() || !hunk.edits.is_empty() {
3443 hunks.push(hunk);
3444 }
3445 }
3446 current = Some(ParsedHunk {
3447 old_context: String::new(),
3448 edits: Vec::new(),
3449 });
3450 } else if line.starts_with("---") || line.starts_with("+++") {
3451 continue;
3452 } else if let Some(hunk) = &mut current {
3453 if let Some(added) = line.strip_prefix('+') {
3454 let pos = hunk.old_context.len();
3455 if let Some(last_edit) = hunk.edits.last_mut() {
3456 if last_edit.range.end == pos {
3457 writeln!(&mut last_edit.text, "{added}").ok();
3458 continue;
3459 }
3460 }
3461 hunk.edits.push(ParsedEdit {
3462 range: pos..pos,
3463 text: format!("{added}\n"),
3464 });
3465 } else if let Some(removed) = line.strip_prefix('-') {
3466 let start = hunk.old_context.len();
3467 writeln!(&mut hunk.old_context, "{removed}").ok();
3468 let end = hunk.old_context.len();
3469 if let Some(last_edit) = hunk.edits.last_mut() {
3470 if last_edit.range.end == start {
3471 last_edit.range.end = end;
3472 continue;
3473 }
3474 }
3475 hunk.edits.push(ParsedEdit {
3476 range: start..end,
3477 text: String::new(),
3478 });
3479 } else {
3480 let ctx = line.strip_prefix(' ').unwrap_or(line);
3481 writeln!(&mut hunk.old_context, "{ctx}").ok();
3482 }
3483 }
3484 }
3485
3486 if let Some(hunk) = current {
3487 if !hunk.old_context.is_empty() || !hunk.edits.is_empty() {
3488 hunks.push(hunk);
3489 }
3490 }
3491
3492 hunks
3493 }
3494
3495 #[cfg(test)]
3496 mod tests {
3497 use super::*;
3498 use indoc::indoc;
3499
// Table-driven test for `apply_variable_edit`.
//
// Each case supplies an `original` text and a `model_output` made of optional
// prefix context, a `<|fim_middle|>` line, the replacement lines, a
// `<|fim_suffix|>` line and optional suffix context. The returned
// `(range, replacement)` pair is applied to `original` with `replace_range`
// and the result is compared against `expected`.
3500 #[test]
3501 fn test_apply_variable_edit() {
3502 struct Case {
3503 name: &'static str,
3504 original: &'static str,
3505 model_output: &'static str,
3506 expected: &'static str,
3507 }
3508
3509 let cases = [
3510 Case {
3511 name: "simple_single_line_replacement",
3512 original: indoc! {"
3513 zero
3514 one
3515 two
3516 three
3517 four
3518 five
3519 "},
3520 model_output: indoc! {"
3521 two
3522 <|fim_middle|>
3523 THREE
3524 <|fim_suffix|>
3525 four
3526 "},
3527 expected: indoc! {"
3528 zero
3529 one
3530 two
3531 THREE
3532 four
3533 five
3534 "},
3535 },
3536 Case {
3537 name: "multi_line_replacement",
3538 original: indoc! {"
3539 a
3540 b
3541 c
3542 d
3543 e
3544 "},
3545 model_output: indoc! {"
3546 a
3547 <|fim_middle|>
3548 B
3549 C
3550 D
3551 <|fim_suffix|>
3552 e
3553 "},
3554 expected: indoc! {"
3555 a
3556 B
3557 C
3558 D
3559 e
3560 "},
3561 },
3562 Case {
3563 name: "insertion_between_existing_lines",
3564 original: indoc! {"
3565 a
3566 b
3567 c
3568 "},
3569 model_output: indoc! {"
3570 a
3571 <|fim_middle|>
3572 X
3573 <|fim_suffix|>
3574 b
3575 "},
3576 expected: indoc! {"
3577 a
3578 X
3579 b
3580 c
3581 "},
3582 },
3583 Case {
3584 name: "deletion",
3585 original: indoc! {"
3586 a
3587 b
3588 c
3589 d
3590 "},
3591 model_output: indoc! {"
3592 a
3593 <|fim_middle|>
3594 <|fim_suffix|>
3595 c
3596 "},
3597 expected: indoc! {"
3598 a
3599 c
3600 d
3601 "},
3602 },
3603 Case {
3604 name: "replacement_at_start_no_prefix_context",
3605 original: indoc! {"
3606 a
3607 b
3608 c
3609 "},
3610 model_output: indoc! {"
3611 <|fim_middle|>
3612 X
3613 <|fim_suffix|>
3614 b
3615 "},
3616 expected: indoc! {"
3617 X
3618 b
3619 c
3620 "},
3621 },
3622 Case {
3623 name: "replacement_at_end_no_suffix_context",
3624 original: indoc! {"
3625 a
3626 b
3627 c
3628 "},
3629 model_output: indoc! {"
3630 b
3631 <|fim_middle|>
3632 Z
3633 <|fim_suffix|>
3634 "},
3635 expected: indoc! {"
3636 a
3637 b
3638 Z
3639 "},
3640 },
3641 Case {
3642 name: "context_with_trailing_newline_is_preserved",
3643 original: indoc! {"
3644 a
3645 b
3646 c
3647 "},
3648 model_output: indoc! {"
3649 a
3650 <|fim_middle|>
3651 B
3652 <|fim_suffix|>
3653 c
3654 "},
3655 expected: indoc! {"
3656 a
3657 B
3658 c
3659 "},
3660 },
    // The cursor marker inside the replacement is expected to survive verbatim.
3661 Case {
3662 name: "cursor_marker_passes_through_untouched",
3663 original: indoc! {"
3664 a
3665 b
3666 c
3667 "},
3668 model_output: indoc! {"
3669 a
3670 <|fim_middle|>
3671 B<|user_cursor|>B
3672 <|fim_suffix|>
3673 c
3674 "},
3675 expected: indoc! {"
3676 a
3677 B<|user_cursor|>B
3678 c
3679 "},
3680 },
3681 Case {
3682 name: "multiple_prefix_context_lines",
3683 original: indoc! {"
3684 a
3685 b
3686 c
3687 d
3688 e
3689 "},
3690 model_output: indoc! {"
3691 b
3692 c
3693 <|fim_middle|>
3694 D
3695 <|fim_suffix|>
3696 e
3697 "},
3698 expected: indoc! {"
3699 a
3700 b
3701 c
3702 D
3703 e
3704 "},
3705 },
3706 ];
3707
    // Apply every case and report the case name on mismatch.
3708 for case in cases {
3709 let (edit_range, replacement) =
3710 apply_variable_edit(case.original, case.model_output).unwrap();
3711 let mut edited = case.original.to_string();
3712 edited.replace_range(edit_range, &replacement);
3713 assert_eq!(edited, case.expected, "{}", case.name);
3714 }
3715 }
3716
// Table-driven round-trip test for `patch_to_variable_edit_output`.
//
// For every case the test:
//   1. converts `patch` (a unified-diff hunk against `old`, with an optional
//      `cursor_offset`) into variable-edit text and compares it with
//      `expected_variable_edit`;
//   2. applies that generated text to `old` via `apply_variable_edit` and
//      compares the result with `expected_after_apply`;
//   3. applies the hand-written `expected_variable_edit` the same way, to check
//      that the expectation itself is self-consistent.
3717 #[test]
3718 fn test_patch_to_variable_edit() {
3719 struct Case {
3720 name: &'static str,
3721 old: &'static str,
3722 patch: &'static str,
3723 cursor_offset: Option<usize>,
3724 expected_variable_edit: &'static str,
3725 expected_after_apply: &'static str,
3726 }
3727
3728 let cases = [
3729 Case {
3730 name: "simple_replacement",
3731 old: indoc! {"
3732 zero
3733 one
3734 two
3735 three
3736 four
3737 five
3738 "},
3739 patch: indoc! {"
3740 @@ -3,3 +3,3 @@
3741 two
3742 -three
3743 +THREE
3744 four
3745 "},
3746 cursor_offset: None,
3747 expected_variable_edit: indoc! {"
3748 one
3749 two
3750 <|fim_middle|>
3751 THREE
3752 <|fim_suffix|>
3753 four
3754 five
3755 "},
3756 expected_after_apply: indoc! {"
3757 zero
3758 one
3759 two
3760 THREE
3761 four
3762 five
3763 "},
3764 },
3765 Case {
3766 name: "insertion",
3767 old: indoc! {"
3768 a
3769 b
3770 c
3771 d
3772 e
3773 "},
3774 patch: indoc! {"
3775 @@ -2,0 +3,1 @@
3776 b
3777 +X
3778 c
3779 "},
3780 cursor_offset: None,
3781 expected_variable_edit: indoc! {"
3782 a
3783 b
3784 <|fim_middle|>
3785 X
3786 <|fim_suffix|>
3787 c
3788 d
3789 "},
3790 expected_after_apply: indoc! {"
3791 a
3792 b
3793 X
3794 c
3795 d
3796 e
3797 "},
3798 },
3799 Case {
3800 name: "deletion",
3801 old: indoc! {"
3802 a
3803 b
3804 c
3805 d
3806 e
3807 "},
3808 patch: indoc! {"
3809 @@ -2,3 +2,2 @@
3810 b
3811 -c
3812 d
3813 "},
3814 cursor_offset: None,
3815 expected_variable_edit: indoc! {"
3816 a
3817 b
3818 <|fim_middle|>
3819 <|fim_suffix|>
3820 d
3821 e
3822 "},
3823 expected_after_apply: indoc! {"
3824 a
3825 b
3826 d
3827 e
3828 "},
3829 },
3830 Case {
3831 name: "edit_near_start",
3832 old: indoc! {"
3833 first
3834 second
3835 third
3836 fourth
3837 "},
3838 patch: indoc! {"
3839 @@ -1,1 +1,1 @@
3840 -first
3841 +FIRST
3842 "},
3843 cursor_offset: None,
3844 expected_variable_edit: indoc! {"
3845 <|fim_middle|>
3846 FIRST
3847 <|fim_suffix|>
3848 second
3849 third
3850 "},
3851 expected_after_apply: indoc! {"
3852 FIRST
3853 second
3854 third
3855 fourth
3856 "},
3857 },
3858 Case {
3859 name: "edit_near_end",
3860 old: indoc! {"
3861 first
3862 second
3863 third
3864 fourth
3865 "},
3866 patch: indoc! {"
3867 @@ -4,1 +4,1 @@
3868 -fourth
3869 +FOURTH
3870 "},
3871 cursor_offset: None,
3872 expected_variable_edit: indoc! {"
3873 second
3874 third
3875 <|fim_middle|>
3876 FOURTH
3877 <|fim_suffix|>
3878 "},
3879 expected_after_apply: indoc! {"
3880 first
3881 second
3882 third
3883 FOURTH
3884 "},
3885 },
3886 Case {
3887 name: "cursor_at_start_of_replacement",
3888 old: indoc! {"
3889 zero
3890 one
3891 two
3892 three
3893 four
3894 five
3895 "},
3896 patch: indoc! {"
3897 @@ -3,3 +3,3 @@
3898 two
3899 -three
3900 +THREE
3901 four
3902 "},
3903 cursor_offset: Some(4),
3904 expected_variable_edit: indoc! {"
3905 one
3906 two
3907 <|fim_middle|>
3908 <|user_cursor|>THREE
3909 <|fim_suffix|>
3910 four
3911 five
3912 "},
3913 expected_after_apply: indoc! {"
3914 zero
3915 one
3916 two
3917 <|user_cursor|>THREE
3918 four
3919 five
3920 "},
3921 },
3922 Case {
3923 name: "cursor_in_middle_of_replacement",
3924 old: indoc! {"
3925 zero
3926 one
3927 two
3928 three
3929 four
3930 five
3931 "},
3932 patch: indoc! {"
3933 @@ -3,3 +3,3 @@
3934 two
3935 -three
3936 +THREE
3937 four
3938 "},
3939 cursor_offset: Some(6),
3940 expected_variable_edit: indoc! {"
3941 one
3942 two
3943 <|fim_middle|>
3944 TH<|user_cursor|>REE
3945 <|fim_suffix|>
3946 four
3947 five
3948 "},
3949 expected_after_apply: indoc! {"
3950 zero
3951 one
3952 two
3953 TH<|user_cursor|>REE
3954 four
3955 five
3956 "},
3957 },
    // The `a`/`b`/`c`/`d` run repeats three times in `old`, so the expected
    // output carries enough surrounding lines (`two` ... `three`) to pin down
    // the middle occurrence.
3958 Case {
3959 name: "expands_context_when_two_lines_not_unique_before_and_after",
3960 old: indoc! {"
3961 one
3962 a
3963 b
3964 c
3965 d
3966 two
3967 a
3968 b
3969 c
3970 d
3971 three
3972 a
3973 b
3974 c
3975 d
3976 four
3977 "},
3978 patch: indoc! {"
3979 @@ -4,5 +4,5 @@
3980 two
3981 a
3982 b
3983 -c
3984 +C
3985 d
3986 three
3987 "},
3988 cursor_offset: None,
3989 expected_variable_edit: indoc! {"
3990 two
3991 a
3992 b
3993 <|fim_middle|>
3994 C
3995 <|fim_suffix|>
3996 d
3997 three
3998 "},
3999 expected_after_apply: indoc! {"
4000 one
4001 a
4002 b
4003 c
4004 d
4005 two
4006 a
4007 b
4008 c
4009 d
4010 three
4011 a
4012 b
4013 c
4014 d
4015 four
4016 "},
4017 },
    // NOTE(review): this case reuses the previous case's `name`, so a failure
    // message cannot tell the two apart; consider giving it a distinct name.
4018 Case {
4019 name: "expands_context_when_two_lines_not_unique_before_and_after",
4020 old: indoc! {"
4021 {
4022 {
4023 one();
4024 }
4025 }
4026 {
4027 {
4028 two();
4029 }
4030 }
4031 {
4032 {
4033 three();
4034 }
4035 }
4036 {
4037 {
4038 four();
4039 }
4040 }
4041 "},
4042 patch: indoc! {"
4043 @@ -4,5 +4,5 @@
4044 {
4045 - two();
4046 + TWO();
4047 }
4048 "},
4049 cursor_offset: None,
4050 expected_variable_edit: indoc! {"
4051 one();
4052 }
4053 }
4054 {
4055 {
4056 <|fim_middle|>
4057 TWO();
4058 <|fim_suffix|>
4059 }
4060 }
4061 {
4062 {
4063 three();
4064 "},
4065 expected_after_apply: indoc! {"
4066 {
4067 {
4068 one();
4069 }
4070 }
4071 {
4072 {
4073 TWO();
4074 }
4075 }
4076 {
4077 {
4078 three();
4079 }
4080 }
4081 {
4082 {
4083 four();
4084 }
4085 }
4086 "},
4087 },
4088 ];
4089
4090 for case in cases {
    // Step 1: patch -> variable-edit text.
4091 let output =
4092 patch_to_variable_edit_output(case.old, case.patch, case.cursor_offset)
4093 .unwrap_or_else(|error| {
4094 panic!("failed converting patch for {}: {error}", case.name)
4095 });
4096 assert_eq!(
4097 output, case.expected_variable_edit,
4098 "patch->variable_edit mismatch for {}",
4099 case.name
4100 );
4101
    // Step 2: the generated text must apply cleanly to `old`.
4102 let (edit_range, replacement) = apply_variable_edit(case.old, &output)
4103 .unwrap_or_else(|error| {
4104 panic!("failed applying variable_edit for {}: {error}", case.name)
4105 });
4106 let mut edited_by_variable_edit = case.old.to_string();
4107 edited_by_variable_edit.replace_range(edit_range, &replacement);
4108 assert_eq!(
4109 edited_by_variable_edit, case.expected_after_apply,
4110 "variable_edit apply mismatch for {}",
4111 case.name
4112 );
4113
    // Step 3: the hand-written expectation must apply to the same result.
4114 let (expected_edit_range, expected_replacement) =
4115 apply_variable_edit(case.old, case.expected_variable_edit).unwrap_or_else(
4116 |error| {
4117 panic!(
4118 "failed applying expected variable_edit for {}: {error}",
4119 case.name
4120 )
4121 },
4122 );
4123 let mut edited_by_expected_variable_edit = case.old.to_string();
4124 edited_by_expected_variable_edit
4125 .replace_range(expected_edit_range, &expected_replacement);
4126 assert_eq!(
4127 edited_by_expected_variable_edit, case.expected_after_apply,
4128 "expected variable_edit apply mismatch for {}",
4129 case.name
4130 );
4131 }
4132 }
4133
/// Checks the text `write_cursor_excerpt_section` appends for a small excerpt:
/// a `<|file_sep|>` header with the path, the excerpt with `<|user_cursor|>`
/// spliced in at `cursor_offset`, and a trailing `<|fim_prefix|>` line.
#[test]
fn test_write_cursor_excerpt_section() {
    let path = Path::new("test.rs");
    // The body line is indented by four spaces: "fn main() {\n" is 12 bytes,
    // the indent is 4, so byte offset 17 sits between `h` and `ello` — which is
    // where the expected string below places the cursor marker.
    let context = "fn main() {\n    hello();\n}\n";
    let cursor_offset = 17;
    let mut prompt = String::new();
    write_cursor_excerpt_section(&mut prompt, path, context, cursor_offset);
    assert_eq!(
        prompt,
        "<|file_sep|>test.rs\nfn main() {\n    h<|user_cursor|>ello();\n}\n<|fim_prefix|>\n"
    );
}
4146 }
4147}
4148
4149/// The zeta1 prompt format
4150pub mod zeta1 {
4151 use super::*;
4152 use std::fmt::Write;
4153
// Cursor marker used in zeta1 prompts and model output. It shadows the
// crate-level `CURSOR_MARKER` inside this module, which is why the universal
// marker is referred to as `super::CURSOR_MARKER` below.
4154 pub const CURSOR_MARKER: &str = "<|user_cursor_is_here|>";
// Written on its own line at the top of the excerpt when the excerpt begins at
// the start of the file.
4155 pub const START_OF_FILE_MARKER: &str = "<|start_of_file|>";
// Delimit the part of the excerpt the model is asked to rewrite.
4156 pub const EDITABLE_REGION_START_MARKER: &str = "<|editable_region_start|>";
4157 pub const EDITABLE_REGION_END_MARKER: &str = "<|editable_region_end|>";
4158
// Fixed prompt sections. `format_zeta1_prompt` emits them in the order
// instruction header, events, excerpt header, excerpt, response header.
4159 const INSTRUCTION_HEADER: &str = concat!(
4160 "### Instruction:\n",
4161 "You are a code completion assistant and your task is to analyze user edits and then rewrite an ",
4162 "excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking ",
4163 "into account the cursor location.\n\n",
4164 "### User Edits:\n\n"
4165 );
4166 const EXCERPT_HEADER: &str = "\n\n### User Excerpt:\n\n";
4167 const RESPONSE_HEADER: &str = "\n\n### Response:\n";
4168
4169 /// Formats a complete zeta1 prompt from the input events and excerpt.
4170 pub fn format_zeta1_prompt(input_events: &str, input_excerpt: &str) -> String {
4171 let mut prompt = String::with_capacity(
4172 INSTRUCTION_HEADER.len()
4173 + input_events.len()
4174 + EXCERPT_HEADER.len()
4175 + input_excerpt.len()
4176 + RESPONSE_HEADER.len(),
4177 );
4178 prompt.push_str(INSTRUCTION_HEADER);
4179 prompt.push_str(input_events);
4180 prompt.push_str(EXCERPT_HEADER);
4181 prompt.push_str(input_excerpt);
4182 prompt.push_str(RESPONSE_HEADER);
4183 prompt
4184 }
4185
4186 /// Formats a complete zeta1 prompt from a `ZetaPromptInput` using the given
4187 /// editable and context byte-offset ranges within `cursor_excerpt`.
4188 pub fn format_zeta1_from_input(
4189 input: &ZetaPromptInput,
4190 editable_range: Range<usize>,
4191 context_range: Range<usize>,
4192 ) -> String {
4193 let events = format_zeta1_events(&input.events);
4194 let excerpt = format_zeta1_excerpt(input, editable_range, context_range);
4195 format_zeta1_prompt(&events, &excerpt)
4196 }
4197
4198 /// Formats events in zeta1 style (oldest first).
4199 fn format_zeta1_events(events: &[Arc<Event>]) -> String {
4200 let mut result = String::new();
4201 for event in
4202 events
4203 .iter()
4204 .skip(events.len().saturating_sub(max_edit_event_count_for_format(
4205 &ZetaFormat::V0114180EditableRegion,
4206 )))
4207 {
4208 let event_string = format_zeta1_event(event);
4209 if event_string.is_empty() {
4210 continue;
4211 }
4212 if !result.is_empty() {
4213 result.push_str("\n\n");
4214 }
4215 result.push_str(&event_string);
4216 }
4217 result
4218 }
4219
4220 fn format_zeta1_event(event: &Event) -> String {
4221 match event {
4222 Event::BufferChange {
4223 path,
4224 old_path,
4225 diff,
4226 ..
4227 } => {
4228 let mut prompt = String::new();
4229 if old_path != path {
4230 writeln!(
4231 prompt,
4232 "User renamed {} to {}\n",
4233 old_path.display(),
4234 path.display()
4235 )
4236 .ok();
4237 }
4238 if !diff.is_empty() {
4239 write!(
4240 prompt,
4241 "User edited {}:\n```diff\n{}\n```",
4242 path.display(),
4243 diff
4244 )
4245 .ok();
4246 }
4247 prompt
4248 }
4249 }
4250 }
4251
4252 /// Formats the excerpt section of a zeta1 prompt using byte-offset ranges
4253 /// within `cursor_excerpt`.
4254 fn format_zeta1_excerpt(
4255 input: &ZetaPromptInput,
4256 editable_range: Range<usize>,
4257 context_range: Range<usize>,
4258 ) -> String {
4259 let path_str = input.cursor_path.to_string_lossy();
4260 let excerpt = &*input.cursor_excerpt;
4261 let cursor_offset = input.cursor_offset_in_excerpt;
4262
4263 let mut prompt = String::new();
4264 writeln!(&mut prompt, "```{path_str}").ok();
4265
4266 let starts_at_file_beginning =
4267 input.excerpt_start_row == Some(0) && context_range.start == 0;
4268 if starts_at_file_beginning {
4269 writeln!(&mut prompt, "{START_OF_FILE_MARKER}").ok();
4270 }
4271
4272 prompt.push_str(&excerpt[context_range.start..editable_range.start]);
4273
4274 writeln!(&mut prompt, "{EDITABLE_REGION_START_MARKER}").ok();
4275 prompt.push_str(&excerpt[editable_range.start..cursor_offset]);
4276 prompt.push_str(CURSOR_MARKER);
4277 prompt.push_str(&excerpt[cursor_offset..editable_range.end]);
4278 write!(&mut prompt, "\n{EDITABLE_REGION_END_MARKER}").ok();
4279
4280 prompt.push_str(&excerpt[editable_range.end..context_range.end]);
4281 write!(prompt, "\n```").ok();
4282
4283 prompt
4284 }
4285
4286 /// Cleans zeta1 model output by extracting content between editable region
4287 /// markers and converting the zeta1 cursor marker to the universal one.
4288 /// Returns `None` if the output doesn't contain the expected markers.
4289 pub fn clean_zeta1_model_output(output: &str) -> Option<String> {
4290 let content = output.replace(CURSOR_MARKER, "");
4291
4292 let content_start = content
4293 .find(EDITABLE_REGION_START_MARKER)
4294 .map(|pos| pos + EDITABLE_REGION_START_MARKER.len())
4295 .map(|pos| {
4296 if content.as_bytes().get(pos) == Some(&b'\n') {
4297 pos + 1
4298 } else {
4299 pos
4300 }
4301 })
4302 .unwrap_or(0);
4303
4304 let content_end = content
4305 .find(EDITABLE_REGION_END_MARKER)
4306 .map(|pos| {
4307 if pos > 0 && content.as_bytes().get(pos - 1) == Some(&b'\n') {
4308 pos - 1
4309 } else {
4310 pos
4311 }
4312 })
4313 .unwrap_or(content.len());
4314
4315 if content_start > content_end {
4316 return Some(String::new());
4317 }
4318
4319 let extracted = &content[content_start..content_end];
4320
4321 let cursor_offset = output.find(CURSOR_MARKER).map(|zeta1_cursor_pos| {
4322 let text_before_cursor = output[..zeta1_cursor_pos].replace(CURSOR_MARKER, "");
4323 let text_before_cursor = text_before_cursor
4324 .find(EDITABLE_REGION_START_MARKER)
4325 .map(|pos| {
4326 let after_marker = pos + EDITABLE_REGION_START_MARKER.len();
4327 if text_before_cursor.as_bytes().get(after_marker) == Some(&b'\n') {
4328 after_marker + 1
4329 } else {
4330 after_marker
4331 }
4332 })
4333 .unwrap_or(0);
4334 let offset_in_extracted = zeta1_cursor_pos
4335 .saturating_sub(text_before_cursor)
4336 .min(extracted.len());
4337 offset_in_extracted
4338 });
4339
4340 let mut result = String::with_capacity(extracted.len() + super::CURSOR_MARKER.len());
4341 if let Some(offset) = cursor_offset {
4342 result.push_str(&extracted[..offset]);
4343 result.push_str(super::CURSOR_MARKER);
4344 result.push_str(&extracted[offset..]);
4345 } else {
4346 result.push_str(extracted);
4347 }
4348
4349 Some(result)
4350 }
4351}
4352
4353#[cfg(test)]
4354mod tests {
4355 use super::*;
4356 use indoc::indoc;
4357
4358 fn make_input(
4359 cursor_excerpt: &str,
4360 editable_range: Range<usize>,
4361 cursor_offset: usize,
4362 events: Vec<Event>,
4363 related_files: Vec<RelatedFile>,
4364 ) -> ZetaPromptInput {
4365 let context_range = 0..cursor_excerpt.len();
4366 ZetaPromptInput {
4367 cursor_path: Path::new("test.rs").into(),
4368 cursor_excerpt: cursor_excerpt.into(),
4369 cursor_offset_in_excerpt: cursor_offset,
4370 excerpt_start_row: None,
4371 events: events.into_iter().map(Arc::new).collect(),
4372 related_files: Some(related_files),
4373 active_buffer_diagnostics: vec![],
4374 excerpt_ranges: ExcerptRanges {
4375 editable_150: editable_range.clone(),
4376 editable_180: editable_range.clone(),
4377 editable_350: editable_range,
4378 editable_150_context_350: context_range.clone(),
4379 editable_180_context_350: context_range.clone(),
4380 editable_350_context_150: context_range,
4381 ..Default::default()
4382 },
4383 syntax_ranges: None,
4384 experiment: None,
4385 in_open_source_repo: false,
4386 can_collect_data: false,
4387 repo_url: None,
4388 }
4389 }
4390
4391 fn make_input_with_context_range(
4392 excerpt: &str,
4393 editable_range: Range<usize>,
4394 context_range: Range<usize>,
4395 cursor_offset: usize,
4396 ) -> ZetaPromptInput {
4397 ZetaPromptInput {
4398 cursor_path: Path::new("test.rs").into(),
4399 cursor_excerpt: excerpt.into(),
4400 cursor_offset_in_excerpt: cursor_offset,
4401 excerpt_start_row: None,
4402 events: vec![],
4403 related_files: Some(vec![]),
4404 active_buffer_diagnostics: vec![],
4405 excerpt_ranges: ExcerptRanges {
4406 editable_150: editable_range.clone(),
4407 editable_180: editable_range.clone(),
4408 editable_350: editable_range,
4409 editable_150_context_350: context_range.clone(),
4410 editable_180_context_350: context_range.clone(),
4411 editable_350_context_150: context_range,
4412 ..Default::default()
4413 },
4414 syntax_ranges: None,
4415 experiment: None,
4416 in_open_source_repo: false,
4417 can_collect_data: false,
4418 repo_url: None,
4419 }
4420 }
4421
4422 fn make_event(path: &str, diff: &str) -> Event {
4423 Event::BufferChange {
4424 path: Path::new(path).into(),
4425 old_path: Path::new(path).into(),
4426 diff: diff.to_string(),
4427 predicted: false,
4428 in_open_source_repo: false,
4429 }
4430 }
4431
4432 fn make_related_file(path: &str, content: &str) -> RelatedFile {
4433 RelatedFile {
4434 path: Path::new(path).into(),
4435 max_row: content.lines().count() as u32,
4436 excerpts: vec![RelatedExcerpt {
4437 row_range: 0..content.lines().count() as u32,
4438 text: content.into(),
4439 order: 0,
4440 }],
4441 in_open_source_repo: false,
4442 }
4443 }
4444
4445 fn format_with_budget(input: &ZetaPromptInput, max_tokens: usize) -> Option<String> {
4446 format_prompt_with_budget_for_format(input, ZetaFormat::V0114180EditableRegion, max_tokens)
4447 }
4448
/// Scales a token count up by dividing by 0.9 and rounding up.
///
/// Tests use this to pass a budget that still leaves `requested_tokens`
/// available after a 0.9 safety factor is applied to it.
fn budget_with_margin(requested_tokens: usize) -> usize {
    let scaled = requested_tokens as f64 / 0.9;
    scaled.ceil() as usize
}
4452
// With a generous budget (10000 tokens) nothing is dropped: the prompt contains
// the related file, the edit history and the cursor excerpt, in that order.
4453 #[test]
4454 fn test_no_truncation_when_within_budget() {
4455 let input = make_input(
4456 "prefix\neditable\nsuffix",
4457 7..15,
4458 10,
4459 vec![make_event("a.rs", "-old\n+new\n")],
4460 vec![make_related_file("related.rs", "fn helper() {}\n")],
4461 );
4462
4463 assert_eq!(
4464 format_with_budget(&input, 10000).unwrap(),
4465 indoc! {r#"
4466 <|file_sep|>related.rs
4467 fn helper() {}
4468 <|file_sep|>edit history
4469 --- a/a.rs
4470 +++ b/a.rs
4471 -old
4472 +new
4473 <|file_sep|>test.rs
4474 <|fim_prefix|>
4475 prefix
4476 <|fim_middle|>current
4477 edi<|user_cursor|>table
4478 <|fim_suffix|>
4479
4480 suffix
4481 <|fim_middle|>updated
4482 "#}
4483 .to_string()
4484 );
4485 }
4486
// Compares the prompt under a generous budget with the prompt under a tight one.
//
// NOTE(review): despite the test name, the tight-budget expectation below keeps
// the edit history and drops the two related files (`r1.rs`, `r2.rs`).
4487 #[test]
4488 fn test_truncation_drops_edit_history_when_budget_tight() {
4489 let input = make_input(
4490 "code",
4491 0..4,
4492 2,
4493 vec![make_event("a.rs", "-x\n+y\n")],
4494 vec![
4495 make_related_file("r1.rs", "aaaaaaa\n"),
4496 make_related_file("r2.rs", "bbbbbbb\n"),
4497 ],
4498 );
4499
    // Generous budget: related files, edit history and cursor excerpt all appear.
4500 assert_eq!(
4501 format_with_budget(&input, 10000).unwrap(),
4502 indoc! {r#"
4503 <|file_sep|>r1.rs
4504 aaaaaaa
4505 <|file_sep|>r2.rs
4506 bbbbbbb
4507 <|file_sep|>edit history
4508 --- a/a.rs
4509 +++ b/a.rs
4510 -x
4511 +y
4512 <|file_sep|>test.rs
4513 <|fim_prefix|>
4514 <|fim_middle|>current
4515 co<|user_cursor|>de
4516 <|fim_suffix|>
4517 <|fim_middle|>updated
4518 "#}
4519 .to_string()
4520 );
4521
    // Tight budget: only the edit history and the cursor excerpt remain.
4522 assert_eq!(
4523 format_with_budget(&input, budget_with_margin(55)),
4524 Some(
4525 indoc! {r#"
4526 <|file_sep|>edit history
4527 --- a/a.rs
4528 +++ b/a.rs
4529 -x
4530 +y
4531 <|file_sep|>test.rs
4532 <|fim_prefix|>
4533 <|fim_middle|>current
4534 co<|user_cursor|>de
4535 <|fim_suffix|>
4536 <|fim_middle|>updated
4537 "#}
4538 .to_string()
4539 )
4540 );
4541 }
4542
// A related file with three equal-order excerpts can be included partially:
// under a tight budget only the first excerpt is kept, followed by the `...`
// separator line.
4543 #[test]
4544 fn test_truncation_includes_partial_excerpts() {
4545 let input = make_input(
4546 "x",
4547 0..1,
4548 0,
4549 vec![],
4550 vec![RelatedFile {
4551 path: Path::new("big.rs").into(),
4552 max_row: 30,
4553 in_open_source_repo: false,
4554 excerpts: vec![
4555 RelatedExcerpt {
4556 row_range: 0..10,
4557 text: "first excerpt\n".into(),
4558 order: 0,
4559 },
4560 RelatedExcerpt {
4561 row_range: 10..20,
4562 text: "second excerpt\n".into(),
4563 order: 0,
4564 },
4565 RelatedExcerpt {
4566 row_range: 20..30,
4567 text: "third excerpt\n".into(),
4568 order: 0,
4569 },
4570 ],
4571 }],
4572 );
4573
    // Generous budget: all three excerpts, separated by `...` lines.
4574 assert_eq!(
4575 format_with_budget(&input, 10000).unwrap(),
4576 indoc! {r#"
4577 <|file_sep|>big.rs
4578 first excerpt
4579 ...
4580 second excerpt
4581 ...
4582 third excerpt
4583 <|file_sep|>test.rs
4584 <|fim_prefix|>
4585 <|fim_middle|>current
4586 <|user_cursor|>x
4587 <|fim_suffix|>
4588 <|fim_middle|>updated
4589 "#}
4590 .to_string()
4591 );
4592
    // Tight budget: only the first excerpt survives.
4593 assert_eq!(
4594 format_with_budget(&input, budget_with_margin(50)).unwrap(),
4595 indoc! {r#"
4596 <|file_sep|>big.rs
4597 first excerpt
4598 ...
4599 <|file_sep|>test.rs
4600 <|fim_prefix|>
4601 <|fim_middle|>current
4602 <|user_cursor|>x
4603 <|fim_suffix|>
4604 <|fim_middle|>updated
4605 "#}
4606 .to_string()
4607 );
4608 }
4609
// When the budget only fits one related file, the excerpt with the lower
// `order` value wins, regardless of the files' position in the input list.
4610 #[test]
4611 fn test_truncation_prioritizes_lower_order_excerpts() {
4612 // Two files: file_a has a high-order excerpt, file_b has a low-order one.
4613 // With tight budget, only the lower-order excerpt from file_b should be included.
4614 let input = make_input(
4615 "x",
4616 0..1,
4617 0,
4618 vec![],
4619 vec![
4620 RelatedFile {
4621 path: Path::new("file_a.rs").into(),
4622 max_row: 10,
4623 in_open_source_repo: false,
4624 excerpts: vec![RelatedExcerpt {
4625 row_range: 0..10,
4626 text: "low priority content\n".into(),
4627 order: 5,
4628 }],
4629 },
4630 RelatedFile {
4631 path: Path::new("file_b.rs").into(),
4632 max_row: 10,
4633 in_open_source_repo: false,
4634 excerpts: vec![RelatedExcerpt {
4635 row_range: 0..10,
4636 text: "high priority content\n".into(),
4637 order: 1,
4638 }],
4639 },
4640 ],
4641 );
4642
4643 // With large budget, both files included; file_a is rendered before file_b (here the input order and the lexicographic order coincide).
4644 assert_eq!(
4645 format_with_budget(&input, 10000).unwrap(),
4646 indoc! {r#"
4647 <|file_sep|>file_a.rs
4648 low priority content
4649 <|file_sep|>file_b.rs
4650 high priority content
4651 <|file_sep|>test.rs
4652 <|fim_prefix|>
4653 <|fim_middle|>current
4654 <|user_cursor|>x
4655 <|fim_suffix|>
4656 <|fim_middle|>updated
4657 "#}
4658 .to_string()
4659 );
4660
4661 // With tight budget, only file_b (lower order) fits.
4662 // Cursor section is ~37 tokens, so budget 52 leaves ~15 for related files.
4663 // file_b header (7) + excerpt (7) = 14 tokens, which fits.
4664 // file_a would need another 14 tokens, which doesn't fit.
4665 assert_eq!(
4666 format_with_budget(&input, budget_with_margin(52)).unwrap(),
4667 indoc! {r#"
4668 <|file_sep|>file_b.rs
4669 high priority content
4670 <|file_sep|>test.rs
4671 <|fim_prefix|>
4672 <|fim_middle|>current
4673 <|user_cursor|>x
4674 <|fim_suffix|>
4675 <|fim_middle|>updated
4676 "#}
4677 .to_string()
4678 );
4679 }
4680
// Excerpts are dropped individually by `order`, even inside a single file:
// under a tight budget the two order-1 excerpts stay and the order-3 one goes.
4681 #[test]
4682 fn test_truncation_drops_high_order_excerpts_within_file() {
4683 // A single file has excerpts at order 1 and order 3. With a tight budget,
4684 // only the order-1 excerpts are included while the order-3 excerpt is
4685 // dropped — even though they belong to the same file. This also preserves
4686 // the parent invariant: parent outline items have order ≤ their best
4687 // child, so they're always included when any child is.
4688 let input = make_input(
4689 "x",
4690 0..1,
4691 0,
4692 vec![],
4693 vec![RelatedFile {
4694 path: Path::new("mod.rs").into(),
4695 max_row: 30,
4696 in_open_source_repo: false,
4697 excerpts: vec![
4698 RelatedExcerpt {
4699 row_range: 0..5,
4700 text: "mod header\n".into(),
4701 order: 1,
4702 },
4703 RelatedExcerpt {
4704 row_range: 5..15,
4705 text: "important fn\n".into(),
4706 order: 1,
4707 },
4708 RelatedExcerpt {
4709 row_range: 15..30,
4710 text: "less important fn\n".into(),
4711 order: 3,
4712 },
4713 ],
4714 }],
4715 );
4716
4717 // With large budget, all three excerpts included.
4718 assert_eq!(
4719 format_with_budget(&input, 10000).unwrap(),
4720 indoc! {r#"
4721 <|file_sep|>mod.rs
4722 mod header
4723 ...
4724 important fn
4725 ...
4726 less important fn
4727 <|file_sep|>test.rs
4728 <|fim_prefix|>
4729 <|fim_middle|>current
4730 <|user_cursor|>x
4731 <|fim_suffix|>
4732 <|fim_middle|>updated
4733 "#}
4734 .to_string()
4735 );
4736
4737 // With tight budget, only order<=1 excerpts included (header + important fn).
4738 assert_eq!(
4739 format_with_budget(&input, budget_with_margin(55)).unwrap(),
4740 indoc! {r#"
4741 <|file_sep|>mod.rs
4742 mod header
4743 ...
4744 important fn
4745 ...
4746 <|file_sep|>test.rs
4747 <|fim_prefix|>
4748 <|fim_middle|>current
4749 <|user_cursor|>x
4750 <|fim_suffix|>
4751 <|fim_middle|>updated
4752 "#}
4753 .to_string()
4754 );
4755 }
4756
// When the edit history does not fit, the earlier event in the list (`old.rs`)
// is dropped and the later one (`new.rs`) is kept.
4757 #[test]
4758 fn test_truncation_drops_older_events_first() {
4759 let input = make_input(
4760 "x",
4761 0..1,
4762 0,
4763 vec![make_event("old.rs", "-1\n"), make_event("new.rs", "-2\n")],
4764 vec![],
4765 );
4766
    // Generous budget: both events, in list order.
4767 assert_eq!(
4768 format_with_budget(&input, 10000).unwrap(),
4769 indoc! {r#"
4770 <|file_sep|>edit history
4771 --- a/old.rs
4772 +++ b/old.rs
4773 -1
4774 --- a/new.rs
4775 +++ b/new.rs
4776 -2
4777 <|file_sep|>test.rs
4778 <|fim_prefix|>
4779 <|fim_middle|>current
4780 <|user_cursor|>x
4781 <|fim_suffix|>
4782 <|fim_middle|>updated
4783 "#}
4784 .to_string()
4785 );
4786
    // Budget of 60 (passed directly, without `budget_with_margin`): only the
    // last event remains.
4787 assert_eq!(
4788 format_with_budget(&input, 60).unwrap(),
4789 indoc! {r#"
4790 <|file_sep|>edit history
4791 --- a/new.rs
4792 +++ b/new.rs
4793 -2
4794 <|file_sep|>test.rs
4795 <|fim_prefix|>
4796 <|fim_middle|>current
4797 <|user_cursor|>x
4798 <|fim_suffix|>
4799 <|fim_middle|>updated
4800 "#}
4801 .to_string()
4802 );
4803 }
4804
/// With a 30-token budget, formatting yields no prompt at all.
///
/// NOTE(review): the test name suggests the cursor excerpt is always emitted,
/// but the assertion checks for `None` — i.e. formatting gives up when even
/// the cursor section does not fit. Confirm which behavior the name should
/// describe.
#[test]
fn test_cursor_excerpt_always_included_with_minimal_budget() {
    let events = vec![make_event("a.rs", "-old\n+new\n")];
    let related_files = vec![make_related_file("related.rs", "helper\n")];
    let input = make_input("fn main() {}", 0..12, 3, events, related_files);

    let prompt = format_with_budget(&input, 30);
    assert!(prompt.is_none())
}
4817
4818 #[track_caller]
4819 fn format_seed_coder(input: &ZetaPromptInput) -> String {
4820 format_prompt_with_budget_for_format(input, ZetaFormat::V0211SeedCoder, 10000)
4821 .expect("seed coder prompt formatting should succeed")
4822 }
4823
4824 #[track_caller]
4825 fn format_seed_coder_with_budget(input: &ZetaPromptInput, max_tokens: usize) -> String {
4826 format_prompt_with_budget_for_format(input, ZetaFormat::V0211SeedCoder, max_tokens)
4827 .expect("seed coder prompt formatting should succeed")
4828 }
4829
// Pins the full seed-coder prompt layout: the suffix section comes first, then
// the prefix section holding the related file, the edit history and the cursor
// file with the editable region after a `<<<<<<< CURRENT` line, ending with
// `=======` and the `<[fim-middle]>` tag (no trailing newline).
4830 #[test]
4831 fn test_seed_coder_basic_format() {
4832 let input = make_input(
4833 "prefix\neditable\nsuffix",
4834 7..15,
4835 10,
4836 vec![make_event("a.rs", "-old\n+new\n")],
4837 vec![make_related_file("related.rs", "fn helper() {}\n")],
4838 );
4839
4840 assert_eq!(
4841 format_seed_coder(&input),
4842 indoc! {r#"
4843 <[fim-suffix]>
4844 suffix
4845 <[fim-prefix]><filename>related.rs
4846 fn helper() {}
4847
4848 <filename>edit_history
4849 --- a/a.rs
4850 +++ b/a.rs
4851 -old
4852 +new
4853
4854 <filename>test.rs
4855 prefix
4856 <<<<<<< CURRENT
4857 edi<|user_cursor|>table
4858 =======
4859 <[fim-middle]>"#}
4860 );
4861 }
4862
/// Formatting with `V0317SeedMultiRegions` and a 4096-token budget must still
/// produce a prompt when the input carries 900 related files, and that prompt
/// must mention the cursor file and contain the cursor marker.
#[test]
fn test_v0317_formats_prompt_with_many_related_files() {
    let mut related_files = Vec::with_capacity(900);
    for index in 0..900 {
        related_files.push(make_related_file(
            &format!("related_{index}.rs"),
            "fn helper() {\n let value = 1;\n}\n",
        ));
    }

    let events = vec![make_event("a.rs", "-x\n+y\n")];
    let input = make_input("code", 0..4, 2, events, related_files);

    let prompt =
        format_prompt_with_budget_for_format(&input, ZetaFormat::V0317SeedMultiRegions, 4096);

    assert!(prompt.is_some());
    let prompt = prompt.expect("v0317 should produce a prompt under high related-file count");
    assert!(prompt.contains("test.rs"));
    assert!(prompt.contains(CURSOR_MARKER));
}
4890
// With no events and no related files, the seed-coder prompt contains only the
// suffix section and the cursor file.
4891 #[test]
4892 fn test_seed_coder_no_context() {
4893 let input = make_input("before\nmiddle\nafter", 7..13, 10, vec![], vec![]);
4894
4895 assert_eq!(
4896 format_seed_coder(&input),
4897 indoc! {r#"
4898 <[fim-suffix]>
4899 after
4900 <[fim-prefix]><filename>test.rs
4901 before
4902 <<<<<<< CURRENT
4903 mid<|user_cursor|>dle
4904 =======
4905 <[fim-middle]>"#}
4906 );
4907 }
4908
// Exercises three budgets for the seed-coder format: a generous one keeps
// everything, 24 tokens yields no prompt, and 40 tokens keeps only the cursor
// section (related file and edit history are dropped).
4909 #[test]
4910 fn test_seed_coder_truncation_drops_context() {
4911 let input = make_input(
4912 "code",
4913 0..4,
4914 2,
4915 vec![make_event("a.rs", "-x\n+y\n")],
4916 vec![make_related_file("r1.rs", "content\n")],
4917 );
4918
4919 // With large budget, everything is included
4920 assert_eq!(
4921 format_seed_coder(&input),
4922 indoc! {r#"
4923 <[fim-suffix]>
4924 <[fim-prefix]><filename>r1.rs
4925 content
4926
4927 <filename>edit_history
4928 --- a/a.rs
4929 +++ b/a.rs
4930 -x
4931 +y
4932
4933 <filename>test.rs
4934 <<<<<<< CURRENT
4935 co<|user_cursor|>de
4936 =======
4937 <[fim-middle]>"#}
4938 );
4939
    // A budget of 24 is too small for any prompt.
4940 assert_eq!(
4941 format_prompt_with_budget_for_format(&input, ZetaFormat::V0211SeedCoder, 24),
4942 None
4943 );
4944
    // A budget of 40 fits the cursor section only.
4945 assert_eq!(
4946 format_seed_coder_with_budget(&input, 40),
4947 indoc! {r#"
4948 <[fim-suffix]>
4949 <[fim-prefix]><filename>test.rs
4950 <<<<<<< CURRENT
4951 co<|user_cursor|>de
4952 =======
4953 <[fim-middle]>"#
4954 }
4955 )
4956 }
4957
// Seed-coder formatting with two related files of different `order`.
//
// NOTE(review): despite the test name, the tight-budget expectation below
// contains neither related file — both are dropped and only the cursor
// section remains.
4958 #[test]
4959 fn test_seed_coder_truncation_prioritizes_lower_order() {
4960 let input = make_input(
4961 "code",
4962 0..4,
4963 2,
4964 vec![],
4965 vec![
4966 RelatedFile {
4967 path: Path::new("low_prio.rs").into(),
4968 max_row: 5,
4969 in_open_source_repo: false,
4970 excerpts: vec![RelatedExcerpt {
4971 row_range: 0..5,
4972 text: "low prio\n".into(),
4973 order: 10,
4974 }],
4975 },
4976 RelatedFile {
4977 path: Path::new("high_prio.rs").into(),
4978 max_row: 5,
4979 in_open_source_repo: false,
4980 excerpts: vec![RelatedExcerpt {
4981 row_range: 0..5,
4982 text: "high prio\n".into(),
4983 order: 1,
4984 }],
4985 },
4986 ],
4987 );
4988
4989 // With large budget, both included; rendered in input order (low_prio.rs before high_prio.rs).
4990 assert_eq!(
4991 format_seed_coder(&input),
4992 indoc! {r#"
4993 <[fim-suffix]>
4994 <[fim-prefix]><filename>low_prio.rs
4995 low prio
4996 <filename>high_prio.rs
4997 high prio
4998
4999 <filename>test.rs
5000 <<<<<<< CURRENT
5001 co<|user_cursor|>de
5002 =======
5003 <[fim-middle]>"#}
5004 );
5005
5006 // With tight budget under the generic heuristic, context is dropped but the
5007 // minimal cursor section still fits.
5008 assert_eq!(
5009 format_prompt_with_budget_for_format(&input, ZetaFormat::V0211SeedCoder, 44),
5010 Some(
5011 indoc! {r#"
5012 <[fim-suffix]>
5013 <[fim-prefix]><filename>test.rs
5014 <<<<<<< CURRENT
5015 co<|user_cursor|>de
5016 =======
5017 <[fim-middle]>"#}
5018 .to_string()
5019 )
5020 );
5021 }
5022
#[test]
fn test_format_zeta1_from_input_basic() {
    let excerpt = "fn before() {}\nfn foo() {\n let x = 1;\n}\nfn after() {}\n";

    // Every offset is derived from the fixture text instead of being a
    // hard-coded byte count, so the ranges cannot drift out of sync with the
    // excerpt literal (e.g. when its whitespace is reformatted).
    // The editable region is `fn foo() {` plus the `let` line; the closing
    // brace of `foo` is the first byte after it.
    let editable_start = excerpt.find("fn foo").unwrap();
    let editable_end = excerpt.find("}\nfn after").unwrap();
    let editable_range = editable_start..editable_end;
    let context_range = 0..excerpt.len();
    // The cursor sits immediately before `let`, i.e. after the line's indent.
    let cursor_offset = excerpt.find("let x").unwrap();

    let input = ZetaPromptInput {
        cursor_path: Path::new("src/main.rs").into(),
        cursor_excerpt: excerpt.into(),
        cursor_offset_in_excerpt: cursor_offset,
        excerpt_start_row: Some(0),
        events: vec![Arc::new(make_event("other.rs", "-old\n+new\n"))],
        related_files: Some(vec![]),
        active_buffer_diagnostics: vec![],
        excerpt_ranges: ExcerptRanges {
            editable_150: editable_range.clone(),
            editable_180: editable_range.clone(),
            editable_350: editable_range.clone(),
            editable_150_context_350: context_range.clone(),
            editable_180_context_350: context_range.clone(),
            editable_350_context_150: context_range.clone(),
            ..Default::default()
        },
        syntax_ranges: None,
        experiment: None,
        in_open_source_repo: false,
        can_collect_data: false,
        repo_url: None,
    };

    let prompt = zeta1::format_zeta1_from_input(&input, editable_range, context_range);

    // The expected prompt carries the single edit event under "User Edits",
    // a start-of-file marker (the excerpt starts at row 0), and the editable
    // markers wrapped around the `foo` header and the `let` line.
    assert_eq!(
        prompt,
        concat!(
            "### Instruction:\n",
            "You are a code completion assistant and your task is to analyze user edits and then rewrite an ",
            "excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking ",
            "into account the cursor location.\n",
            "\n",
            "### User Edits:\n",
            "\n",
            "User edited other.rs:\n",
            "```diff\n",
            "-old\n",
            "+new\n",
            "\n",
            "```\n",
            "\n",
            "### User Excerpt:\n",
            "\n",
            "```src/main.rs\n",
            "<|start_of_file|>\n",
            "fn before() {}\n",
            "<|editable_region_start|>\n",
            "fn foo() {\n",
            " <|user_cursor_is_here|>let x = 1;\n",
            "\n",
            "<|editable_region_end|>}\n",
            "fn after() {}\n",
            "\n",
            "```\n",
            "\n",
            "### Response:\n",
        ),
    );
}
5087
#[test]
fn test_format_zeta1_from_input_no_start_of_file() {
    let excerpt = "fn foo() {\n let x = 1;\n}\n";

    // Editable and context ranges both span the whole excerpt. They are taken
    // from `excerpt.len()` rather than a hard-coded byte count so they can
    // never run past the end of the fixture string.
    let whole_excerpt = 0..excerpt.len();
    // The cursor sits immediately before `let`, i.e. after the line's indent.
    let cursor_offset = excerpt.find("let x").unwrap();

    let input = ZetaPromptInput {
        cursor_path: Path::new("src/main.rs").into(),
        cursor_excerpt: excerpt.into(),
        cursor_offset_in_excerpt: cursor_offset,
        // A non-zero start row: the expected prompt below has no
        // `<|start_of_file|>` line, unlike the fixtures that start at row 0.
        excerpt_start_row: Some(10),
        events: vec![],
        related_files: Some(vec![]),
        active_buffer_diagnostics: vec![],
        excerpt_ranges: ExcerptRanges {
            editable_150: whole_excerpt.clone(),
            editable_180: whole_excerpt.clone(),
            editable_350: whole_excerpt.clone(),
            editable_150_context_350: whole_excerpt.clone(),
            editable_180_context_350: whole_excerpt.clone(),
            editable_350_context_150: whole_excerpt.clone(),
            ..Default::default()
        },
        syntax_ranges: None,
        experiment: None,
        in_open_source_repo: false,
        can_collect_data: false,
        repo_url: None,
    };

    let prompt = zeta1::format_zeta1_from_input(&input, whole_excerpt.clone(), whole_excerpt);

    assert_eq!(
        prompt,
        concat!(
            "### Instruction:\n",
            "You are a code completion assistant and your task is to analyze user edits and then rewrite an ",
            "excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking ",
            "into account the cursor location.\n",
            "\n",
            "### User Edits:\n",
            "\n",
            "\n",
            "\n",
            "### User Excerpt:\n",
            "\n",
            "```src/main.rs\n",
            "<|editable_region_start|>\n",
            "fn foo() {\n",
            " <|user_cursor_is_here|>let x = 1;\n",
            "}\n",
            "\n",
            "<|editable_region_end|>\n",
            "```\n",
            "\n",
            "### Response:\n",
        ),
    );
}
5144
#[test]
fn test_format_zeta1_from_input_with_sub_ranges() {
    let excerpt = "// prefix\nfn foo() {\n let x = 1;\n}\n// suffix\n";

    // Offsets are located in the fixture text rather than hard-coded, so they
    // stay valid if the excerpt literal changes.
    // The editable region runs from `fn foo` up to (not including) the newline
    // that precedes the `// suffix` line, i.e. it ends right after `}`.
    let editable_start = excerpt.find("fn foo").unwrap();
    let editable_end = excerpt.find("\n// suffix").unwrap();
    let editable_range = editable_start..editable_end;
    let context_range = 0..excerpt.len();
    // The cursor sits immediately before `let`, i.e. after the line's indent.
    let cursor_offset = excerpt.find("let x").unwrap();

    let input = ZetaPromptInput {
        cursor_path: Path::new("test.rs").into(),
        cursor_excerpt: excerpt.into(),
        cursor_offset_in_excerpt: cursor_offset,
        excerpt_start_row: Some(0),
        events: vec![],
        related_files: Some(vec![]),
        active_buffer_diagnostics: vec![],
        excerpt_ranges: ExcerptRanges {
            editable_150: editable_range.clone(),
            editable_180: editable_range.clone(),
            editable_350: editable_range.clone(),
            editable_150_context_350: context_range.clone(),
            editable_180_context_350: context_range.clone(),
            editable_350_context_150: context_range.clone(),
            ..Default::default()
        },
        syntax_ranges: None,
        experiment: None,
        in_open_source_repo: false,
        can_collect_data: false,
        repo_url: None,
    };

    let prompt = zeta1::format_zeta1_from_input(&input, editable_range, context_range);

    // `// prefix` stays above the editable-start marker and `// suffix` below
    // the editable-end marker.
    assert_eq!(
        prompt,
        concat!(
            "### Instruction:\n",
            "You are a code completion assistant and your task is to analyze user edits and then rewrite an ",
            "excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking ",
            "into account the cursor location.\n",
            "\n",
            "### User Edits:\n",
            "\n",
            "\n",
            "\n",
            "### User Excerpt:\n",
            "\n",
            "```test.rs\n",
            "<|start_of_file|>\n",
            "// prefix\n",
            "<|editable_region_start|>\n",
            "fn foo() {\n",
            " <|user_cursor_is_here|>let x = 1;\n",
            "}\n",
            "<|editable_region_end|>\n",
            "// suffix\n",
            "\n",
            "```\n",
            "\n",
            "### Response:\n",
        ),
    );
}
5207
#[test]
fn test_max_event_count() {
    // Builds an event whose path and diff both embed `index`, so each event is
    // identifiable in the rendered history.
    fn make_numbered_event(index: usize) -> Event {
        make_event(
            &format!("event-{index}.rs"),
            &format!("-old-{index}\n+new-{index}\n"),
        )
    }

    let input = make_input(
        "x",
        0..1,
        0,
        (0..3).map(make_numbered_event).collect(),
        vec![],
    );

    // Only the final argument varies between the cases below; the budget
    // argument is `usize::MAX` throughout, presumably so that the event-count
    // cap is the only limit in play.
    let render_history = |max_events| {
        format_edit_history_within_budget(
            &input.events,
            "<|file_sep|>",
            "edit history",
            usize::MAX,
            max_events,
        )
    };

    // Cap (5) above the number of events (3): all three are rendered.
    let edit_history_section = render_history(5);
    assert_eq!(
        &edit_history_section,
        indoc!(
            "
            <|file_sep|>edit history
            --- a/event-0.rs
            +++ b/event-0.rs
            -old-0
            +new-0
            --- a/event-1.rs
            +++ b/event-1.rs
            -old-1
            +new-1
            --- a/event-2.rs
            +++ b/event-2.rs
            -old-2
            +new-2
            "
        )
    );

    // Cap of 2: the first event is dropped and the last two are kept.
    let edit_history_section = render_history(2);
    assert_eq!(
        &edit_history_section,
        indoc!(
            "
            <|file_sep|>edit history
            --- a/event-1.rs
            +++ b/event-1.rs
            -old-1
            +new-1
            --- a/event-2.rs
            +++ b/event-2.rs
            -old-2
            +new-2
            "
        )
    );

    // Cap of 0: nothing is rendered, not even the section header.
    let edit_history_section = render_history(0);
    assert_eq!(&edit_history_section, "");
}
5288
#[test]
fn test_clean_zeta1_model_output_basic() {
    // The model output is written as explicit per-line literals so the
    // indentation of the `println!` line is unambiguous and matches the
    // expected string below exactly (an indented multi-line literal would
    // depend on the source file's own whitespace).
    let output = concat!(
        "<|editable_region_start|>\n",
        "fn main() {\n",
        " println!(\"hello\");\n",
        "}\n",
        "<|editable_region_end|>\n",
    );

    // Both region markers are gone, and so is the newline that preceded the
    // end marker.
    let cleaned = zeta1::clean_zeta1_model_output(output).unwrap();
    assert_eq!(cleaned, "fn main() {\n println!(\"hello\");\n}");
}
5302
#[test]
fn test_clean_zeta1_model_output_with_cursor() {
    // Explicit per-line literals keep the indentation in front of the cursor
    // marker unambiguous, so it matches the expected string below exactly.
    let output = concat!(
        "<|editable_region_start|>\n",
        "fn main() {\n",
        " <|user_cursor_is_here|>println!(\"hello\");\n",
        "}\n",
        "<|editable_region_end|>\n",
    );

    // The region markers are removed and the zeta1 cursor marker is rewritten
    // to `<|user_cursor|>` in place.
    let cleaned = zeta1::clean_zeta1_model_output(output).unwrap();
    assert_eq!(
        cleaned,
        "fn main() {\n <|user_cursor|>println!(\"hello\");\n}"
    );
}
5319
#[test]
fn test_clean_zeta1_model_output_no_markers() {
    // Output without any region markers must come back unchanged, trailing
    // newline included.
    let unmarked = "fn main() {}\n";
    assert_eq!(
        zeta1::clean_zeta1_model_output(unmarked).unwrap(),
        "fn main() {}\n"
    );
}
5326
#[test]
fn test_clean_zeta1_model_output_empty_region() {
    // Start marker followed directly by the end marker: nothing is left once
    // the markers are removed.
    let markers_only = "<|editable_region_start|>\n<|editable_region_end|>\n";
    let cleaned = zeta1::clean_zeta1_model_output(markers_only).unwrap();
    assert_eq!(cleaned, "");
}
5333
5334 fn apply_edit(excerpt: &str, parsed_output: &ParsedOutput) -> String {
5335 let mut result = excerpt.to_string();
5336 result.replace_range(
5337 parsed_output.range_in_excerpt.clone(),
5338 &parsed_output.new_editable_region,
5339 );
5340 result
5341 }
5342
#[test]
fn test_parse_zeta2_model_output() {
    let excerpt = "before ctx\nctx start\neditable old\nctx end\nafter ctx\n";
    // Byte offset of the first occurrence of `needle` in the excerpt.
    let offset_of = |needle: &str| excerpt.find(needle).unwrap();

    // Context covers "ctx start" up to (not including) "after ctx"; the
    // editable region is exactly the "editable old\n" line.
    let context_range = offset_of("ctx start")..offset_of("after ctx");
    let editable_start = offset_of("editable old");
    let editable_range = editable_start..editable_start + "editable old\n".len();

    let input =
        make_input_with_context_range(excerpt, editable_range, context_range, editable_start);

    let parsed = parse_zeta2_model_output(
        "editable new\n>>>>>>> UPDATED\n",
        ZetaFormat::V0131GitMergeMarkersPrefix,
        &input,
    )
    .unwrap();

    // Only the editable line changes; every context line is left untouched.
    let expected = "before ctx\nctx start\neditable new\nctx end\nafter ctx\n";
    assert_eq!(apply_edit(excerpt, &parsed), expected);
}
5369
#[test]
fn test_parse_zeta2_model_output_identity() {
    let excerpt = "aaa\nbbb\nccc\nddd\neee\n";
    // Editable region is the two lines "bbb\nccc\n"; context is the whole excerpt.
    let region_start = excerpt.find("bbb").unwrap();
    let region_end = excerpt.find("ddd").unwrap();
    let input = make_input_with_context_range(
        excerpt,
        region_start..region_end,
        0..excerpt.len(),
        region_start,
    );

    // The model echoes the editable region verbatim, followed by the end marker.
    let echoed = "bbb\nccc\n>>>>>>> UPDATED\n";
    let parsed =
        parse_zeta2_model_output(echoed, ZetaFormat::V0131GitMergeMarkersPrefix, &input).unwrap();

    // Applying an unchanged region must reproduce the excerpt exactly.
    assert_eq!(apply_edit(excerpt, &parsed), excerpt);
}
5388
#[test]
fn test_parse_zeta2_model_output_strips_end_marker() {
    let excerpt = "hello\nworld\n";
    // Editable and context ranges both cover the whole excerpt; cursor at 0.
    let whole = 0..excerpt.len();
    let input = make_input_with_context_range(excerpt, whole.clone(), whole, 0);

    let format = ZetaFormat::V0131GitMergeMarkersPrefix;
    let with_marker =
        parse_zeta2_model_output("new content\n>>>>>>> UPDATED\n", format, &input).unwrap();
    let without_marker = parse_zeta2_model_output("new content\n", format, &input).unwrap();

    // The trailing end marker must make no difference to the applied edit,
    // and must not leak into the resulting text.
    let applied_with_marker = apply_edit(excerpt, &with_marker);
    assert_eq!(applied_with_marker, apply_edit(excerpt, &without_marker));
    assert_eq!(applied_with_marker, "new content\n");
}
5402
#[test]
fn test_special_tokens_not_triggered_by_comment_separator() {
    // Regression test for https://github.com/zed-industries/zed/issues/52489
    // The excerpt contains `=======` only inside a line comment, not as a
    // line of its own.
    let excerpt = "fn main() {\n // =======\n println!(\"hello\");\n}\n";
    let input = make_input(excerpt, 0..excerpt.len(), 0, vec![], vec![]);

    let flagged =
        prompt_input_contains_special_tokens(&input, ZetaFormat::V0131GitMergeMarkersPrefix);
    assert!(
        !flagged,
        "comment containing ======= should not trigger special token detection"
    );
}
5413}