format_prompt.rs

  1use crate::{
  2    FormatPromptArgs, PredictionProvider,
  3    example::{ActualCursor, Example, ExamplePrompt},
  4    headless::EpAppState,
  5    progress::{ExampleProgress, Step},
  6    retrieve_context::run_context_retrieval,
  7};
  8use anyhow::{Context as _, Result, anyhow};
  9use edit_prediction::{cursor_excerpt::editable_and_context_ranges_for_cursor_position, udiff};
 10use gpui::{AppContext, AsyncApp};
 11use language::{Buffer, OffsetRangeExt, Point};
 12use similar::DiffableStr;
 13use std::sync::Arc;
 14use std::{fmt::Write as _, ops::Range};
 15use zeta_prompt::ZetaFormat;
 16use zeta_prompt::format_zeta_prompt;
 17
 18pub async fn run_format_prompt(
 19    example: &mut Example,
 20    args: &FormatPromptArgs,
 21    app_state: Arc<EpAppState>,
 22    example_progress: &ExampleProgress,
 23    cx: AsyncApp,
 24) -> Result<()> {
 25    run_context_retrieval(example, app_state.clone(), example_progress, cx.clone()).await?;
 26
 27    let step_progress = example_progress.start(Step::FormatPrompt);
 28
 29    let prompt_inputs = example
 30        .prompt_inputs
 31        .as_ref()
 32        .context("prompt_inputs must be set after context retrieval")?;
 33
 34    let language = app_state
 35        .languages
 36        .load_language_for_file_path(&example.spec.cursor_path)
 37        .await
 38        .ok();
 39    let snapshot_fut = cx.update(|cx| {
 40        Buffer::build_snapshot(
 41            prompt_inputs.content.as_str().into(),
 42            language,
 43            Some(app_state.languages.clone()),
 44            cx,
 45        )
 46    });
 47    let cursor_point = Point::new(prompt_inputs.cursor_row, prompt_inputs.cursor_column);
 48    let snapshot = cx.background_spawn(snapshot_fut).await;
 49
 50    match args.provider {
 51        PredictionProvider::Teacher(_) | PredictionProvider::TeacherNonBatching(_) => {
 52            step_progress.set_substatus("formatting teacher prompt");
 53
 54            let (editable_range, context_range) = editable_and_context_ranges_for_cursor_position(
 55                cursor_point,
 56                &snapshot,
 57                edit_prediction::zeta2::max_editable_tokens(ZetaFormat::default()),
 58                edit_prediction::zeta2::MAX_CONTEXT_TOKENS,
 59            );
 60            let editable_range = editable_range.to_offset(&snapshot);
 61            let context_range = context_range.to_offset(&snapshot);
 62
 63            let prompt = TeacherPrompt::format_prompt(example, editable_range, context_range);
 64            example.prompt = Some(ExamplePrompt {
 65                input: prompt,
 66                expected_output: String::new(),
 67                rejected_output: None,
 68                prefill: None,
 69                provider: args.provider,
 70            });
 71        }
 72        PredictionProvider::Zeta2(version) => {
 73            step_progress.set_substatus("formatting zeta2 prompt");
 74
 75            let (editable_range, context_range) = editable_and_context_ranges_for_cursor_position(
 76                cursor_point,
 77                &snapshot,
 78                edit_prediction::zeta2::max_editable_tokens(version),
 79                edit_prediction::zeta2::MAX_CONTEXT_TOKENS,
 80            );
 81            let editable_range = editable_range.to_offset(&snapshot);
 82            let context_range = context_range.to_offset(&snapshot);
 83
 84            let context_start = context_range.start;
 85            let cursor_offset_in_excerpt = prompt_inputs.cursor_offset - context_start;
 86            let editable_range_in_excerpt =
 87                (editable_range.start - context_start)..(editable_range.end - context_start);
 88            let input = zeta_prompt::ZetaPromptInput {
 89                cursor_path: example.spec.cursor_path.clone(),
 90                cursor_excerpt: prompt_inputs.content[context_range].to_string().into(),
 91                editable_range_in_excerpt,
 92                cursor_offset_in_excerpt,
 93                excerpt_start_row: prompt_inputs.excerpt_start_row,
 94                events: prompt_inputs.edit_history.clone(),
 95                related_files: prompt_inputs.related_files.clone().unwrap_or_default(),
 96                excerpt_ranges: None,
 97                preferred_model: None,
 98                in_open_source_repo: example
 99                    .spec
100                    .captured_prompt_input
101                    .as_ref()
102                    .map_or(false, |input| input.in_open_source_repo),
103                can_collect_data: false,
104            };
105            let prompt = format_zeta_prompt(&input, version);
106            let prefill = zeta_prompt::get_prefill(&input, version);
107            let (expected_patch, expected_cursor_offset) = example
108                .spec
109                .expected_patches_with_cursor_positions()
110                .into_iter()
111                .next()
112                .context("expected patches is empty")?;
113            let expected_output =
114                zeta2_output_for_patch(&input, &expected_patch, expected_cursor_offset, version)?;
115            let rejected_output = example
116                .spec
117                .rejected_patch
118                .as_ref()
119                .and_then(|patch| zeta2_output_for_patch(&input, patch, None, version).ok());
120
121            example.prompt = Some(ExamplePrompt {
122                input: prompt,
123                expected_output,
124                rejected_output,
125                provider: args.provider,
126                prefill: Some(prefill),
127            });
128        }
129        _ => {
130            panic!("Cannot format prompt for {:?}", args.provider);
131        }
132    };
133    Ok(())
134}
135
136pub fn zeta2_output_for_patch(
137    input: &zeta_prompt::ZetaPromptInput,
138    patch: &str,
139    cursor_offset: Option<usize>,
140    version: ZetaFormat,
141) -> Result<String> {
142    let mut old_editable_region =
143        input.cursor_excerpt[input.editable_range_in_excerpt.clone()].to_string();
144
145    if !old_editable_region.ends_with_newline() {
146        old_editable_region.push('\n');
147    }
148
149    let (mut result, first_hunk_offset) =
150        udiff::apply_diff_to_string_with_hunk_offset(patch, &old_editable_region).with_context(
151            || {
152                format!(
153                    "Patch:\n```\n{}```\n\nEditable region:\n```\n{}```",
154                    patch, old_editable_region
155                )
156            },
157        )?;
158
159    if let Some(cursor_offset) = cursor_offset {
160        // The cursor_offset is relative to the start of the hunk's new text (context + additions).
161        // We need to add where the hunk context matched in the editable region to compute
162        // the actual cursor position in the result.
163        let hunk_start = first_hunk_offset.unwrap_or(0);
164        let offset = (hunk_start + cursor_offset).min(result.len());
165        result.insert_str(offset, zeta_prompt::CURSOR_MARKER);
166    }
167
168    match version {
169        ZetaFormat::V0120GitMergeMarkers
170        | ZetaFormat::V0131GitMergeMarkersPrefix
171        | ZetaFormat::V0211SeedCoder => {
172            if !result.ends_with('\n') {
173                result.push('\n');
174            }
175            result.push_str(zeta_prompt::v0120_git_merge_markers::END_MARKER);
176        }
177        _ => (),
178    }
179
180    Ok(result)
181}
182
/// Zero-sized namespace for the teacher-model prompt format: its associated functions build the
/// teacher prompt and parse the teacher's response back into a diff.
pub struct TeacherPrompt;
184
185impl TeacherPrompt {
    /// Marker that opens the editable region; the trailing newline is part of the marker.
    pub(crate) const EDITABLE_REGION_START: &str = "<|editable_region_start|>\n";
    /// Marker that closes the editable region; the leading newline is part of the marker.
    pub(crate) const EDITABLE_REGION_END: &str = "\n<|editable_region_end|>";
    /// Marker placed at the cursor position inside the editable region.
    pub(crate) const USER_CURSOR_MARKER: &str = "<|user_cursor|>";
    /// Code block content that `parse` treats as "no edits needed".
    pub(crate) const NO_EDITS: &str = "NO_EDITS";

    /// Truncate edit history to this number of last lines
    const MAX_HISTORY_LINES: usize = 128;
193
194    pub fn format_prompt(
195        example: &Example,
196        editable_range: Range<usize>,
197        context_range: Range<usize>,
198    ) -> String {
199        let edit_history = Self::format_edit_history(&example.spec.edit_history);
200        let context = Self::format_context(example);
201        let cursor_excerpt = Self::format_cursor_excerpt(example, editable_range, context_range);
202
203        let prompt_template = crate::prompt_assets::get_prompt("teacher.md");
204        let prompt = prompt_template
205            .replace("{{context}}", &context)
206            .replace("{{edit_history}}", &edit_history)
207            .replace("{{cursor_excerpt}}", &cursor_excerpt);
208
209        prompt
210    }
211
212    pub fn parse(example: &Example, response: &str) -> Result<(String, Option<ActualCursor>)> {
213        // Extract updated (new) editable region from the model response.
214        // The model may include editable region markers in its output, so we need to strip them.
215        let new_editable_region = extract_last_codeblock(response);
216
217        // Check if the model indicated no edits are needed
218        if new_editable_region.trim() == Self::NO_EDITS {
219            return Ok((String::new(), None));
220        }
221
222        let new_editable_region = Self::extract_editable_region(&new_editable_region)?;
223        let cursor_offset = new_editable_region.find(Self::USER_CURSOR_MARKER);
224        let mut new_editable_region = new_editable_region.replace(Self::USER_CURSOR_MARKER, "");
225        let old_editable_region = Self::extract_editable_region(
226            &example
227                .prompt
228                .as_ref()
229                .context("example prompt missing")?
230                .input,
231        )?
232        .replace(Self::USER_CURSOR_MARKER, "");
233
234        let prompt_inputs = example
235            .prompt_inputs
236            .as_ref()
237            .context("example is missing prompt inputs")?;
238
239        // Normalize leading newlines: if old starts with newline but new doesn't,
240        // prepend newline to new to preserve whitespace structure.
241        // This handles the case where the model drops the leading blank line.
242        if old_editable_region.starts_with('\n') && !new_editable_region.starts_with('\n') {
243            new_editable_region.insert(0, '\n');
244        }
245
246        let (editable_region_offset, _) = prompt_inputs
247            .content
248            .match_indices(&old_editable_region)
249            .min_by_key(|(index, _)| index.abs_diff(prompt_inputs.cursor_offset))
250            .context("editable region not found in prompt content")?;
251        let editable_region_start_line = prompt_inputs.content[..editable_region_offset]
252            .matches('\n')
253            .count();
254
255        // Use full context so cursor offset (relative to editable region start) aligns with diff content
256        let editable_region_lines = old_editable_region.lines().count() as u32;
257        let diff = language::unified_diff_with_context(
258            &old_editable_region,
259            &new_editable_region,
260            editable_region_start_line as u32,
261            editable_region_start_line as u32,
262            editable_region_lines,
263        );
264
265        let diff = indoc::formatdoc! {"
266            --- a/{path}
267            +++ b/{path}
268            {diff}",
269            path = example.spec.cursor_path.to_string_lossy(),
270            diff = diff,
271        };
272
273        let actual_cursor = cursor_offset.map(|editable_region_cursor_offset| {
274            ActualCursor::from_editable_region(
275                &example.spec.cursor_path,
276                editable_region_cursor_offset,
277                &new_editable_region,
278                &prompt_inputs.content,
279                editable_region_offset,
280                editable_region_start_line,
281            )
282        });
283
284        Ok((diff, actual_cursor))
285    }
286
287    fn format_edit_history(edit_history: &str) -> String {
288        // Strip comments ("garbage lines") from edit history
289        let lines = edit_history
290            .lines()
291            .filter(|&s| Self::is_udiff_content_line(s))
292            .collect::<Vec<_>>();
293
294        let history_lines = if lines.len() > Self::MAX_HISTORY_LINES {
295            &lines[lines.len() - Self::MAX_HISTORY_LINES..]
296        } else {
297            &lines
298        };
299
300        if history_lines.is_empty() {
301            return "(No edit history)".to_string();
302        }
303
304        history_lines.join("\n")
305    }
306
307    pub fn format_context(example: &Example) -> String {
308        let related_files = example
309            .prompt_inputs
310            .as_ref()
311            .and_then(|pi| pi.related_files.as_ref());
312
313        let Some(related_files) = related_files else {
314            return "(No context)".to_string();
315        };
316
317        if related_files.is_empty() {
318            return "(No context)".to_string();
319        }
320
321        let mut prompt = String::new();
322        for file in related_files {
323            let path_str = file.path.to_string_lossy();
324            writeln!(&mut prompt, "`````{path_str}").ok();
325
326            let mut prev_row = 0;
327            for excerpt in &file.excerpts {
328                if excerpt.row_range.start > prev_row {
329                    prompt.push_str("\n");
330                }
331                prompt.push_str(&excerpt.text);
332                prompt.push('\n');
333                prev_row = excerpt.row_range.end;
334            }
335            if prev_row < file.max_row {
336                prompt.push_str("\n");
337            }
338            prompt.push_str("\n`````\n");
339        }
340
341        prompt
342    }
343
344    fn format_cursor_excerpt(
345        example: &Example,
346        editable_range: Range<usize>,
347        context_range: Range<usize>,
348    ) -> String {
349        let mut result = String::new();
350
351        let prompt_inputs = example.prompt_inputs.as_ref().unwrap();
352
353        let path_str = example.spec.cursor_path.to_string_lossy();
354        result.push_str(&format!("`````{path_str}\n"));
355        result.push_str(&prompt_inputs.content[context_range.start..editable_range.start]);
356        result.push_str(Self::EDITABLE_REGION_START);
357        result.push_str(&prompt_inputs.content[editable_range.start..prompt_inputs.cursor_offset]);
358        result.push_str(Self::USER_CURSOR_MARKER);
359        result.push_str(&prompt_inputs.content[prompt_inputs.cursor_offset..editable_range.end]);
360        result.push_str(Self::EDITABLE_REGION_END);
361        result.push_str(&prompt_inputs.content[editable_range.end..context_range.end]);
362        result.push_str("\n`````");
363
364        result
365    }
366
367    pub fn extract_editable_region(text: &str) -> Result<String> {
368        let start = text
369            .rfind(Self::EDITABLE_REGION_START)
370            .map_or(0, |pos| pos + Self::EDITABLE_REGION_START.len());
371        let end = text.rfind(Self::EDITABLE_REGION_END).unwrap_or(text.len());
372
373        if start >= end {
374            return Err(anyhow!("Invalid editable region markers"));
375        }
376
377        let region = &text[start..end];
378        Ok(region.strip_suffix('\n').unwrap_or(region).to_string())
379    }
380
381    fn is_udiff_content_line(s: &str) -> bool {
382        s.starts_with("-")
383            || s.starts_with("+")
384            || s.starts_with(" ")
385            || s.starts_with("---")
386            || s.starts_with("+++")
387            || s.starts_with("@@")
388    }
389}
390
391/// Extract the cursor excerpt from an example.
392/// First tries to extract from an existing prompt, then falls back to constructing from prompt_inputs.
393pub fn extract_cursor_excerpt_from_example(example: &Example) -> Option<String> {
394    // If we have the original prompt, extract the cursor excerpt from it
395    if let Some(prompt) = &example.prompt {
396        // Find "# 3. Current File" section and extract the content
397        if let Some(start) = prompt.input.find("# 3. Current File") {
398            let content_start = prompt.input[start..].find('`').map(|i| start + i)?;
399            let backtick_count = prompt.input[content_start..]
400                .chars()
401                .take_while(|&c| c == '`')
402                .count();
403            let content_start = content_start + backtick_count;
404
405            // Find the path line and skip it
406            let newline_pos = prompt.input[content_start..].find('\n')?;
407            let text_start = content_start + newline_pos + 1;
408
409            // Find the closing backticks
410            let closing_pattern = "`".repeat(backtick_count);
411            let text_end = prompt.input[text_start..].find(&closing_pattern)?;
412            let cursor_excerpt = &prompt.input[text_start..text_start + text_end];
413
414            let path_str = example.spec.cursor_path.to_string_lossy();
415            return Some(format!("`````{path_str}\n{cursor_excerpt}`````"));
416        }
417    }
418
419    // Fallback: construct from prompt_inputs if available
420    let prompt_inputs = example.prompt_inputs.as_ref()?;
421    let content = &prompt_inputs.content;
422    let cursor_offset = prompt_inputs.cursor_offset;
423
424    // Simple fallback: just show content around cursor with markers
425    let path_str = example.spec.cursor_path.to_string_lossy();
426    let mut result = format!("`````{path_str}\n");
427    result.push_str(TeacherPrompt::EDITABLE_REGION_START);
428    result.push_str(&content[..cursor_offset]);
429    result.push_str(TeacherPrompt::USER_CURSOR_MARKER);
430    result.push_str(&content[cursor_offset..]);
431    result.push_str(TeacherPrompt::EDITABLE_REGION_END);
432    result.push_str("\n`````");
433
434    Some(result)
435}
436
/// Returns the contents of the last fenced code block in `text`.
///
/// A fence is a run of three or more backticks; the remainder of the opening line (the info
/// string) is skipped. A block is closed by a newline followed by as many backticks as the
/// opening fence, so shorter fences nested inside a longer one are kept as content. The
/// returned contents include the newline that precedes the closing fence. When no complete
/// block is found, the whole of `text` is returned.
pub(crate) fn extract_last_codeblock(text: &str) -> String {
    let mut latest_block: Option<&str> = None;
    let mut cursor = 0;

    while let Some(relative_fence) = text[cursor..].find("```") {
        let fence_start = cursor + relative_fence;
        let fence_len = text[fence_start..]
            .bytes()
            .take_while(|&byte| byte == b'`')
            .count();

        // Skip the info string: the block body starts after the end of the opening line.
        let after_fence = fence_start + fence_len;
        let opening_line_end = text[after_fence..]
            .find('\n')
            .map_or(text.len(), |offset| after_fence + offset);

        let closing_fence = format!("\n{}", "`".repeat(fence_len));
        let Some(close_offset) = text[opening_line_end..].find(&closing_fence) else {
            // Unterminated block: keep whatever complete block was seen before it.
            break;
        };

        // Body runs from just after the opening line's newline up to and including the
        // newline that starts the closing fence.
        latest_block = Some(&text[opening_line_end + 1..opening_line_end + close_offset + 1]);
        cursor = opening_line_end + close_offset + closing_fence.len();
    }

    latest_block.unwrap_or(text).to_string()
}
468
// Unit tests for the response-parsing helpers: `extract_last_codeblock` and
// `TeacherPrompt::extract_editable_region`.
#[cfg(test)]
mod tests {
    use super::*;

    // With several code blocks, the last one wins and its info string is skipped.
    #[test]
    fn test_extract_last_code_block() {
        let text = indoc::indoc! {"
            Some thinking

            ```
            first block
            ```

            `````path='something' lines=1:2
            last block
            `````
            "};
        let last_block = extract_last_codeblock(text);
        assert_eq!(last_block, "last block\n");
    }

    // Triple-backtick fences inside a five-backtick block are kept as content.
    #[test]
    fn test_extract_codeblock_with_nested_fences() {
        let text = indoc::indoc! {"
            `````
            content with ``` inline
            and ```python nested
            more content
            `````
            "};
        let last_block = extract_last_codeblock(text);
        assert_eq!(
            last_block,
            "content with ``` inline\nand ```python nested\nmore content\n"
        );
    }

    // Backticks in the middle of a line inside a block do not terminate it.
    #[test]
    fn test_extract_codeblock_ignores_inline_backticks() {
        let text = indoc::indoc! {"
            `````
            here is some `code` with inline backticks
            and here```more```stuff
            `````
            "};
        let last_block = extract_last_codeblock(text);
        assert_eq!(
            last_block,
            "here is some `code` with inline backticks\nand here```more```stuff\n"
        );
    }

    // Only the text between the markers is returned, minus one trailing newline.
    #[test]
    fn test_extract_editable_region() {
        let text = indoc::indoc! {"
            some lines
            are
            here
            <|editable_region_start|>
            one
            two three

            <|editable_region_end|>
            more
            lines here
            "};
        let parsed = TeacherPrompt::extract_editable_region(text).unwrap();
        assert_eq!(
            parsed,
            indoc::indoc! {"
            one
            two three"}
        );
    }

    // A complete triple-backtick block nested in a five-backtick block stays intact.
    #[test]
    fn test_extract_last_codeblock_nested_bibtex() {
        let text = indoc::indoc! {r#"
            Looking at the edit history, I can see that a Citation section was just added.

            `````
            ## Collaborations
            Our mission is to create a 4D generative model.

            ## Citation

            If you found Unique3D helpful, please cite our report:
            ```bibtex
            @misc{wu2024unique3d,
                  title={Unique3D},
            }
            ```
            `````
            "#};
        let last_block = extract_last_codeblock(text);
        assert_eq!(
            last_block,
            indoc::indoc! {r#"
            ## Collaborations
            Our mission is to create a 4D generative model.

            ## Citation

            If you found Unique3D helpful, please cite our report:
            ```bibtex
            @misc{wu2024unique3d,
                  title={Unique3D},
            }
            ```
            "#}
        );
    }

    // Without markers the whole text is treated as the editable region.
    #[test]
    fn test_extract_editable_region_no_markers() {
        let text = indoc::indoc! {"
            one
            two three"};
        let parsed = TeacherPrompt::extract_editable_region(text).unwrap();
        assert_eq!(
            parsed,
            indoc::indoc! {"
            one
            two three"}
        );
    }

    // A `NO_EDITS` response is recognised from the trimmed content of the last code block.
    #[test]
    fn test_parse_no_edits_response() {
        let response = indoc::indoc! {"
            The code is already complete. There is no clear next edit to make.

            `````
            NO_EDITS
            `````
        "};
        let codeblock = extract_last_codeblock(response);
        assert_eq!(codeblock.trim(), TeacherPrompt::NO_EDITS);
    }
}