diff --git a/crates/edit_prediction_cli/src/format_prompt.rs b/crates/edit_prediction_cli/src/format_prompt.rs index f76f4899092c7add08897b887a6398a5f7a2bded..44df820d7611f6fa62aec66259bb203e09de428a 100644 --- a/crates/edit_prediction_cli/src/format_prompt.rs +++ b/crates/edit_prediction_cli/src/format_prompt.rs @@ -421,7 +421,7 @@ pub fn extract_cursor_excerpt_from_example(example: &Example) -> Option Some(result) } -fn extract_last_codeblock(text: &str) -> String { +pub(crate) fn extract_last_codeblock(text: &str) -> String { let mut last_block = None; let mut search_start = 0; diff --git a/crates/edit_prediction_cli/src/main.rs b/crates/edit_prediction_cli/src/main.rs index e22103634047fa306e0eff79f9f3146f1cda19c8..95e8332b44741ca7bdfb173282508f960d8d0303 100644 --- a/crates/edit_prediction_cli/src/main.rs +++ b/crates/edit_prediction_cli/src/main.rs @@ -1068,6 +1068,10 @@ fn main() { score::write_summary_json(&examples, summary_path)?; } } + Command::Repair(args) => { + let examples = finished_examples.lock().unwrap(); + repair::print_report(&examples, args.confidence_threshold); + } _ => (), }; diff --git a/crates/edit_prediction_cli/src/prompts/repair.md b/crates/edit_prediction_cli/src/prompts/repair.md index 3fb32cc5f5abdbf3e03a8c08f125096da498d7da..4fc0b6b66201ac9dec5147655c073b3defcf7455 100644 --- a/crates/edit_prediction_cli/src/prompts/repair.md +++ b/crates/edit_prediction_cli/src/prompts/repair.md @@ -13,11 +13,21 @@ A previous model generated a prediction that was judged to have issues. Your job ## Rules +- **NEVER undo or revert the user's recent edits.** Examine the diff in the edit history carefully: + - If a line was removed (starts with `-`), do NOT restore that content—even if the code now appears incomplete or broken without it + - If a line was added (starts with `+`), do NOT delete or significantly modify it + - If code appears broken or incomplete after the user's edit, output `NO_EDITS` rather than "fixing" it by reverting + - Only add NEW content that extends the user's work forward; never restore what they removed + - **Key test**: if your prediction would make the code more similar to what it was BEFORE the user's edit, output `NO_EDITS` instead + - **Never assume a deletion was accidental.** Even if removing content breaks the code, breaks a pattern, or leaves text looking "incomplete", respect it. The user may be mid-rewrite. Do NOT "complete" partial text by restoring what was deleted. - Do not just mechanically apply patterns - reason about what changes make sense given the context and the programmer's apparent goals. - Do not just fix syntax errors - look for the broader refactoring pattern and apply it systematically throughout the code. - Keep existing formatting unless it's absolutely necessary +- When edit history and surrounding code suggest different edits, prioritize the most recent edits in the history as they best reflect current intent. +- When uncertain, predict only the minimal, high-confidence portion of the edit. Prefer a small, correct prediction over a large, speculative one. - Don't write a lot of code if you're not sure what to do -- Do not delete or remove text that was just added in the edit history. If a recent edit introduces incomplete or incorrect code, finish or fix it in place, or simply do nothing rather than removing it. Only remove a recent edit if the history explicitly shows the user undoing it themselves. +- Do not delete or remove text that was just added in the edit history. If a recent edit introduces incomplete or incorrect code, finish or fix it in place, or simply output `NO_EDITS` rather than removing it. Only remove a recent edit if the history explicitly shows the user undoing it themselves. +- Treat partial text at or near the cursor as the beginning of something the user is actively typing. Complete the code the user appears to be creating based on context. # Input Format @@ -29,7 +39,7 @@ You will be provided with: - Within the user's current file, there is an *editable region* delimited by the `<|editable_region_start|>` and `<|editable_region_end|>` tags. You can only predict edits in this region. - The `<|user_cursor|>` tag marks the user's current cursor position, as it stands after the last edit in the history. 4. The *previous prediction* that was generated and needs improvement. -5. *Quality assessment feedback* explaining why the previous prediction was problematic. +5. *Quality feedback* explaining why the previous prediction was problematic. # Output Format @@ -37,6 +47,30 @@ You will be provided with: - Output the entire editable region, applying the edits that you predict the user will make next. - If you're unsure about some portion of the next edit, you may still predict the surrounding code (such as a function definition, `for` loop, etc) and place the `<|user_cursor|>` within it for the user to fill in. - Wrap the edited code in a codeblock with exactly five backticks. +- There are two special outputs for when you don't want to generate a new prediction. **These have different meanings — use the correct one:** + + 1. **`NO_EDITS`** — The code is already complete and correct as-is. No edits should be made at all. The editable region should remain unchanged. Use this when: + - The code needs no modifications whatsoever + - Any prediction would revert or undo the user's intentional changes + - You are unsure what edit to make and prefer to do nothing + + ````` + NO_EDITS + ````` + + 2. **`KEEP_PREVIOUS`** — The previous prediction was actually correct and should be used as-is. Use this when: + - After reviewing the quality feedback, you determine the previous prediction is good + - You cannot find a meaningful improvement over the previous prediction + - The quality feedback was too cautious and the previous prediction correctly addresses the user's intent + + ````` + KEEP_PREVIOUS + ````` + + **Important:** `NO_EDITS` and `KEEP_PREVIOUS` are NOT interchangeable. + - `NO_EDITS` means "make zero changes to the code" (empty prediction). + - `KEEP_PREVIOUS` means "the previous prediction is correct, use it" (reuse the previous prediction). + - If you believe the previous prediction was correct, you MUST use `KEEP_PREVIOUS`, not `NO_EDITS`. Using `NO_EDITS` would discard the previous prediction entirely. # 1. User Edits History @@ -60,12 +94,10 @@ The previous model generated the following edit (in word-diff format): {actual_patch_word_diff} ````` -# 5. Quality Assessment Feedback +# 5. Quality Feedback -- **Reverts user edits**: {reverts_edits} -- **Confidence score**: {confidence}/5 -- **Reasoning**: {qa_reasoning} +{quality_feedback} # Your Improved Prediction -Based on the feedback above, generate an improved prediction. Address the issues identified in the quality assessment. +Based on the feedback above, generate an improved prediction. Address the issues identified in the quality feedback. If the previous prediction was actually correct, output `KEEP_PREVIOUS`. If no edits should be made at all, output `NO_EDITS`. \ No newline at end of file diff --git a/crates/edit_prediction_cli/src/repair.rs b/crates/edit_prediction_cli/src/repair.rs index 1112d5ea567e8b3083ebfd37843bd9d534bb9a7b..b4f1f70f125be981efa44816dc63e79d0012a416 100644 --- a/crates/edit_prediction_cli/src/repair.rs +++ b/crates/edit_prediction_cli/src/repair.rs @@ -1,14 +1,16 @@ -//! Repair predictions that received poor QA scores. +//! Repair predictions that received poor quality signals. //! -//! This module takes examples with predictions and QA feedback, identifies -//! predictions that need improvement (based on reverts_edits or low confidence), -//! and uses an LLM to generate improved predictions. +//! This module takes examples with predictions, identifies predictions that need +//! improvement, and uses an LLM to generate improved predictions. It supports +//! two sources of quality signals: +//! - QA feedback (reverts_edits or low confidence) +//! - Computed scores when QA is unavailable (high reversal_ratio or wrong_editable_region) use crate::{ BatchProvider, PredictionProvider, anthropic_client::AnthropicClient, example::{Example, ExamplePrediction}, - format_prompt::{TeacherPrompt, extract_cursor_excerpt_from_example}, + format_prompt::{TeacherPrompt, extract_cursor_excerpt_from_example, extract_last_codeblock}, openai_client::OpenAiClient, parse_output::run_parse_output, paths::LLM_CACHE_DB, @@ -18,6 +20,46 @@ use crate::{ use anyhow::{Context as _, Result}; use std::sync::OnceLock; +const KEEP_PREVIOUS: &str = "KEEP_PREVIOUS"; + +/// Print a summary report of repair results across all examples. +pub fn print_report(examples: &[Example], confidence_threshold: u8) { + let total = examples.len(); + let mut no_repair_needed = 0; + let mut repaired = 0; + let mut repair_failed = 0; + + for example in examples { + if !needs_repair(example, confidence_threshold) { + no_repair_needed += 1; + continue; + } + + if has_successful_repair(example) { + repaired += 1; + } else { + repair_failed += 1; + } + } + + let needed_repair = total - no_repair_needed; + + eprintln!(); + eprintln!("Repair summary ({total} examples):"); + eprintln!( + " {no_repair_needed}/{total} didn't need repair (confidence > {confidence_threshold})" + ); + if needed_repair > 0 { + eprintln!(" {needed_repair}/{total} needed repair:"); + if repaired > 0 { + eprintln!(" {repaired} repaired successfully"); + } + if repair_failed > 0 { + eprintln!(" {repair_failed} failed to repair"); + } + } +} + /// Arguments for the repair command. #[derive(Debug, Clone, clap::Args)] pub struct RepairArgs { @@ -41,18 +83,77 @@ fn model_for_backend(backend: BatchProvider) -> &'static str { } } +/// Build the quality feedback string from QA results. +fn build_qa_feedback(example: &Example) -> Option { + let qa = example.qa.first()?.as_ref()?; + + let qa_reasoning = qa.reasoning.as_deref().unwrap_or("No reasoning provided"); + let reverts_edits = qa + .reverts_edits + .map_or("unknown", |v| if v { "yes" } else { "no" }); + let confidence = qa + .confidence + .map_or("unknown".to_string(), |v| v.to_string()); + + Some(format!( + "- **Reverts user edits**: {reverts_edits}\n\ + - **Confidence score**: {confidence}/5\n\ + - **Reasoning**: {qa_reasoning}" + )) +} + +/// Build the quality feedback string from computed scores when QA is unavailable. +fn build_score_feedback(example: &Example) -> Option { + let score = example.score.first()?; + + let mut issues = Vec::new(); + + if score.reversal_ratio > 0.9 { + issues.push(format!( + "Automated analysis detected a high reversal ratio ({:.2}), which suggests this \ + prediction may be reverting changes the user intentionally made. Double-check that \ + the prediction doesn't undo the user's recent edits. If the prediction is actually \ + fine and the edits are intentional completions rather than reversals, keep it as-is. \ + If it truly reverts the user's changes, generate an improved prediction that \ + continues the user's intent instead.", + score.reversal_ratio + )); + } + + if score.wrong_editable_region == Some(true) { + issues.push( + "Automated analysis detected that the prediction may be modifying code outside \ + the expected editable region, or producing changes misaligned with the editable \ + region boundaries. Make sure the prediction only modifies code within the editable \ + region and is properly aligned." + .to_string(), + ); + } + + if issues.is_empty() { + return None; + } + + let mut feedback = String::from( + "No human quality assessment is available, but automated scoring flagged potential issues:\n\n", + ); + for issue in &issues { + feedback.push_str(&format!("- {issue}\n")); + } + feedback.push_str( + "\nRemember: if the previous prediction was actually correct, output `KEEP_PREVIOUS`. \ + If no edits should be made at all and you are unsure how to improve it, output `NO_EDITS`.", + ); + + Some(feedback) +} + /// Build the repair prompt for an example that needs improvement. pub fn build_repair_prompt(example: &Example) -> Result { let prediction = example .predictions .first() .context("no predictions available")?; - let qa = example - .qa - .first() - .context("no QA results available")? - .as_ref() - .context("QA result is None")?; let prompt_inputs = example .prompt_inputs .as_ref() @@ -62,6 +163,10 @@ pub fn build_repair_prompt(example: &Example) -> Result { .as_ref() .context("no actual_patch available (run predict first)")?; + let quality_feedback = build_qa_feedback(example) + .or_else(|| build_score_feedback(example)) + .context("no quality feedback available (need either QA results or computed scores)")?; + let actual_patch_word_diff = unified_to_word_diff(actual_patch); let mut edit_history = String::new(); @@ -88,37 +193,39 @@ pub fn build_repair_prompt(example: &Example) -> Result { let cursor_excerpt = extract_cursor_excerpt_from_example(example).context("failed to extract cursor excerpt")?; - let qa_reasoning = qa.reasoning.as_deref().unwrap_or("No reasoning provided"); - let reverts_edits = qa - .reverts_edits - .map_or("unknown", |v| if v { "yes" } else { "no" }); - let confidence = qa - .confidence - .map_or("unknown".to_string(), |v| v.to_string()); - let prompt_template = crate::prompt_assets::get_prompt("repair.md"); Ok(prompt_template .replace("{edit_history}", &edit_history) .replace("{context}", &context) .replace("{cursor_excerpt}", &cursor_excerpt) .replace("{actual_patch_word_diff}", &actual_patch_word_diff) - .replace("{reverts_edits}", reverts_edits) - .replace("{confidence}", &confidence) - .replace("{qa_reasoning}", qa_reasoning)) + .replace("{quality_feedback}", &quality_feedback)) } -/// Check if an example needs repair based on QA feedback. +/// Check if an example needs repair based on QA feedback or computed scores. pub fn needs_repair(example: &Example, confidence_threshold: u8) -> bool { - let Some(qa) = example.qa.first().and_then(|q| q.as_ref()) else { - return false; - }; + // Check QA-based signals first. + if let Some(qa) = example.qa.first().and_then(|q| q.as_ref()) { + if qa.reverts_edits == Some(true) { + return true; + } - if qa.reverts_edits == Some(true) { - return true; + if let Some(confidence) = qa.confidence { + if confidence <= confidence_threshold { + return true; + } + } + + return false; } - if let Some(confidence) = qa.confidence { - if confidence <= confidence_threshold { + // When QA is unavailable, fall back to computed score signals. + if let Some(score) = example.score.first() { + if score.reversal_ratio > 0.9 { + return true; + } + + if score.wrong_editable_region == Some(true) { return true; } } @@ -163,10 +270,6 @@ pub async fn run_repair( anyhow::bail!("no predictions available (run predict first)"); } - if example.qa.is_empty() { - anyhow::bail!("no QA results available (run qa first)"); - } - let step_progress = example_progress.start(Step::Repair); let model = model_for_backend(args.backend); @@ -251,22 +354,37 @@ pub async fn run_repair( } }; - let parse_result = TeacherPrompt::parse(example, &response); - let err = parse_result - .as_ref() - .err() - .map(|e| format!("Failed to parse repair response: {}", e)); - - let (actual_patch, actual_cursor) = parse_result.ok().unzip(); - let actual_cursor = actual_cursor.flatten(); - - example.predictions.push(ExamplePrediction { - actual_patch, - actual_output: response, - actual_cursor, - error: err, - provider: PredictionProvider::Repair, - }); + let last_codeblock = extract_last_codeblock(&response); + if last_codeblock.trim() == KEEP_PREVIOUS { + let original = example + .predictions + .first() + .context("no original prediction to keep")?; + example.predictions.push(ExamplePrediction { + actual_patch: original.actual_patch.clone(), + actual_output: response, + actual_cursor: original.actual_cursor.clone(), + error: None, + provider: PredictionProvider::Repair, + }); + } else { + let parse_result = TeacherPrompt::parse(example, &response); + let err = parse_result + .as_ref() + .err() + .map(|e| format!("Failed to parse repair response: {}", e)); + + let (actual_patch, actual_cursor) = parse_result.ok().unzip(); + let actual_cursor = actual_cursor.flatten(); + + example.predictions.push(ExamplePrediction { + actual_patch, + actual_output: response, + actual_cursor, + error: err, + provider: PredictionProvider::Repair, + }); + } Ok(()) }