Detailed changes
@@ -48,7 +48,9 @@ pub async fn run_format_prompt(
let snapshot = cx.background_spawn(snapshot_fut).await;
match args.provider {
- PredictionProvider::Teacher(_) | PredictionProvider::TeacherNonBatching(_) => {
+ PredictionProvider::Teacher(_)
+ | PredictionProvider::TeacherNonBatching(_)
+ | PredictionProvider::RepairedTeacher(_) => {
step_progress.set_substatus("formatting teacher prompt");
let (editable_range, context_range) = editable_and_context_ranges_for_cursor_position(
@@ -0,0 +1,119 @@
+//! Shared LLM client abstraction for Anthropic and OpenAI.
+//!
+//! This module provides a unified interface for making LLM requests,
+//! supporting both synchronous and batch modes.
+
+use crate::BatchProvider;
+use crate::anthropic_client::AnthropicClient;
+use crate::openai_client::OpenAiClient;
+use crate::paths::LLM_CACHE_DB;
+use anyhow::Result;
+
+/// A unified LLM client that wraps either Anthropic or OpenAI.
+pub enum LlmClient {
+ Anthropic(AnthropicClient),
+ OpenAi(OpenAiClient),
+}
+
+impl LlmClient {
+ /// Create a new LLM client for the given backend.
+ ///
+ /// If `batched` is true, requests will be queued for batch processing.
+ /// Otherwise, requests are made synchronously.
+ pub fn new(backend: BatchProvider, batched: bool) -> Result<Self> {
+ match backend {
+ BatchProvider::Anthropic => {
+ if batched {
+ Ok(LlmClient::Anthropic(AnthropicClient::batch(&LLM_CACHE_DB)?))
+ } else {
+ Ok(LlmClient::Anthropic(AnthropicClient::plain()?))
+ }
+ }
+ BatchProvider::Openai => {
+ if batched {
+ Ok(LlmClient::OpenAi(OpenAiClient::batch(&LLM_CACHE_DB)?))
+ } else {
+ Ok(LlmClient::OpenAi(OpenAiClient::plain()?))
+ }
+ }
+ }
+ }
+
+ /// Generate a response from the LLM.
+ ///
+ /// Returns `Ok(None)` if the request was queued for batch processing
+ /// and results are not yet available.
+ pub async fn generate(
+ &self,
+ model: &str,
+ max_tokens: u64,
+ prompt: &str,
+ ) -> Result<Option<String>> {
+ match self {
+ LlmClient::Anthropic(client) => {
+ let messages = vec![anthropic::Message {
+ role: anthropic::Role::User,
+ content: vec![anthropic::RequestContent::Text {
+ text: prompt.to_string(),
+ cache_control: None,
+ }],
+ }];
+ let response = client.generate(model, max_tokens, messages, None).await?;
+ Ok(response.map(|r| {
+ r.content
+ .iter()
+ .filter_map(|c| match c {
+ anthropic::ResponseContent::Text { text } => Some(text.as_str()),
+ _ => None,
+ })
+ .collect::<Vec<_>>()
+ .join("")
+ }))
+ }
+ LlmClient::OpenAi(client) => {
+ let messages = vec![open_ai::RequestMessage::User {
+ content: open_ai::MessageContent::Plain(prompt.to_string()),
+ }];
+ let response = client.generate(model, max_tokens, messages, None).await?;
+ Ok(response.map(|r| {
+ r.choices
+ .into_iter()
+ .filter_map(|choice| match choice.message {
+ open_ai::RequestMessage::Assistant { content, .. } => {
+ content.map(|c| match c {
+ open_ai::MessageContent::Plain(text) => text,
+ open_ai::MessageContent::Multipart(parts) => parts
+ .into_iter()
+ .filter_map(|p| match p {
+ open_ai::MessagePart::Text { text } => Some(text),
+ _ => None,
+ })
+ .collect::<Vec<_>>()
+ .join(""),
+ })
+ }
+ _ => None,
+ })
+ .collect::<Vec<_>>()
+ .join("")
+ }))
+ }
+ }
+ }
+
+    /// Sync pending batches: upload queued requests and download completed results.
+ pub async fn sync_batches(&self) -> Result<()> {
+ match self {
+ LlmClient::Anthropic(client) => client.sync_batches().await,
+ LlmClient::OpenAi(client) => client.sync_batches().await,
+ }
+ }
+}
+
+/// Get the model name for a given backend.
+pub fn model_for_backend(backend: BatchProvider) -> &'static str {
+ match backend {
+ BatchProvider::Anthropic => "claude-sonnet-4-5",
+ BatchProvider::Openai => "gpt-5.2",
+ }
+}
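
A quick usage sketch of the new `LlmClient` (hypothetical caller code, not part of the diff; it assumes an async context inside this crate and uses only the items introduced above):

```rust
// Hypothetical caller, using only items introduced in llm_client.rs.
use crate::BatchProvider;
use crate::llm_client::{LlmClient, model_for_backend};
use anyhow::Result;

async fn summarize(prompt: &str) -> Result<()> {
    // `batched = false`: the request is sent immediately and a response is returned.
    let client = LlmClient::new(BatchProvider::Anthropic, false)?;
    let model = model_for_backend(BatchProvider::Anthropic);

    match client.generate(model, 1024, prompt).await? {
        Some(text) => println!("{text}"),
        // With `batched = true`, `Ok(None)` means the request was queued;
        // `client.sync_batches().await?` later uploads queued requests and
        // downloads completed results.
        None => eprintln!("request queued for batch processing"),
    }
    Ok(())
}
```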
@@ -5,6 +5,7 @@ mod filter_languages;
mod format_prompt;
mod git;
mod headless;
+mod llm_client;
mod load_project;
mod metrics;
mod openai_client;
@@ -291,6 +292,7 @@ enum PredictionProvider {
Zeta2(ZetaVersion),
Teacher(TeacherBackend),
TeacherNonBatching(TeacherBackend),
+ RepairedTeacher(TeacherBackend),
Repair,
}
@@ -311,6 +313,9 @@ impl std::fmt::Display for PredictionProvider {
PredictionProvider::TeacherNonBatching(backend) => {
write!(f, "teacher-non-batching:{backend}")
}
+ PredictionProvider::RepairedTeacher(backend) => {
+ write!(f, "repaired-teacher:{backend}")
+ }
PredictionProvider::Repair => write!(f, "repair"),
}
}
@@ -345,10 +350,17 @@ impl std::str::FromStr for PredictionProvider {
.unwrap_or(TeacherBackend::Sonnet45);
Ok(PredictionProvider::TeacherNonBatching(backend))
}
+ "repaired-teacher" | "repaired_teacher" | "repairedteacher" => {
+ let backend = arg
+ .map(|a| a.parse())
+ .transpose()?
+ .unwrap_or(TeacherBackend::Sonnet45);
+ Ok(PredictionProvider::RepairedTeacher(backend))
+ }
"repair" => Ok(PredictionProvider::Repair),
_ => {
anyhow::bail!(
- "unknown provider `{provider}`. Valid options: sweep, mercury, zeta1, zeta2, zeta2:<version>, teacher, teacher:<backend>, teacher-non-batching, repair\n\
+ "unknown provider `{provider}`. Valid options: sweep, mercury, zeta1, zeta2, zeta2:<version>, teacher, teacher:<backend>, teacher-non-batching, repaired-teacher, repair\n\
For zeta2, you can optionally specify a version like `zeta2:ordered` or `zeta2:V0113_Ordered`.\n\
For teacher, you can specify a backend like `teacher:sonnet45` or `teacher:gpt52`.\n\
Available zeta versions:\n{}",
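
For reference, the new spelling goes through the same `FromStr` path as the existing providers; a small sketch (it assumes, as the error message above implies, that an optional `:<backend>` suffix is split off before this match, and that the default backend is Sonnet 4.5):

```rust
// Illustrative only; relies on the FromStr impl shown above.
use crate::{PredictionProvider, TeacherBackend};

fn parse_example() -> anyhow::Result<()> {
    // Bare name: falls back to the default backend.
    let provider: PredictionProvider = "repaired-teacher".parse()?;
    assert!(matches!(
        provider,
        PredictionProvider::RepairedTeacher(TeacherBackend::Sonnet45)
    ));
    Ok(())
}
```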
@@ -1,13 +1,16 @@
use crate::{
- FormatPromptArgs, PredictArgs, PredictionProvider, TeacherBackend,
+ BatchProvider, FormatPromptArgs, PredictArgs, PredictionProvider, TeacherBackend,
anthropic_client::AnthropicClient,
example::{Example, ExamplePrediction, ExamplePrompt},
format_prompt::{TeacherPrompt, run_format_prompt},
headless::EpAppState,
+ llm_client::{LlmClient, model_for_backend},
load_project::run_load_project,
openai_client::OpenAiClient,
paths::{LATEST_EXAMPLE_RUN_DIR, RUN_DIR},
progress::{ExampleProgress, InfoStyle, Step},
+ qa,
+ repair::{build_repair_prompt, needs_repair, parse_repair_response},
retrieve_context::run_context_retrieval,
};
use anyhow::Context as _;
@@ -72,6 +75,21 @@ pub async fn run_prediction(
return predict_teacher(example, backend, batched, repetition_count).await;
}
+ if let PredictionProvider::RepairedTeacher(backend) = provider {
+ let _step_progress = example_progress.start(Step::Predict);
+
+ run_format_prompt(
+ example,
+ &FormatPromptArgs { provider },
+ app_state.clone(),
+ example_progress,
+ cx,
+ )
+ .await?;
+
+ return predict_repaired_teacher(example, backend, repetition_count).await;
+ }
+
run_load_project(example, app_state.clone(), example_progress, cx.clone()).await?;
let step_progress = example_progress.start(Step::Predict);
@@ -110,6 +128,7 @@ pub async fn run_prediction(
PredictionProvider::Mercury => edit_prediction::EditPredictionModel::Mercury,
PredictionProvider::Teacher(..)
| PredictionProvider::TeacherNonBatching(..)
+ | PredictionProvider::RepairedTeacher(..)
| PredictionProvider::Repair => {
unreachable!()
}
@@ -407,6 +426,83 @@ async fn predict_openai(
Ok(())
}
+/// Default confidence threshold for repair
+const DEFAULT_REPAIR_CONFIDENCE_THRESHOLD: u8 = 3;
+
+/// Predict using the teacher model, then run QA evaluation, and optionally repair
+/// if QA flags issues (`reverts_edits` is true or confidence is at or below the threshold).
+///
+/// This is a non-batched flow that processes each step synchronously.
+async fn predict_repaired_teacher(
+ example: &mut Example,
+ backend: TeacherBackend,
+ repetition_count: usize,
+) -> anyhow::Result<()> {
+ // Step 1: Run teacher prediction (non-batched for immediate results)
+ predict_teacher(example, backend, false, repetition_count).await?;
+
+ // Only proceed with QA/repair for the first prediction
+ let Some(prediction) = example.predictions.first() else {
+ return Ok(());
+ };
+
+ // Skip QA if no actual patch was generated
+ if prediction.actual_patch.is_none() {
+ return Ok(());
+ }
+
+ // Step 2: Run QA evaluation
+ let batch_provider = match backend {
+ TeacherBackend::Sonnet45 => BatchProvider::Anthropic,
+ TeacherBackend::Gpt52 => BatchProvider::Openai,
+ };
+ let qa_client = LlmClient::new(batch_provider, false)?;
+ let qa_model = model_for_backend(batch_provider);
+
+ let qa_result = if let Some(qa_prompt) = qa::build_prompt(example) {
+        qa_client
+            .generate(qa_model, 1024, &qa_prompt)
+            .await?
+            .map(|response_text| qa::parse_response(&response_text))
+ } else {
+ None
+ };
+
+ // Store QA result
+    example.qa = vec![qa_result];
+
+ // Step 3: Check if repair is needed and run repair if so
+ if needs_repair(example, DEFAULT_REPAIR_CONFIDENCE_THRESHOLD) {
+ let repair_client = LlmClient::new(batch_provider, false)?;
+
+ if let Some(repair_prompt) = build_repair_prompt(example) {
+ if let Some(response_text) = repair_client
+ .generate(qa_model, 16384, &repair_prompt)
+ .await?
+ {
+ match parse_repair_response(example, &response_text) {
+ Ok(mut repaired_prediction) => {
+ // Mark the prediction as coming from repaired-teacher
+ repaired_prediction.provider = PredictionProvider::RepairedTeacher(backend);
+ example.predictions.push(repaired_prediction);
+ }
+ Err(e) => {
+ // Add error prediction if parsing failed
+ example.predictions.push(ExamplePrediction {
+ actual_patch: None,
+ actual_output: response_text,
+ error: Some(format!("Failed to parse repair response: {}", e)),
+ provider: PredictionProvider::RepairedTeacher(backend),
+ });
+ }
+ }
+ }
+ }
+ }
+
+ Ok(())
+}
+
pub async fn sync_batches(provider: Option<&PredictionProvider>) -> anyhow::Result<()> {
match provider {
Some(PredictionProvider::Teacher(backend)) => match backend {
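
The repair gate itself is `repair::needs_repair` (only its signature appears in this diff). The condition described in the doc comment above amounts to roughly the following sketch; the `QaResult` field names are assumptions based on that description:

```rust
// Illustrative only: the real check lives in repair.rs and may differ in detail.
use crate::qa::QaResult;

fn needs_repair_sketch(qa: Option<&QaResult>, confidence_threshold: u8) -> bool {
    match qa {
        // Repair when QA says the prediction reverts recent edits, or when its
        // self-reported confidence is at or below the threshold (default 3).
        Some(result) => result.reverts_edits || result.confidence <= confidence_threshold,
        // No QA verdict means there is nothing to act on.
        None => false,
    }
}
```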
@@ -18,6 +18,7 @@ A previous model generated a prediction that was judged to have issues. Your job
- Keep existing formatting unless it's absolutely necessary
- Don't write a lot of code if you're not sure what to do
- Do not delete or remove text that was just added in the edit history. If a recent edit introduces incomplete or incorrect code, finish or fix it in place, or simply do nothing rather than removing it. Only remove a recent edit if the history explicitly shows the user undoing it themselves.
+- When uncertain, predict only the minimal, high-confidence portion of the edit. Prefer a small, correct prediction over a large, speculative one
# Input Format
@@ -34,9 +35,9 @@ You will be provided with:
# Output Format
- Briefly explain what was wrong with the previous prediction and how you'll improve it.
-- Output the entire editable region, applying the edits that you predict the user will make next.
-- If you're unsure about some portion of the next edit, you may still predict the surrounding code (such as a function definition, `for` loop, etc) and place the `<|user_cursor|>` within it for the user to fill in.
+- Output a markdown codeblock containing **only** the editable region with your predicted edits applied. The codeblock must start with `<|editable_region_start|>` and end with `<|editable_region_end|>`. Do not include any content before or after these tags.
- Wrap the edited code in a codeblock with exactly five backticks.
+- If you're unsure about some portion of the next edit, you may still predict the surrounding code (such as a function definition, `for` loop, etc) and place the `<|user_cursor|>` within it for the user to fill in.
# 1. User Edits History
@@ -68,4 +69,4 @@ The previous model generated the following edit (in word-diff format):
# Your Improved Prediction
-Based on the feedback above, generate an improved prediction. Address the issues identified in the quality assessment.
+Based on the feedback above, generate an improved prediction. Address the issues identified in the quality assessment. Prefer a small, correct prediction over a large, speculative one.
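
For illustration, a response following the revised format would look roughly like this (the explanation and Rust content are invented; the region tags and the five-backtick fence are the ones required above):

The previous prediction rewrote the whole function; this one only completes the new parameter.

`````
<|editable_region_start|>
fn add(a: i32, b: i32) -> i32 {
    a + b
}
<|editable_region_end|>
`````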
@@ -4,11 +4,9 @@
//! Caching is handled by the underlying client.
use crate::BatchProvider;
-use crate::anthropic_client::AnthropicClient;
use crate::example::Example;
use crate::format_prompt::extract_cursor_excerpt_from_example;
-use crate::openai_client::OpenAiClient;
-use crate::paths::LLM_CACHE_DB;
+use crate::llm_client::{LlmClient, model_for_backend};
use crate::word_diff::unified_to_word_diff;
use anyhow::Result;
use serde::{Deserialize, Serialize};
@@ -33,13 +31,6 @@ pub struct QaArgs {
pub backend: BatchProvider,
}
-fn model_for_backend(backend: BatchProvider) -> &'static str {
- match backend {
- BatchProvider::Anthropic => "claude-sonnet-4-5",
- BatchProvider::Openai => "gpt-5.2",
- }
-}
-
/// Result of QA evaluation for a single prediction.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QaResult {
@@ -120,7 +111,7 @@ fn extract_codeblock(response: &str) -> Option<String> {
}
/// Parse the LLM response into a QaResult.
-fn parse_response(response_text: &str) -> QaResult {
+pub(crate) fn parse_response(response_text: &str) -> QaResult {
let codeblock = extract_codeblock(response_text);
// Try parsing codeblock first, then fall back to raw response
@@ -156,73 +147,6 @@ fn parse_response(response_text: &str) -> QaResult {
}
}
-enum QaClient {
- Anthropic(AnthropicClient),
- OpenAi(OpenAiClient),
-}
-
-impl QaClient {
- async fn generate(&self, model: &str, max_tokens: u64, prompt: &str) -> Result<Option<String>> {
- match self {
- QaClient::Anthropic(client) => {
- let messages = vec![anthropic::Message {
- role: anthropic::Role::User,
- content: vec![anthropic::RequestContent::Text {
- text: prompt.to_string(),
- cache_control: None,
- }],
- }];
- let response = client.generate(model, max_tokens, messages, None).await?;
- Ok(response.map(|r| {
- r.content
- .iter()
- .filter_map(|c| match c {
- anthropic::ResponseContent::Text { text } => Some(text.as_str()),
- _ => None,
- })
- .collect::<Vec<_>>()
- .join("")
- }))
- }
- QaClient::OpenAi(client) => {
- let messages = vec![open_ai::RequestMessage::User {
- content: open_ai::MessageContent::Plain(prompt.to_string()),
- }];
- let response = client.generate(model, max_tokens, messages, None).await?;
- Ok(response.map(|r| {
- r.choices
- .into_iter()
- .filter_map(|choice| match choice.message {
- open_ai::RequestMessage::Assistant { content, .. } => {
- content.map(|c| match c {
- open_ai::MessageContent::Plain(text) => text,
- open_ai::MessageContent::Multipart(parts) => parts
- .into_iter()
- .filter_map(|p| match p {
- open_ai::MessagePart::Text { text } => Some(text),
- _ => None,
- })
- .collect::<Vec<_>>()
- .join(""),
- })
- }
- _ => None,
- })
- .collect::<Vec<_>>()
- .join("")
- }))
- }
- }
- }
-
- async fn sync_batches(&self) -> Result<()> {
- match self {
- QaClient::Anthropic(client) => client.sync_batches().await,
- QaClient::OpenAi(client) => client.sync_batches().await,
- }
- }
-}
-
/// Run the QA evaluation on a set of examples.
pub async fn run_qa(
examples: &mut [Example],
@@ -230,22 +154,7 @@ pub async fn run_qa(
output_path: Option<&PathBuf>,
) -> Result<()> {
let model = model_for_backend(args.backend);
- let client = match args.backend {
- BatchProvider::Anthropic => {
- if args.no_batch {
- QaClient::Anthropic(AnthropicClient::plain()?)
- } else {
- QaClient::Anthropic(AnthropicClient::batch(&LLM_CACHE_DB)?)
- }
- }
- BatchProvider::Openai => {
- if args.no_batch {
- QaClient::OpenAi(OpenAiClient::plain()?)
- } else {
- QaClient::OpenAi(OpenAiClient::batch(&LLM_CACHE_DB)?)
- }
- }
- };
+ let client = LlmClient::new(args.backend, !args.no_batch)?;
eprintln!(
"Using model: {}, backend: {:?}, batching: {}",
@@ -6,11 +6,9 @@
use crate::BatchProvider;
use crate::PredictionProvider;
-use crate::anthropic_client::AnthropicClient;
use crate::example::{Example, ExamplePrediction};
use crate::format_prompt::{TeacherPrompt, extract_cursor_excerpt_from_example};
-use crate::openai_client::OpenAiClient;
-use crate::paths::LLM_CACHE_DB;
+use crate::llm_client::{LlmClient, model_for_backend};
use crate::word_diff::unified_to_word_diff;
use anyhow::Result;
use std::io::{BufWriter, Write};
@@ -30,7 +28,7 @@ pub struct RepairArgs {
pub wait: bool,
/// Confidence threshold: repair predictions with confidence <= this value (1-5)
- #[clap(long, default_value = "2")]
+ #[clap(long, default_value = "3")]
pub confidence_threshold: u8,
/// Which LLM provider to use (anthropic or openai)
@@ -38,13 +36,6 @@ pub struct RepairArgs {
pub backend: BatchProvider,
}
-fn model_for_backend(backend: BatchProvider) -> &'static str {
- match backend {
- BatchProvider::Anthropic => "claude-sonnet-4-5",
- BatchProvider::Openai => "gpt-5.2",
- }
-}
-
/// Build the repair prompt for an example that needs improvement.
///
/// Returns None if the example doesn't have the required data (predictions, qa, prompt_inputs).
@@ -125,7 +116,10 @@ pub fn needs_repair(example: &Example, confidence_threshold: u8) -> bool {
}
/// Parse the repair response into a prediction.
-fn parse_repair_response(example: &Example, response_text: &str) -> Result<ExamplePrediction> {
+pub(crate) fn parse_repair_response(
+ example: &Example,
+ response_text: &str,
+) -> Result<ExamplePrediction> {
let actual_patch = TeacherPrompt::parse(example, response_text)?;
Ok(ExamplePrediction {
@@ -136,73 +130,6 @@ fn parse_repair_response(example: &Example, response_text: &str) -> Result<Examp
})
}
-enum RepairClient {
- Anthropic(AnthropicClient),
- OpenAi(OpenAiClient),
-}
-
-impl RepairClient {
- async fn generate(&self, model: &str, max_tokens: u64, prompt: &str) -> Result<Option<String>> {
- match self {
- RepairClient::Anthropic(client) => {
- let messages = vec![anthropic::Message {
- role: anthropic::Role::User,
- content: vec![anthropic::RequestContent::Text {
- text: prompt.to_string(),
- cache_control: None,
- }],
- }];
- let response = client.generate(model, max_tokens, messages, None).await?;
- Ok(response.map(|r| {
- r.content
- .iter()
- .filter_map(|c| match c {
- anthropic::ResponseContent::Text { text } => Some(text.as_str()),
- _ => None,
- })
- .collect::<Vec<_>>()
- .join("")
- }))
- }
- RepairClient::OpenAi(client) => {
- let messages = vec![open_ai::RequestMessage::User {
- content: open_ai::MessageContent::Plain(prompt.to_string()),
- }];
- let response = client.generate(model, max_tokens, messages, None).await?;
- Ok(response.map(|r| {
- r.choices
- .into_iter()
- .filter_map(|choice| match choice.message {
- open_ai::RequestMessage::Assistant { content, .. } => {
- content.map(|c| match c {
- open_ai::MessageContent::Plain(text) => text,
- open_ai::MessageContent::Multipart(parts) => parts
- .into_iter()
- .filter_map(|p| match p {
- open_ai::MessagePart::Text { text } => Some(text),
- _ => None,
- })
- .collect::<Vec<_>>()
- .join(""),
- })
- }
- _ => None,
- })
- .collect::<Vec<_>>()
- .join("")
- }))
- }
- }
- }
-
- async fn sync_batches(&self) -> Result<()> {
- match self {
- RepairClient::Anthropic(client) => client.sync_batches().await,
- RepairClient::OpenAi(client) => client.sync_batches().await,
- }
- }
-}
-
/// Run the repair process on a set of examples.
pub async fn run_repair(
examples: &mut [Example],
@@ -210,22 +137,7 @@ pub async fn run_repair(
output_path: Option<&PathBuf>,
) -> Result<()> {
let model = model_for_backend(args.backend);
- let client = match args.backend {
- BatchProvider::Anthropic => {
- if args.no_batch {
- RepairClient::Anthropic(AnthropicClient::plain()?)
- } else {
- RepairClient::Anthropic(AnthropicClient::batch(&LLM_CACHE_DB)?)
- }
- }
- BatchProvider::Openai => {
- if args.no_batch {
- RepairClient::OpenAi(OpenAiClient::plain()?)
- } else {
- RepairClient::OpenAi(OpenAiClient::batch(&LLM_CACHE_DB)?)
- }
- }
- };
+ let client = LlmClient::new(args.backend, !args.no_batch)?;
eprintln!(
"Using model: {}, backend: {:?}, batching: {}, confidence_threshold: {}",