From 80b9c3a908dbc7c85812be4c84cdc153538f5e05 Mon Sep 17 00:00:00 2001
From: Oleksiy Syvokon
Date: Fri, 30 Jan 2026 13:48:27 +0200
Subject: [PATCH] ep: Add --cache-only option to avoid sending requests (#48011)

Release Notes:

- N/A
---
 .../edit_prediction_cli/src/anthropic_client.rs |  8 ++++++--
 crates/edit_prediction_cli/src/main.rs          |  3 +++
 crates/edit_prediction_cli/src/openai_client.rs |  8 ++++++--
 crates/edit_prediction_cli/src/predict.rs       | 15 ++++++++++-----
 crates/edit_prediction_cli/src/qa.rs            |  8 ++++++--
 crates/edit_prediction_cli/src/repair.rs        |  8 ++++++--
 6 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/crates/edit_prediction_cli/src/anthropic_client.rs b/crates/edit_prediction_cli/src/anthropic_client.rs
index 242a38dd6104c1c173d5bd978fc8b41d69f1edff..653fddd58b3567697ca5c9024ea4944e3005016f 100644
--- a/crates/edit_prediction_cli/src/anthropic_client.rs
+++ b/crates/edit_prediction_cli/src/anthropic_client.rs
@@ -267,13 +267,16 @@ impl BatchingLlmClient {
         max_tokens: u64,
         messages: Vec<anthropic::Message>,
         seed: Option<usize>,
+        cache_only: bool,
     ) -> Result<Option<anthropic::Response>> {
         let response = self.lookup(model, max_tokens, &messages, seed)?;
         if let Some(response) = response {
             return Ok(Some(response));
         }
 
-        self.mark_for_batch(model, max_tokens, &messages, seed)?;
+        if !cache_only {
+            self.mark_for_batch(model, max_tokens, &messages, seed)?;
+        }
 
         Ok(None)
     }
@@ -672,6 +675,7 @@ impl AnthropicClient {
         max_tokens: u64,
         messages: Vec<anthropic::Message>,
         seed: Option<usize>,
+        cache_only: bool,
     ) -> Result<Option<anthropic::Response>> {
         match self {
             AnthropicClient::Plain(plain_llm_client) => plain_llm_client
                 .generate(model, max_tokens, messages, seed)
                 .map(Some),
             AnthropicClient::Batch(batching_llm_client) => {
                 batching_llm_client
-                    .generate(model, max_tokens, messages, seed)
+                    .generate(model, max_tokens, messages, seed, cache_only)
                     .await
             }
             AnthropicClient::Dummy => panic!("Dummy LLM client is not expected to be used"),
diff --git a/crates/edit_prediction_cli/src/main.rs b/crates/edit_prediction_cli/src/main.rs
index ccb859d30f517b80cd6542b98b3557513b2b504b..2f229b3c392b5dc9114819637e0f3770b33db0ae 100644
--- a/crates/edit_prediction_cli/src/main.rs
+++ b/crates/edit_prediction_cli/src/main.rs
@@ -257,6 +257,9 @@ struct PredictArgs {
     provider: Option<PredictionProvider>,
     #[clap(long, default_value_t = 1)]
     repetitions: usize,
+    /// Only use cached responses, don't queue new requests for batching
+    #[clap(long)]
+    cache_only: bool,
 }
 
 #[derive(Debug, Args, Clone)]
diff --git a/crates/edit_prediction_cli/src/openai_client.rs b/crates/edit_prediction_cli/src/openai_client.rs
index 15d31c997183aa7a5dc12669a7016be8eb9a6ad4..c9947e16099c7923e6c948045eda8ca08ff625cf 100644
--- a/crates/edit_prediction_cli/src/openai_client.rs
+++ b/crates/edit_prediction_cli/src/openai_client.rs
@@ -194,13 +194,16 @@ impl BatchingOpenAiClient {
         max_tokens: u64,
         messages: Vec<open_ai::RequestMessage>,
         seed: Option<usize>,
+        cache_only: bool,
     ) -> Result<Option<open_ai::Response>> {
         let response = self.lookup(model, max_tokens, &messages, seed)?;
         if let Some(response) = response {
             return Ok(Some(response));
         }
 
-        self.mark_for_batch(model, max_tokens, &messages, seed)?;
+        if !cache_only {
+            self.mark_for_batch(model, max_tokens, &messages, seed)?;
+        }
 
         Ok(None)
     }
@@ -643,6 +646,7 @@ impl OpenAiClient {
         max_tokens: u64,
         messages: Vec<open_ai::RequestMessage>,
         seed: Option<usize>,
+        cache_only: bool,
     ) -> Result<Option<open_ai::Response>> {
         match self {
             OpenAiClient::Plain(plain_client) => plain_client
                 .generate(model, max_tokens, messages, seed)
                 .map(Some),
             OpenAiClient::Batch(batching_client) => {
                 batching_client
-                    .generate(model, max_tokens, messages, seed)
+                    .generate(model, max_tokens, messages, seed, cache_only)
                     .await
             }
             OpenAiClient::Dummy => panic!("Dummy OpenAI client is not expected to be used"),
diff --git a/crates/edit_prediction_cli/src/predict.rs b/crates/edit_prediction_cli/src/predict.rs
index 19c2591b4fe3a1fdede82269da37af170ea4d2d7..192f8dc8f2dea95b756c7d699b91c388cced3cdf 100644
--- a/crates/edit_prediction_cli/src/predict.rs
+++ b/crates/edit_prediction_cli/src/predict.rs
@@ -69,7 +69,7 @@ pub async fn run_prediction(
         .await?;
 
         let batched = matches!(provider, PredictionProvider::Teacher(..));
-        return predict_teacher(example, backend, batched, repetition_count).await;
+        return predict_teacher(example, backend, batched, repetition_count, args.cache_only).await;
     }
 
     run_load_project(example, app_state.clone(), example_progress, cx.clone()).await?;
@@ -262,12 +262,15 @@ async fn predict_teacher(
     backend: TeacherBackend,
     batched: bool,
     repetition_count: usize,
+    cache_only: bool,
 ) -> anyhow::Result<()> {
     match backend {
         TeacherBackend::Sonnet45 => {
-            predict_anthropic(example, backend, batched, repetition_count).await
+            predict_anthropic(example, backend, batched, repetition_count, cache_only).await
+        }
+        TeacherBackend::Gpt52 => {
+            predict_openai(example, backend, batched, repetition_count, cache_only).await
         }
-        TeacherBackend::Gpt52 => predict_openai(example, backend, batched, repetition_count).await,
     }
 }
@@ -276,6 +279,7 @@ async fn predict_anthropic(
     backend: TeacherBackend,
     batched: bool,
     repetition_count: usize,
+    cache_only: bool,
 ) -> anyhow::Result<()> {
     let llm_model_name = backend.model_name();
     let max_tokens = 16384;
@@ -301,7 +305,7 @@ async fn predict_anthropic(
         let seed = if repetition_count > 1 { Some(ix) } else { None };
 
         let Some(response) = llm_client
-            .generate(llm_model_name, max_tokens, messages, seed)
+            .generate(llm_model_name, max_tokens, messages, seed, cache_only)
             .await?
         else {
             // Request stashed for batched processing
@@ -341,6 +345,7 @@ async fn predict_openai(
     backend: TeacherBackend,
     batched: bool,
     repetition_count: usize,
+    cache_only: bool,
 ) -> anyhow::Result<()> {
     let llm_model_name = backend.model_name();
     let max_tokens = 16384;
@@ -362,7 +367,7 @@ async fn predict_openai(
         let seed = if repetition_count > 1 { Some(ix) } else { None };
 
         let Some(response) = llm_client
-            .generate(llm_model_name, max_tokens, messages, seed)
+            .generate(llm_model_name, max_tokens, messages, seed, cache_only)
             .await?
         else {
             // Request stashed for batched processing
diff --git a/crates/edit_prediction_cli/src/qa.rs b/crates/edit_prediction_cli/src/qa.rs
index 28a592c2b875303d59087e3fe5e0e7d176ee74c2..e9c2e44549b67940a1dcfbb5529c123b4287f7e2 100644
--- a/crates/edit_prediction_cli/src/qa.rs
+++ b/crates/edit_prediction_cli/src/qa.rs
@@ -172,7 +172,9 @@ impl QaClient {
                 cache_control: None,
             }],
         }];
-        let response = client.generate(model, max_tokens, messages, None).await?;
+        let response = client
+            .generate(model, max_tokens, messages, None, false)
+            .await?;
         Ok(response.map(|r| {
             r.content
                 .iter()
@@ -188,7 +190,9 @@ impl QaClient {
         let messages = vec![open_ai::RequestMessage::User {
             content: open_ai::MessageContent::Plain(prompt.to_string()),
         }];
-        let response = client.generate(model, max_tokens, messages, None).await?;
+        let response = client
+            .generate(model, max_tokens, messages, None, false)
+            .await?;
         Ok(response.map(|r| {
             r.choices
                 .into_iter()
diff --git a/crates/edit_prediction_cli/src/repair.rs b/crates/edit_prediction_cli/src/repair.rs
index e78420d2d5f4bface31d1bd85e21165b38226f76..e3cf424244dfcc2e407464a4048a6f7500813a1c 100644
--- a/crates/edit_prediction_cli/src/repair.rs
+++ b/crates/edit_prediction_cli/src/repair.rs
@@ -152,7 +152,9 @@ impl RepairClient {
                 cache_control: None,
             }],
         }];
-        let response = client.generate(model, max_tokens, messages, None).await?;
+        let response = client
+            .generate(model, max_tokens, messages, None, false)
+            .await?;
         Ok(response.map(|r| {
             r.content
                 .iter()
@@ -168,7 +170,9 @@ impl RepairClient {
         let messages = vec![open_ai::RequestMessage::User {
             content: open_ai::MessageContent::Plain(prompt.to_string()),
         }];
-        let response = client.generate(model, max_tokens, messages, None).await?;
+        let response = client
+            .generate(model, max_tokens, messages, None, false)
+            .await?;
        Ok(response.map(|r| {
            r.choices
                .into_iter()
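
For reviewers: both clients gate the same lookup-or-queue pattern -- return a
cached response on a hit; on a miss, queue the request for a later batch
unless cache_only is set, in which case the miss is reported without queuing
any new work. A minimal, self-contained sketch of that control flow (the
Client, cache, and batch_queue here are hypothetical stand-ins, not the real
client internals):

    use std::collections::HashMap;

    struct Client {
        cache: HashMap<String, String>, // request key -> cached response
        batch_queue: Vec<String>,       // requests waiting to be sent in a batch
    }

    impl Client {
        // cache_only == false: return a hit, or queue the request and report
        // a miss (None) so it can be fetched in the next batch run.
        // cache_only == true: never queue; a miss stays a miss.
        fn generate(&mut self, key: &str, cache_only: bool) -> Option<String> {
            if let Some(response) = self.cache.get(key) {
                return Some(response.clone());
            }
            if !cache_only {
                self.batch_queue.push(key.to_string());
            }
            None
        }
    }

    fn main() {
        let mut client = Client { cache: HashMap::new(), batch_queue: Vec::new() };
        assert_eq!(client.generate("prompt-1", true), None);  // cache-only: miss, nothing queued
        assert!(client.batch_queue.is_empty());
        assert_eq!(client.generate("prompt-1", false), None); // normal: miss, queued for batching
        assert_eq!(client.batch_queue.len(), 1);
    }

So a --cache-only run over the same examples consumes only responses fetched
by earlier batch runs and never enqueues new API work. Note the flag is wired
through PredictArgs into the predict path only; the qa.rs and repair.rs call
sites pass false and keep their existing queueing behavior.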