From 3251f69657bbc637ad3656bde9c448cc97cc5077 Mon Sep 17 00:00:00 2001 From: Oleksiy Syvokon Date: Wed, 4 Feb 2026 18:07:23 +0200 Subject: [PATCH] ep: Lower max_output_tokens default for Ollama (#48370) Zeta 1 should not be configurable, at least not by this setting. Release Notes: - N/A --------- Co-authored-by: Ben Kunkle --- assets/settings/default.json | 2 +- crates/edit_prediction/src/ollama.rs | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/assets/settings/default.json b/assets/settings/default.json index da0bd07fc2166c83a401b3a7558c8372b7158993..332c16afe23ffae63f2a014956947b4c10f42738 100644 --- a/assets/settings/default.json +++ b/assets/settings/default.json @@ -1503,7 +1503,7 @@ "ollama": { "api_url": "http://localhost:11434", "model": "qwen2.5-coder:7b-base", - "max_output_tokens": 256, + "max_output_tokens": 64, }, // Whether edit predictions are enabled when editing text threads in the agent panel. // This setting has no effect if globally disabled. diff --git a/crates/edit_prediction/src/ollama.rs b/crates/edit_prediction/src/ollama.rs index 91d3b542b1dfc717ba9435ff035359a3c784dced..a79b61559cbcd7a74ae7619ee54b115eb576a637 100644 --- a/crates/edit_prediction/src/ollama.rs +++ b/crates/edit_prediction/src/ollama.rs @@ -110,7 +110,6 @@ impl Ollama { let Some(model) = settings.model.clone() else { return Task::ready(Ok(None)); }; - let max_output_tokens = settings.max_output_tokens; let api_url = settings.api_url.clone(); log::debug!("Ollama: Requesting completion (model: {})", model); @@ -127,7 +126,18 @@ impl Ollama { let is_zeta = is_zeta_model(&model); + // Zeta generates more tokens than FIM models. Ideally, we'd use MAX_REWRITE_TOKENS, + // but this might be too slow for local deployments. So we make it configurable, + // but we also have this hardcoded multiplier for now.
+ let max_output_tokens = if is_zeta { + settings.max_output_tokens * 4 + } else { + settings.max_output_tokens + }; + let result = cx.background_spawn(async move { + let zeta_editable_region_tokens = max_output_tokens as usize; + // For zeta models, use the dedicated zeta1 functions which handle their own // range computation with the correct token limits. let (prompt, stop_tokens, editable_range_override, inputs) = if is_zeta { @@ -136,7 +146,7 @@ impl Ollama { cursor_point, &path_str, &snapshot, - max_output_tokens as usize, + zeta_editable_region_tokens, ZETA_MAX_CONTEXT_TOKENS, ); let input_events = zeta1::prompt_for_events(&events, ZETA_MAX_EVENT_TOKENS);