From 7b082cbb6f9f24ff1c4a6d2d7b998c08c7576929 Mon Sep 17 00:00:00 2001
From: Guilherme do Amaral Alves
Date: Wed, 22 Apr 2026 07:40:37 -0300
Subject: [PATCH] Add interleaved_reasoning option to openai compatible models
 (#54016)

Release Notes:

- Added an `interleaved_reasoning` option to OpenAI-compatible models

---

This PR adds an `interleaved_reasoning` option for OpenAI-compatible models, addressing the issue described in https://github.com/ggml-org/llama.cpp/issues/20837.

In my testing, enabling `interleaved_reasoning` not only resolved the tool-calling issues that Qwen3.5 models hit in llama.cpp, but also appeared to improve the model's coding ability. I have verified the outgoing requests with a proxy to confirm the parameter is sent correctly. This change is likely to benefit other models and providers as well.

Note: While I used AI to assist with the implementation, I have reviewed and tested the changes. As I am relatively new to Rust and the Zed codebase, I would appreciate any feedback or suggestions for improvement, and I am happy to make further adjustments if needed. Thank you all for building such an amazing editor!
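With the option enabled, thinking text is moved out of the message body and sent as a separate field on the assistant turn. For reference, this is the wire shape exercised by the round-trip test added in this PR (the values are the test's illustrative ones):

```json
{
  "role": "assistant",
  "content": "Searching now.",
  "tool_calls": [
    {
      "id": "call-1",
      "type": "function",
      "function": { "name": "search", "arguments": "{\"query\":\"foo\"}" }
    }
  ],
  "reasoning_content": "I should search"
}
```

With the option disabled, the same thinking text continues to be serialized inline as a `text` part of the assistant message content, matching the previous behavior.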
Co-authored-by: Oleksiy Syvokon
---
 .../add_llm_provider_modal.rs                 |   2 +
 .../edit_prediction_cli/src/openai_client.rs  |   1 +
 crates/edit_prediction_cli/src/repair.rs      |   1 +
 .../language_models/src/provider/open_ai.rs   |   1 +
 .../src/provider/open_ai_compatible.rs        |   1 +
 .../language_models/src/provider/opencode.rs  |   1 +
 crates/language_models/src/provider/vercel.rs |   1 +
 .../src/provider/vercel_ai_gateway.rs         |   2 +
 crates/language_models/src/provider/x_ai.rs   |   1 +
 .../src/language_models_cloud.rs              |   1 +
 crates/open_ai/src/completion.rs              | 109 ++++++++++++++++++
 crates/open_ai/src/open_ai.rs                 |   2 +
 crates/settings_content/src/language_model.rs |   3 +
 docs/src/ai/llm-providers.md                  |   1 +
 14 files changed, 127 insertions(+)

diff --git a/crates/agent_ui/src/agent_configuration/add_llm_provider_modal.rs b/crates/agent_ui/src/agent_configuration/add_llm_provider_modal.rs
index e0df79ba4dfe226652818b120b7bfcc493c73b1e..1cff19c7cf4b3e19b96f0f3f7cf47a9892dfe5cc 100644
--- a/crates/agent_ui/src/agent_configuration/add_llm_provider_modal.rs
+++ b/crates/agent_ui/src/agent_configuration/add_llm_provider_modal.rs
@@ -157,6 +157,7 @@ impl ModelInput {
             parallel_tool_calls,
             prompt_cache_key,
             chat_completions,
+            ..
         } = ModelCapabilities::default();

         Self {
@@ -209,6 +210,7 @@ impl ModelInput {
                 parallel_tool_calls: self.capabilities.supports_parallel_tool_calls.selected(),
                 prompt_cache_key: self.capabilities.supports_prompt_cache_key.selected(),
                 chat_completions: self.capabilities.supports_chat_completions.selected(),
+                interleaved_reasoning: false,
             },
         })
     }
diff --git a/crates/edit_prediction_cli/src/openai_client.rs b/crates/edit_prediction_cli/src/openai_client.rs
index e35848aa1ccbd46d29f88a6c9a0ccfd35309114a..205b339226f34d0d260c3766f2c9b310f1d40cff 100644
--- a/crates/edit_prediction_cli/src/openai_client.rs
+++ b/crates/edit_prediction_cli/src/openai_client.rs
@@ -485,6 +485,7 @@ impl BatchingOpenAiClient {
                 "assistant" => RequestMessage::Assistant {
                     content: Some(MessageContent::Plain(msg.content)),
                     tool_calls: Vec::new(),
+                    reasoning_content: None,
                 },
                 "system" => RequestMessage::System {
                     content: MessageContent::Plain(msg.content),
diff --git a/crates/edit_prediction_cli/src/repair.rs b/crates/edit_prediction_cli/src/repair.rs
index d4d202708abc3234abc3bdb6b6ce3a506a06489d..2ae62fd70f89bae02778edf040a29edf6b68ab12 100644
--- a/crates/edit_prediction_cli/src/repair.rs
+++ b/crates/edit_prediction_cli/src/repair.rs
@@ -388,6 +388,7 @@ pub async fn run_repair(
         open_ai::RequestMessage::Assistant {
             content: Some(open_ai::MessageContent::Plain(teacher_response.clone())),
             tool_calls: vec![],
+            reasoning_content: None,
         },
         // Turn 3: Repair critique and instructions
         open_ai::RequestMessage::User {
diff --git a/crates/language_models/src/provider/open_ai.rs b/crates/language_models/src/provider/open_ai.rs
index 358a0ec5a6d517064be93d973f08eceb894ab665..da341211855ca07bef526740a5a39260a4403982 100644
--- a/crates/language_models/src/provider/open_ai.rs
+++ b/crates/language_models/src/provider/open_ai.rs
@@ -401,6 +401,7 @@ impl LanguageModel for OpenAiLanguageModel {
             self.model.supports_prompt_cache_key(),
             self.max_output_tokens(),
             self.model.reasoning_effort(),
+            false,
         );
         let completions = self.stream_completion(request, cx);
         async move {
diff --git a/crates/language_models/src/provider/open_ai_compatible.rs b/crates/language_models/src/provider/open_ai_compatible.rs
index 7a3126f8f33beb7851ea914cfe063b76f8b4443f..b82e3b1ae3aa80373dbad3550e7bd896b8879f2b 100644
--- a/crates/language_models/src/provider/open_ai_compatible.rs
+++ b/crates/language_models/src/provider/open_ai_compatible.rs
@@ -403,6 +403,7 @@ impl LanguageModel for OpenAiCompatibleLanguageModel {
             self.model.capabilities.prompt_cache_key,
             self.max_output_tokens(),
             self.model.reasoning_effort,
+            self.model.capabilities.interleaved_reasoning,
         );
         let completions = self.stream_completion(request, cx);
         async move {
diff --git a/crates/language_models/src/provider/opencode.rs b/crates/language_models/src/provider/opencode.rs
index aae3a552544ebf2cc59255da954d84cf7b78c7da..4754741715f39104f392b6871f68a1c04e3bdfce 100644
--- a/crates/language_models/src/provider/opencode.rs
+++ b/crates/language_models/src/provider/opencode.rs
@@ -490,6 +490,7 @@ impl LanguageModel for OpenCodeLanguageModel {
             false,
             self.model.max_output_tokens(),
             None,
+            false,
         );
         let stream = self.stream_openai_chat(openai_request, cx);
         async move {
diff --git a/crates/language_models/src/provider/vercel.rs b/crates/language_models/src/provider/vercel.rs
index cedbc9c3cb988375b90864ceb23a3b14fc50abdd..ce9870073ee9f399e9f02a5a093931c1a4304fdb 100644
--- a/crates/language_models/src/provider/vercel.rs
+++ b/crates/language_models/src/provider/vercel.rs
@@ -324,6 +324,7 @@ impl LanguageModel for VercelLanguageModel {
             self.model.supports_prompt_cache_key(),
             self.max_output_tokens(),
             None,
+            false,
         );
         let completions = self.stream_completion(request, cx);
         async move {
diff --git a/crates/language_models/src/provider/vercel_ai_gateway.rs b/crates/language_models/src/provider/vercel_ai_gateway.rs
index 66767edd809531b4b020263654922d742a1a04be..cf379e6edc1db181127cc284834b19c61143d692 100644
--- a/crates/language_models/src/provider/vercel_ai_gateway.rs
+++ b/crates/language_models/src/provider/vercel_ai_gateway.rs
@@ -461,6 +461,7 @@ impl LanguageModel for VercelAiGatewayLanguageModel {
             self.model.capabilities.prompt_cache_key,
             self.max_output_tokens(),
             None,
+            false,
         );
         let completions = self.stream_open_ai(request, cx);
         async move {
@@ -591,6 +592,7 @@ async fn list_models(
                 parallel_tool_calls,
                 prompt_cache_key,
                 chat_completions: true,
+                interleaved_reasoning: false,
             },
         });
     }
diff --git a/crates/language_models/src/provider/x_ai.rs b/crates/language_models/src/provider/x_ai.rs
index e95bc1ba72fabcf9632b2ed2efd94254fb1313cd..b14cb82bf83ac773a58104207d1594b36994a91c 100644
--- a/crates/language_models/src/provider/x_ai.rs
+++ b/crates/language_models/src/provider/x_ai.rs
@@ -347,6 +347,7 @@ impl LanguageModel for XAiLanguageModel {
             self.model.supports_prompt_cache_key(),
             self.max_output_tokens(),
             None,
+            false,
         );
         let completions = self.stream_completion(request, cx);
         async move {
diff --git a/crates/language_models_cloud/src/language_models_cloud.rs b/crates/language_models_cloud/src/language_models_cloud.rs
index 1300fd42e60f0be5c135a94dde3b0503d77eb7a9..4e444def7b4df6295a5f12ebccb08802abdfca4d 100644
--- a/crates/language_models_cloud/src/language_models_cloud.rs
+++ b/crates/language_models_cloud/src/language_models_cloud.rs
@@ -600,6 +600,7 @@ impl LanguageModel for CloudLanguageModel
diff --git a/crates/open_ai/src/completion.rs b/crates/open_ai/src/completion.rs
--- a/crates/open_ai/src/completion.rs
+++ b/crates/open_ai/src/completion.rs
     reasoning_effort: Option<ReasoningEffort>,
+    interleaved_reasoning: bool,
 ) -> crate::Request {
     let stream = !model_id.starts_with("o1-");
     let mut messages = Vec::new();
+    let mut current_reasoning: Option<String> = None;
     for message in request.messages {
         for content in message.content {
             match content {
+                MessageContent::Thinking { text, .. } if interleaved_reasoning => {
+                    current_reasoning.get_or_insert_default().push_str(&text);
+                }
                 MessageContent::Text(text) | MessageContent::Thinking { text, .. } => {
                     let should_add = if message.role == Role::User {
                         // Including whitespace-only user messages can cause error with OpenAI compatible APIs
@@ -50,6 +55,15 @@ pub fn into_open_ai(
                             message.role,
                             &mut messages,
                         );
+                        if let Some(reasoning) = current_reasoning.take() {
+                            if let Some(crate::RequestMessage::Assistant {
+                                reasoning_content,
+                                ..
+                            }) = messages.last_mut()
+                            {
+                                *reasoning_content = Some(reasoning);
+                            }
+                        }
                     }
                 }
                 MessageContent::RedactedThinking(_) => {}
@@ -85,6 +99,7 @@ pub fn into_open_ai(
                     messages.push(crate::RequestMessage::Assistant {
                         content: None,
                         tool_calls: vec![tool_call],
+                        reasoning_content: current_reasoning.take(),
                     });
                 }
             }
@@ -362,6 +377,7 @@ fn add_message_content_part(
         Role::Assistant => crate::RequestMessage::Assistant {
             content: Some(crate::MessageContent::from(vec![new_part])),
             tool_calls: Vec::new(),
+            reasoning_content: None,
         },
         Role::System => crate::RequestMessage::System {
             content: crate::MessageContent::from(vec![new_part]),
@@ -1690,4 +1706,97 @@ mod tests {
             "OutputItemDone reasoning should not produce Thinking events"
         );
     }
+
+    #[test]
+    fn into_open_ai_interleaved_reasoning() {
+        let tool_use_id = LanguageModelToolUseId::from("call-1");
+        let tool_input = json!({"query": "foo"});
+        let tool_arguments = serde_json::to_string(&tool_input).unwrap();
+        let tool_use = LanguageModelToolUse {
+            id: tool_use_id.clone(),
+            name: Arc::from("search"),
+            raw_input: tool_arguments.clone(),
+            input: tool_input,
+            is_input_complete: true,
+            thought_signature: None,
+        };
+        let tool_result = LanguageModelToolResult {
+            tool_use_id: tool_use_id,
+            tool_name: Arc::from("search"),
+            is_error: false,
+            content: LanguageModelToolResultContent::Text(Arc::from("result")),
+            output: None,
+        };
+        let request = LanguageModelRequest {
+            thread_id: None,
+            prompt_id: None,
+            intent: None,
+            messages: vec![
+                LanguageModelRequestMessage {
+                    role: Role::User,
+                    content: vec![MessageContent::Text("search for something".into())],
+                    cache: false,
+                    reasoning_details: None,
+                },
+                LanguageModelRequestMessage {
+                    role: Role::Assistant,
+                    content: vec![
+                        MessageContent::Thinking {
+                            text: "I should search".into(),
+                            signature: None,
+                        },
+                        MessageContent::Text("Searching now.".into()),
+                        MessageContent::ToolUse(tool_use),
+                    ],
+                    cache: false,
+                    reasoning_details: None,
+                },
+                LanguageModelRequestMessage {
+                    role: Role::Assistant,
+                    content: vec![MessageContent::ToolResult(tool_result)],
+                    cache: false,
+                    reasoning_details: None,
+                },
+            ],
+            tools: vec![],
+            tool_choice: None,
+            stop: vec![],
+            temperature: None,
+            thinking_allowed: true,
+            thinking_effort: None,
+            speed: None,
+        };
+
+        let result = into_open_ai(request.clone(), "model", false, false, None, None, true);
+        assert_eq!(
+            serde_json::to_value(&result).unwrap()["messages"],
+            json!([
+                {"role": "user", "content": "search for something"},
+                {
+                    "role": "assistant",
+                    "content": "Searching now.",
+                    "tool_calls": [{"id": "call-1", "type": "function", "function": {"name": "search", "arguments": tool_arguments}}],
+                    "reasoning_content": "I should search"
+                },
+                {"role": "tool", "content": "result", "tool_call_id": "call-1"}
+            ])
+        );
+
+        let result = into_open_ai(request, "model", false, false, None, None, false);
+        assert_eq!(
+            serde_json::to_value(&result).unwrap()["messages"],
+            json!([
+                {"role": "user", "content": "search for something"},
+                {
+                    "role": "assistant",
+                    "content": [
+                        {"type": "text", "text": "I should search"},
+                        {"type": "text", "text": "Searching now."}
+                    ],
+                    "tool_calls": [{"id": "call-1", "type": "function", "function": {"name": "search", "arguments": tool_arguments}}]
+                },
+                {"role": "tool", "content": "result", "tool_call_id": "call-1"}
+            ])
+        );
+    }
 }
diff --git a/crates/open_ai/src/open_ai.rs b/crates/open_ai/src/open_ai.rs
index 256b78f8a2ec921e842a846cbee75a4147745e00..0109efbe293ee0162d9c01757801e9897da8a2ba 100644
--- a/crates/open_ai/src/open_ai.rs
+++ b/crates/open_ai/src/open_ai.rs
@@ -366,6 +366,8 @@ pub enum RequestMessage {
         content: Option<MessageContent>,
         #[serde(default, skip_serializing_if = "Vec::is_empty")]
         tool_calls: Vec<ToolCall>,
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        reasoning_content: Option<String>,
     },
     User {
         content: MessageContent,
diff --git a/crates/settings_content/src/language_model.rs b/crates/settings_content/src/language_model.rs
index 17beef9df25f7662caedd1380e867ad4aefbb7cc..635b58f988d6adb0baf1ca55bc49464cabff3cf0 100644
--- a/crates/settings_content/src/language_model.rs
+++ b/crates/settings_content/src/language_model.rs
@@ -289,6 +289,8 @@ pub struct OpenAiCompatibleModelCapabilities {
     pub prompt_cache_key: bool,
     #[serde(default = "default_true")]
     pub chat_completions: bool,
+    #[serde(default)]
+    pub interleaved_reasoning: bool,
 }

 impl Default for OpenAiCompatibleModelCapabilities {
@@ -299,6 +301,7 @@ impl Default for OpenAiCompatibleModelCapabilities {
             parallel_tool_calls: false,
             prompt_cache_key: false,
             chat_completions: default_true(),
+            interleaved_reasoning: false,
         }
     }
 }
diff --git a/docs/src/ai/llm-providers.md b/docs/src/ai/llm-providers.md
index 8656406d23d5d36b678c8df5dcbe56f4abe23164..92c490a05d2031352516a3a8e781421f585802bd 100644
--- a/docs/src/ai/llm-providers.md
+++ b/docs/src/ai/llm-providers.md
@@ -607,6 +607,7 @@ By default, OpenAI-compatible models inherit the following capabilities:
 - `parallel_tool_calls`: false (does not support `parallel_tool_calls` parameter)
 - `prompt_cache_key`: false (does not support `prompt_cache_key` parameter)
 - `chat_completions`: true (calls the `/chat/completions` endpoint)
+- `interleaved_reasoning`: false (thinking tokens are sent inline in message text; set to true to send them as a dedicated `reasoning_content` field for models that expect it)

 If a provider exposes models that only work with the Responses API, set `chat_completions` to `false` for those entries. Zed uses the Responses endpoint for these models.
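As a rough sketch of how a model would opt in (the provider name, URL, and model details below are placeholders; the surrounding structure follows the OpenAI-compatible provider examples earlier in that document):

```json
{
  "language_models": {
    "openai_compatible": {
      "llama.cpp": {
        "api_url": "http://localhost:8080/v1",
        "available_models": [
          {
            "name": "qwen3.5",
            "display_name": "Qwen3.5",
            "max_tokens": 32768,
            "capabilities": {
              "tools": true,
              "interleaved_reasoning": true
            }
          }
        ]
      }
    }
  }
}
```

Models that emit reasoning but do not expect it back as `reasoning_content` can leave the capability at its default of `false`.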