agent: Disable thinking when using inline assistant/edit file tool (#34141)

Bennet Bo Fenner created

This introduces a new field `thinking_allowed` on `LanguageModelRequest`
which lets us control whether thinking should be enabled if the model
supports it.
We disable thinking in the Inline Assistant, Edit File tool and the Git
Commit message generator, which should make generation faster when using
a thinking model, e.g. `claude-sonnet-4-thinking`.

Release Notes:

- N/A

Change summary

crates/agent/src/thread.rs                         | 2 ++
crates/agent_ui/src/active_thread.rs               | 1 +
crates/agent_ui/src/buffer_codegen.rs              | 1 +
crates/agent_ui/src/message_editor.rs              | 1 +
crates/agent_ui/src/terminal_inline_assistant.rs   | 1 +
crates/assistant_context/src/assistant_context.rs  | 1 +
crates/assistant_tools/src/edit_agent.rs           | 1 +
crates/assistant_tools/src/edit_agent/evals.rs     | 2 ++
crates/eval/src/instance.rs                        | 1 +
crates/git_ui/src/git_panel.rs                     | 1 +
crates/language_model/src/request.rs               | 1 +
crates/language_models/src/provider/anthropic.rs   | 5 ++++-
crates/language_models/src/provider/bedrock.rs     | 4 +++-
crates/language_models/src/provider/cloud.rs       | 3 ++-
crates/language_models/src/provider/google.rs      | 6 +++---
crates/language_models/src/provider/mistral.rs     | 2 ++
crates/language_models/src/provider/ollama.rs      | 5 ++++-
crates/language_models/src/provider/open_ai.rs     | 1 +
crates/language_models/src/provider/open_router.rs | 4 +++-
crates/rules_library/src/rules_library.rs          | 1 +
crates/semantic_index/src/summary_index.rs         | 1 +
21 files changed, 37 insertions(+), 8 deletions(-)

Detailed changes

crates/agent/src/thread.rs 🔗

@@ -1284,6 +1284,7 @@ impl Thread {
             tool_choice: None,
             stop: Vec::new(),
             temperature: AgentSettings::temperature_for_model(&model, cx),
+            thinking_allowed: true,
         };
 
         let available_tools = self.available_tools(cx, model.clone());
@@ -1449,6 +1450,7 @@ impl Thread {
             tool_choice: None,
             stop: Vec::new(),
             temperature: AgentSettings::temperature_for_model(model, cx),
+            thinking_allowed: false,
         };
 
         for message in &self.messages {

crates/agent_ui/src/active_thread.rs 🔗

@@ -1461,6 +1461,7 @@ impl ActiveThread {
                             &configured_model.model,
                             cx,
                         ),
+                        thinking_allowed: true,
                     };
 
                     Some(configured_model.model.count_tokens(request, cx))

crates/agent_ui/src/message_editor.rs 🔗

@@ -1454,6 +1454,7 @@ impl MessageEditor {
                         tool_choice: None,
                         stop: vec![],
                         temperature: AgentSettings::temperature_for_model(&model.model, cx),
+                        thinking_allowed: true,
                     };
 
                     Some(model.model.count_tokens(request, cx))

crates/assistant_context/src/assistant_context.rs 🔗

@@ -2293,6 +2293,7 @@ impl AssistantContext {
             tool_choice: None,
             stop: Vec::new(),
             temperature: model.and_then(|model| AgentSettings::temperature_for_model(model, cx)),
+            thinking_allowed: true,
         };
         for message in self.messages(cx) {
             if message.status != MessageStatus::Done {

crates/assistant_tools/src/edit_agent.rs 🔗

@@ -719,6 +719,7 @@ impl EditAgent {
             tools,
             stop: Vec::new(),
             temperature: None,
+            thinking_allowed: false,
         };
 
         Ok(self.model.stream_completion_text(request, cx).await?.stream)

crates/assistant_tools/src/edit_agent/evals.rs 🔗

@@ -1263,6 +1263,7 @@ impl EvalAssertion {
                     content: vec![prompt.into()],
                     cache: false,
                 }],
+                thinking_allowed: true,
                 ..Default::default()
             };
             let mut response = retry_on_rate_limit(async || {
@@ -1599,6 +1600,7 @@ impl EditAgentTest {
         let conversation = LanguageModelRequest {
             messages,
             tools,
+            thinking_allowed: true,
             ..Default::default()
         };
 

crates/eval/src/instance.rs 🔗

@@ -594,6 +594,7 @@ impl ExampleInstance {
                 tools: Vec::new(),
                 tool_choice: None,
                 stop: Vec::new(),
+                thinking_allowed: true,
             };
 
             let model = model.clone();

crates/git_ui/src/git_panel.rs 🔗

@@ -1830,6 +1830,7 @@ impl GitPanel {
                     tool_choice: None,
                     stop: Vec::new(),
                     temperature,
+                    thinking_allowed: false,
                 };
 
                 let stream = model.stream_completion_text(request, &cx);

crates/language_model/src/request.rs 🔗

@@ -391,6 +391,7 @@ pub struct LanguageModelRequest {
     pub tool_choice: Option<LanguageModelToolChoice>,
     pub stop: Vec<String>,
     pub temperature: Option<f32>,
+    pub thinking_allowed: bool,
 }
 
 #[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]

crates/language_models/src/provider/anthropic.rs 🔗

@@ -663,7 +663,9 @@ pub fn into_anthropic(
         } else {
             Some(anthropic::StringOrContents::String(system_message))
         },
-        thinking: if let AnthropicModelMode::Thinking { budget_tokens } = mode {
+        thinking: if request.thinking_allowed
+            && let AnthropicModelMode::Thinking { budget_tokens } = mode
+        {
             Some(anthropic::Thinking::Enabled { budget_tokens })
         } else {
             None
@@ -1108,6 +1110,7 @@ mod tests {
             temperature: None,
             tools: vec![],
             tool_choice: None,
+            thinking_allowed: true,
         };
 
         let anthropic_request = into_anthropic(

crates/language_models/src/provider/bedrock.rs 🔗

@@ -799,7 +799,9 @@ pub fn into_bedrock(
         max_tokens: max_output_tokens,
         system: Some(system_message),
         tools: Some(tool_config),
-        thinking: if let BedrockModelMode::Thinking { budget_tokens } = mode {
+        thinking: if request.thinking_allowed
+            && let BedrockModelMode::Thinking { budget_tokens } = mode
+        {
             Some(bedrock::Thinking::Enabled { budget_tokens })
         } else {
             None

crates/language_models/src/provider/cloud.rs 🔗

@@ -849,6 +849,7 @@ impl LanguageModel for CloudLanguageModel {
         let use_cloud = cx
             .update(|cx| cx.has_flag::<ZedCloudFeatureFlag>())
             .unwrap_or(false);
+        let thinking_allowed = request.thinking_allowed;
         match self.model.provider {
             zed_llm_client::LanguageModelProvider::Anthropic => {
                 let request = into_anthropic(
@@ -856,7 +857,7 @@ impl LanguageModel for CloudLanguageModel {
                     self.model.id.to_string(),
                     1.0,
                     self.model.max_output_tokens as u64,
-                    if self.model.id.0.ends_with("-thinking") {
+                    if thinking_allowed && self.model.id.0.ends_with("-thinking") {
                         AnthropicModelMode::Thinking {
                             budget_tokens: Some(4_096),
                         }

crates/language_models/src/provider/google.rs 🔗

@@ -559,11 +559,11 @@ pub fn into_google(
             stop_sequences: Some(request.stop),
             max_output_tokens: None,
             temperature: request.temperature.map(|t| t as f64).or(Some(1.0)),
-            thinking_config: match mode {
-                GoogleModelMode::Thinking { budget_tokens } => {
+            thinking_config: match (request.thinking_allowed, mode) {
+                (true, GoogleModelMode::Thinking { budget_tokens }) => {
                     budget_tokens.map(|thinking_budget| ThinkingConfig { thinking_budget })
                 }
-                GoogleModelMode::Default => None,
+                _ => None,
             },
             top_p: None,
             top_k: None,

crates/language_models/src/provider/mistral.rs 🔗

@@ -911,6 +911,7 @@ mod tests {
             intent: None,
             mode: None,
             stop: vec![],
+            thinking_allowed: true,
         };
 
         let mistral_request = into_mistral(request, "mistral-small-latest".into(), None);
@@ -943,6 +944,7 @@ mod tests {
             intent: None,
             mode: None,
             stop: vec![],
+            thinking_allowed: true,
         };
 
         let mistral_request = into_mistral(request, "pixtral-12b-latest".into(), None);

crates/language_models/src/provider/ollama.rs 🔗

@@ -334,7 +334,10 @@ impl OllamaLanguageModel {
                 temperature: request.temperature.or(Some(1.0)),
                 ..Default::default()
             }),
-            think: self.model.supports_thinking,
+            think: self
+                .model
+                .supports_thinking
+                .map(|supports_thinking| supports_thinking && request.thinking_allowed),
             tools: request.tools.into_iter().map(tool_into_ollama).collect(),
         }
     }

crates/language_models/src/provider/open_router.rs 🔗

@@ -523,7 +523,9 @@ pub fn into_open_router(
             None
         },
         usage: open_router::RequestUsage { include: true },
-        reasoning: if let OpenRouterModelMode::Thinking { budget_tokens } = model.mode {
+        reasoning: if request.thinking_allowed
+            && let OpenRouterModelMode::Thinking { budget_tokens } = model.mode
+        {
             Some(open_router::Reasoning {
                 effort: None,
                 max_tokens: budget_tokens,