From bc4bd2e168863f58cdeb72d29f4c26fec4aa0f5a Mon Sep 17 00:00:00 2001 From: Richard Feldman Date: Mon, 4 Nov 2024 15:40:10 -0500 Subject: [PATCH] Don't conservatively include Suggest Edits token in token count (#20180) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before: (note the 1.3k in the upper right corner instead of 3 in the second screenshot) Screenshot 2024-11-04 at 11 37 58 AM Now: Screenshot 2024-11-04 at 11 38 11 AM This was intended to be a conservative estimate in case you pressed Suggest Edits (and therefore might have an unpleasant surprise if you were under the context limit but Suggest Edits failed with a "too much context" error message anyway, after the Suggest Edits context got added for you behind the scenes). However, in retrospect this design created more [confusion in the common case](https://github.com/zed-industries/zed/pull/19900#issuecomment-2453456569) because it made it look like more context had been actually consumed than what was actually consumed. This does raise a potential design question for the future: the Suggest Edits button adds more context at the last minute without ever communicating that it's going to do that. In the meantime it seems best to go back to the less-confusing way of reporting the token counts, especially since only users of the experimental flag could possibly press Suggest Edits anyway. Release Notes: - Fixed issue where initial token count was over-reported as 1.3k instead of 3 (for the context string "You"). 
--- crates/assistant/src/context.rs | 6 ++++-- crates/assistant/src/prompts.rs | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/crates/assistant/src/context.rs b/crates/assistant/src/context.rs index 5b4cff01b6a48c364a0ea342342c0e9304afb8ea..9290e59d85e2a819f8dea5fe5922c45b22519183 100644 --- a/crates/assistant/src/context.rs +++ b/crates/assistant/src/context.rs @@ -1052,7 +1052,9 @@ impl Context { } pub(crate) fn count_remaining_tokens(&mut self, cx: &mut ModelContext<Self>) { - let request = self.to_completion_request(RequestType::SuggestEdits, cx); // Conservatively assume SuggestEdits, since it takes more tokens. + // Assume it will be a Chat request, even though that takes fewer tokens (and risks going over the limit), + // because otherwise you see in the UI that your empty message has a bunch of tokens already used. + let request = self.to_completion_request(RequestType::Chat, cx); let Some(model) = LanguageModelRegistry::read_global(cx).active_model() else { return; }; @@ -2202,7 +2204,7 @@ impl Context { } if let RequestType::SuggestEdits = request_type { - if let Ok(preamble) = self.prompt_builder.generate_workflow_prompt() { + if let Ok(preamble) = self.prompt_builder.generate_suggest_edits_prompt() { let last_elem_index = completion_request.messages.len(); completion_request diff --git a/crates/assistant/src/prompts.rs b/crates/assistant/src/prompts.rs index 50fee242eab42d006cea87dbc937e5f6af81984a..4d6d7ea3df9556d28b6cfb2367454119ae815209 100644 --- a/crates/assistant/src/prompts.rs +++ b/crates/assistant/src/prompts.rs @@ -310,7 +310,7 @@ impl PromptBuilder { .render("terminal_assistant_prompt", &context) } - pub fn generate_workflow_prompt(&self) -> Result<String> { + pub fn generate_suggest_edits_prompt(&self) -> Result<String> { self.handlebars.lock().render("suggest_edits", &()) }