loop: show error when LLM response hits max tokens limit

Philip Zeyliger created

Prompt: Fix https://github.com/boldsoftware/exe.dev/issues/84 . See how boldsoftware/sketch dealt with a similar problem.
(follow-up: don't auto-continue, just error out and tell the LLM to use smaller outputs)
(follow-up: fix patch tool spinning on truncated/malformed input)
(follow-up: fix bash description to talk about input not output, remove unused anthropic-beta header)

When the LLM returns a response with StopReasonMaxTokens, the loop now
records an error message that informs the user about the truncation and
instructs the LLM to retry with smaller, incremental changes.
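
In outline (condensed from the loop.go diff below), the turn now ends with a
recorded error instead of auto-continuing when the response was cut off:

	// Condensed from processLLMRequest in the loop.go diff below.
	if resp.StopReason == llm.StopReasonMaxTokens {
		l.logger.Warn("LLM response truncated due to max tokens")
		return l.handleMaxTokensTruncation(ctx) // appends a user-role error message and ends the turn
	}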

Changes:
- loop.go: Add handleMaxTokensTruncation(), which records an error message
  instead of auto-continuing
- ant.go: Remove the 128k retry logic and the unused TokenEfficientToolUse field
- bash.go: Add a note about the 60k token limit for command input
- patch.go: Add a note about the 60k token limit and suggest incremental patches
- patch.go: Fix patchParse to give a clear error when the patches field is
  missing or empty (e.g., from a truncated LLM response) instead of spinning;
  see the sketch after this list
- Tests updated accordingly
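
The spinning fix in patch.go covers the case where a truncated response
produces a tool call whose JSON parses but carries no patches. A minimal
sketch of that input, matching the new test case in patch_test.go below:

	// Valid JSON from a truncated response, but the patches field is missing.
	msg := json.RawMessage(`{"path":"server/dashboard.go"}`)
	result := patch.Run(ctx, msg)
	// result.Error now reports "patches field is missing or empty ..." rather
	// than the tool spinning on the malformed input, so the model can retry
	// with smaller patches.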

Fixes https://github.com/boldsoftware/exe.dev/issues/84

Change summary

claudetool/bash.go       |  3 +
claudetool/patch.go      | 10 +++++
claudetool/patch_test.go | 17 +++++++++
llm/ant/ant.go           | 28 ---------------
loop/loop.go             | 37 ++++++++++++++++++++
loop/loop_test.go        | 75 ++++++++++++++++++++++++++++++++++++++++++
loop/predictable.go      | 27 +++++++++++++++
7 files changed, 169 insertions(+), 28 deletions(-)

Detailed changes

claudetool/bash.go 🔗

@@ -107,6 +107,9 @@ installs, tests, or any other substantive operation.
 
 To change the working directory persistently, use the change_dir tool.
 
+IMPORTANT: Keep commands concise. The command input must be less than 60k tokens.
+For complex scripts, write them to a file first and then execute the file.
+
 <pwd>%s</pwd>
 `
 	// If you modify this, update the termui template for prettier rendering.

claudetool/patch.go 🔗

@@ -103,6 +103,10 @@ Recipes:
 Usage notes:
 - All inputs are interpreted literally (no automatic newline or whitespace handling)
 - For replace operations, oldText must appear EXACTLY ONCE in the file
+
+IMPORTANT: Each patch call must be less than 60k tokens total. For large file
+changes, break them into multiple smaller patch operations rather than one
+large overwrite. Prefer incremental replace operations over full file overwrites.
 `
 
 	// If you modify this, update the termui template for prettier rendering.
@@ -317,7 +321,7 @@ func (p *PatchTool) patchParse(m json.RawMessage) (PatchInput, error) {
 		originalErr = err
 	}
 	var inputOneString PatchInputOneString
-	if err := json.Unmarshal(m, &inputOneString); err == nil {
+	if err := json.Unmarshal(m, &inputOneString); err == nil && inputOneString.Patches != "" {
 		var onePatch PatchRequest
 		if err := json.Unmarshal([]byte(inputOneString.Patches), &onePatch); err == nil && onePatch.Operation != "" {
 			return PatchInput{Path: inputOneString.Path, Patches: []PatchRequest{onePatch}}, nil
@@ -331,6 +335,10 @@ func (p *PatchTool) patchParse(m json.RawMessage) (PatchInput, error) {
 			originalErr = err
 		}
 	}
+	// If JSON parsed but patches field was missing/empty, provide a clear error
+	if originalErr == nil {
+		return PatchInput{}, fmt.Errorf("patches field is missing or empty (this may indicate a truncated LLM response)\nJSON: %s", string(m))
+	}
 	return PatchInput{}, fmt.Errorf("failed to unmarshal patch input: %w\nJSON: %s", originalErr, string(m))
 }
 

claudetool/patch_test.go 🔗

@@ -308,6 +308,23 @@ func TestPatchTool_ErrorCases(t *testing.T) {
 	if result.Error == nil || !strings.Contains(result.Error.Error(), "clipboard") {
 		t.Error("expected clipboard error")
 	}
+
+	// Test missing patches field (simulates truncated LLM response)
+	msg = json.RawMessage(`{"path":"server/dashboard.go"}`)
+	result = patch.Run(ctx, msg)
+	if result.Error == nil {
+		t.Error("expected error for missing patches field")
+	}
+	if !strings.Contains(result.Error.Error(), "missing or empty") {
+		t.Errorf("expected 'missing or empty' in error, got: %v", result.Error)
+	}
+
+	// Test empty patches array
+	msg = json.RawMessage(`{"path":"server/dashboard.go","patches":[]}`)
+	result = patch.Run(ctx, msg)
+	if result.Error == nil {
+		t.Error("expected error for empty patches array")
+	}
 }
 
 func TestPatchTool_FlexibleInputParsing(t *testing.T) {

llm/ant/ant.go 🔗

@@ -236,7 +236,7 @@ type request struct {
 	TopP          float64         `json:"top_p,omitempty"`
 	StopSequences []string        `json:"stop_sequences,omitempty"`
 
-	TokenEfficientToolUse bool `json:"-"` // DO NOT USE, broken on Anthropic's side as of 2025-02-28
+
 }
 
 func mapped[Slice ~[]E, E, T any](s Slice, f func(E) T) []T {
@@ -481,8 +481,6 @@ func (s *Service) Do(ctx context.Context, ir *llm.Request) (*llm.Response, error
 	}
 
 	backoff := []time.Duration{15 * time.Second, 30 * time.Second, time.Minute}
-	largerMaxTokens := false
-	var partialUsage usage
 
 	url := cmp.Or(s.URL, DefaultURL)
 	httpc := cmp.Or(s.HTTPC, http.DefaultClient)
@@ -522,18 +520,6 @@ func (s *Service) Do(ctx context.Context, ir *llm.Request) (*llm.Response, error
 		req.Header.Set("X-API-Key", s.APIKey)
 		req.Header.Set("Anthropic-Version", "2023-06-01")
 
-		var features []string
-		if request.TokenEfficientToolUse {
-			features = append(features, "token-efficient-tool-use-2025-02-19")
-		}
-		if largerMaxTokens {
-			features = append(features, "output-128k-2025-02-19")
-			request.MaxTokens = 128 * 1024
-		}
-		if len(features) > 0 {
-			req.Header.Set("anthropic-beta", strings.Join(features, ","))
-		}
-
 		resp, err := httpc.Do(req)
 		if err != nil {
 			// Don't retry httprr cache misses
@@ -566,19 +552,7 @@ func (s *Service) Do(ctx context.Context, ir *llm.Request) (*llm.Response, error
 			if err != nil {
 				return nil, errors.Join(errs, err)
 			}
-			if response.StopReason == "max_tokens" && !largerMaxTokens {
-				slog.InfoContext(ctx, "anthropic_retrying_with_larger_tokens", "message", "Retrying Anthropic API call with larger max tokens size")
-				// Retry with more output tokens.
-				largerMaxTokens = true
-				response.Usage.CostUSD = llm.CostUSDFromResponse(resp.Header)
-				partialUsage = response.Usage
-				continue
-			}
-
 			// Calculate and set the cost_usd field
-			if largerMaxTokens {
-				response.Usage.Add(partialUsage)
-			}
 			response.Usage.CostUSD = llm.CostUSDFromResponse(resp.Header)
 
 			endTime := time.Now()

loop/loop.go 🔗

@@ -288,6 +288,12 @@ func (l *Loop) processLLMRequest(ctx context.Context) error {
 		return l.handleToolCalls(ctx, resp.Content)
 	}
 
+	// Handle max tokens truncation - record error message for the user
+	if resp.StopReason == llm.StopReasonMaxTokens {
+		l.logger.Warn("LLM response truncated due to max tokens")
+		return l.handleMaxTokensTruncation(ctx)
+	}
+
 	// End of turn - check for git state changes
 	l.checkGitStateChange(ctx)
 
@@ -331,6 +337,37 @@ func (l *Loop) checkGitStateChange(ctx context.Context) {
 	}
 }
 
+// handleMaxTokensTruncation handles the case where the LLM response was truncated
+// due to hitting the maximum output token limit. It records an error message
+// informing the user and instructing the LLM to use smaller outputs.
+func (l *Loop) handleMaxTokensTruncation(ctx context.Context) error {
+	errorMessage := llm.Message{
+		Role: llm.MessageRoleUser,
+		Content: []llm.Content{
+			{
+				Type: llm.ContentTypeText,
+				Text: "[SYSTEM ERROR: Your previous response was truncated because it exceeded the maximum output token limit. " +
+					"Any tool calls in that response were lost. Please retry with smaller, incremental changes. " +
+					"For file operations, break large changes into multiple smaller patches. " +
+					"The user can ask you to continue if needed.]",
+			},
+		},
+	}
+
+	l.mu.Lock()
+	l.history = append(l.history, errorMessage)
+	l.mu.Unlock()
+
+	// Record the error message
+	if err := l.recordMessage(ctx, errorMessage, llm.Usage{}); err != nil {
+		l.logger.Error("failed to record truncation error message", "error", err)
+	}
+
+	// End the turn - don't automatically continue
+	l.checkGitStateChange(ctx)
+	return nil
+}
+
 // handleToolCalls processes tool calls from the LLM response
 func (l *Loop) handleToolCalls(ctx context.Context, content []llm.Content) error {
 	var toolResults []llm.Content

loop/loop_test.go 🔗

@@ -7,6 +7,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"strings"
 	"sync"
 	"testing"
 	"time"
@@ -1164,3 +1165,77 @@ func runGit(t *testing.T, dir string, args ...string) {
 		t.Fatalf("git %v failed: %v\n%s", args, err, output)
 	}
 }
+
+func TestMaxTokensTruncation(t *testing.T) {
+	var recordedMessages []llm.Message
+	var mu sync.Mutex
+
+	recordFunc := func(ctx context.Context, message llm.Message, usage llm.Usage) error {
+		mu.Lock()
+		defer mu.Unlock()
+		recordedMessages = append(recordedMessages, message)
+		return nil
+	}
+
+	service := NewPredictableService()
+	loop := NewLoop(Config{
+		LLM:           service,
+		History:       []llm.Message{},
+		Tools:         []*llm.Tool{},
+		RecordMessage: recordFunc,
+	})
+
+	// Queue a user message that triggers max_tokens response
+	userMessage := llm.Message{
+		Role:    llm.MessageRoleUser,
+		Content: []llm.Content{{Type: llm.ContentTypeText, Text: "maxTokens"}},
+	}
+	loop.QueueUserMessage(userMessage)
+
+	// Process the turn - should end with error message about truncation
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	err := loop.ProcessOneTurn(ctx)
+	if err != nil {
+		t.Fatalf("ProcessOneTurn failed: %v", err)
+	}
+
+	// Check that messages were recorded:
+	// 1. First assistant message (truncated)
+	// 2. User error message about truncation
+	mu.Lock()
+	numMessages := len(recordedMessages)
+	mu.Unlock()
+
+	if numMessages != 2 {
+		mu.Lock()
+		for i, msg := range recordedMessages {
+			t.Logf("Message %d: role=%v, content=%v", i, msg.Role, msg.Content)
+		}
+		mu.Unlock()
+		t.Fatalf("expected 2 recorded messages (truncated response, error message), got %d", numMessages)
+	}
+
+	// Verify the first message was the truncated assistant response
+	mu.Lock()
+	firstMsg := recordedMessages[0]
+	mu.Unlock()
+	if firstMsg.Role != llm.MessageRoleAssistant {
+		t.Errorf("expected first message to be assistant, got %v", firstMsg.Role)
+	}
+
+	// Verify the second message is the error/system message about truncation
+	mu.Lock()
+	secondMsg := recordedMessages[1]
+	mu.Unlock()
+	if secondMsg.Role != llm.MessageRoleUser {
+		t.Errorf("expected second message to be user (system error), got %v", secondMsg.Role)
+	}
+	if !strings.Contains(secondMsg.Content[0].Text, "truncated") {
+		t.Errorf("expected error message to mention truncation, got %q", secondMsg.Content[0].Text)
+	}
+	if !strings.Contains(secondMsg.Content[0].Text, "smaller") {
+		t.Errorf("expected error message to suggest smaller changes, got %q", secondMsg.Content[0].Text)
+	}
+}

loop/predictable.go 🔗

@@ -124,6 +124,10 @@ func (s *PredictableService) Do(ctx context.Context, req *llm.Request) (*llm.Res
 		// Trigger a patch with malformed JSON (simulates Anthropic sending invalid JSON)
 		return s.makeMalformedPatchToolResponse(inputTokens), nil
 
+	case "maxTokens":
+		// Simulate a max_tokens truncation
+		return s.makeMaxTokensResponse("This is a truncated response that was cut off mid-sentence because the output token limit was", inputTokens), nil
+
 	default:
 		// Handle pattern-based inputs
 		if strings.HasPrefix(inputText, "echo: ") {
@@ -175,6 +179,29 @@ func (s *PredictableService) Do(ctx context.Context, req *llm.Request) (*llm.Res
 	}
 }
 
+// makeMaxTokensResponse creates a response that simulates hitting max_tokens limit
+func (s *PredictableService) makeMaxTokensResponse(text string, inputTokens uint64) *llm.Response {
+	outputTokens := uint64(len(text) / 4)
+	if outputTokens == 0 {
+		outputTokens = 1
+	}
+	return &llm.Response{
+		ID:    fmt.Sprintf("pred-%d", time.Now().UnixNano()),
+		Type:  "message",
+		Role:  llm.MessageRoleAssistant,
+		Model: "predictable-v1",
+		Content: []llm.Content{
+			{Type: llm.ContentTypeText, Text: text},
+		},
+		StopReason: llm.StopReasonMaxTokens,
+		Usage: llm.Usage{
+			InputTokens:  inputTokens,
+			OutputTokens: outputTokens,
+			CostUSD:      0.001,
+		},
+	}
+}
+
 // makeResponse creates a simple text response
 func (s *PredictableService) makeResponse(text string, inputTokens uint64) *llm.Response {
 	outputTokens := uint64(len(text) / 4) // ~4 chars per token