From b938c9f45d4ad0f6bd3b88f13824de8430fcd6aa Mon Sep 17 00:00:00 2001
From: Philip Zeyliger
Date: Tue, 6 Jan 2026 00:26:17 +0000
Subject: [PATCH] loop: show error when LLM response hits max tokens limit

Prompt: Fix https://github.com/boldsoftware/exe.dev/issues/84 . See how
boldsoftware/sketch dealt with a similar problem.

(follow-up: don't auto-continue, just error out and tell the LLM to use
smaller outputs)
(follow-up: fix patch tool spinning on truncated/malformed input)
(follow-up: fix bash description to talk about input not output, remove
unused anthropic-beta header)

When the LLM returns a response with StopReasonMaxTokens, the loop now
records an error message that informs the user about the truncation and
instructs the LLM to retry with smaller, incremental changes.

Changes:
- loop.go: handleMaxTokensTruncation() records an error message instead
  of auto-continuing
- ant.go: remove the 128k retry logic and the unused TokenEfficientToolUse
  field
- bash.go: add a note about the 60k token limit for command input
- patch.go: add a note about the 60k token limit and suggest incremental
  patches
- patch.go: fix patchParse to give a clear error when the patches field is
  missing or empty (e.g., from a truncated LLM response) instead of spinning
- tests updated accordingly

Fixes https://github.com/boldsoftware/exe.dev/issues/84
---
 claudetool/bash.go       |  3 ++
 claudetool/patch.go      | 10 +++++-
 claudetool/patch_test.go | 17 +++++++++
 llm/ant/ant.go           | 28 +--------------
 loop/loop.go             | 37 ++++++++++++++++++++
 loop/loop_test.go        | 75 ++++++++++++++++++++++++++++++++++++++++
 loop/predictable.go      | 27 +++++++++++++++
 7 files changed, 169 insertions(+), 28 deletions(-)

diff --git a/claudetool/bash.go b/claudetool/bash.go
index c1c63d6cd98323018a97eb0a91c12f5a656cd0de..82f5c708a578ff0c3d2fcfed248943ec10abe8e8 100644
--- a/claudetool/bash.go
+++ b/claudetool/bash.go
@@ -107,6 +107,9 @@ installs, tests, or any other substantive operation.
 
 To change the working directory persistently, use the change_dir tool.
 
+IMPORTANT: Keep commands concise. The command input must be less than 60k tokens.
+For complex scripts, write them to a file first and then execute the file.
+
 %s
 `
 // If you modify this, update the termui template for prettier rendering.
diff --git a/claudetool/patch.go b/claudetool/patch.go
index 614670711efab97a9286cc148662e7af3dc5c73f..0695320340c126fc9dbb035da21bdeba14bb47e9 100644
--- a/claudetool/patch.go
+++ b/claudetool/patch.go
@@ -103,6 +103,10 @@ Recipes:
 
 Usage notes:
 - All inputs are interpreted literally (no automatic newline or whitespace handling)
 - For replace operations, oldText must appear EXACTLY ONCE in the file
+
+IMPORTANT: Each patch call must be less than 60k tokens total. For large file
+changes, break them into multiple smaller patch operations rather than one
+large overwrite. Prefer incremental replace operations over full file overwrites.
 `
 // If you modify this, update the termui template for prettier rendering.
@@ -317,7 +321,7 @@ func (p *PatchTool) patchParse(m json.RawMessage) (PatchInput, error) {
         originalErr = err
     }
     var inputOneString PatchInputOneString
-    if err := json.Unmarshal(m, &inputOneString); err == nil {
+    if err := json.Unmarshal(m, &inputOneString); err == nil && inputOneString.Patches != "" {
         var onePatch PatchRequest
         if err := json.Unmarshal([]byte(inputOneString.Patches), &onePatch); err == nil && onePatch.Operation != "" {
             return PatchInput{Path: inputOneString.Path, Patches: []PatchRequest{onePatch}}, nil
@@ -331,6 +335,10 @@ func (p *PatchTool) patchParse(m json.RawMessage) (PatchInput, error) {
             originalErr = err
         }
     }
+    // If JSON parsed but patches field was missing/empty, provide a clear error
+    if originalErr == nil {
+        return PatchInput{}, fmt.Errorf("patches field is missing or empty (this may indicate a truncated LLM response)\nJSON: %s", string(m))
+    }
     return PatchInput{}, fmt.Errorf("failed to unmarshal patch input: %w\nJSON: %s", originalErr, string(m))
 }
 
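Note: to make the incremental-patch guidance above concrete, here is a sketch
of the intended shape of a patch call. The "path" and "patches" keys appear in
the tests below; the "operation", "oldText", and "newText" casing, the file
path, and the snippets are assumptions for illustration, not taken from this
patch.

    package example

    import "encoding/json"

    // Prefer several small replace operations, each well under the 60k-token
    // cap, over a single overwrite that resends the whole file. Path and
    // texts are invented; see the usage notes in the patch tool description.
    var incremental = json.RawMessage(`{
        "path": "server/routes.go",
        "patches": [
            {"operation": "replace", "oldText": "return listRoutes(db)", "newText": "return listRoutes(ctx, db)"},
            {"operation": "replace", "oldText": "const pageSize = 50", "newText": "const pageSize = 100"}
        ]
    }`)

Each call stays small, so if one response is truncated, only that one edit is
lost rather than a whole-file rewrite.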
diff --git a/claudetool/patch_test.go b/claudetool/patch_test.go
index f495c196f76ff92822128d3102744f4d3bfb62ae..6a81ca0c0b26df6c425a8edb1f38898c184a71fd 100644
--- a/claudetool/patch_test.go
+++ b/claudetool/patch_test.go
@@ -308,6 +308,23 @@ func TestPatchTool_ErrorCases(t *testing.T) {
     if result.Error == nil || !strings.Contains(result.Error.Error(), "clipboard") {
         t.Error("expected clipboard error")
     }
+
+    // Test missing patches field (simulates truncated LLM response)
+    msg = json.RawMessage(`{"path":"server/dashboard.go"}`)
+    result = patch.Run(ctx, msg)
+    if result.Error == nil {
+        t.Error("expected error for missing patches field")
+    }
+    if !strings.Contains(result.Error.Error(), "missing or empty") {
+        t.Errorf("expected 'missing or empty' in error, got: %v", result.Error)
+    }
+
+    // Test empty patches array
+    msg = json.RawMessage(`{"path":"server/dashboard.go","patches":[]}`)
+    result = patch.Run(ctx, msg)
+    if result.Error == nil {
+        t.Error("expected error for empty patches array")
+    }
 }
 
 func TestPatchTool_FlexibleInputParsing(t *testing.T) {
diff --git a/llm/ant/ant.go b/llm/ant/ant.go
index a1b9acf10013a79c3a8359c181f3992d61d7e340..7cfb75b63af80e95220976518a6fd308a5afb43e 100644
--- a/llm/ant/ant.go
+++ b/llm/ant/ant.go
@@ -236,7 +236,7 @@ type request struct {
     TopP          float64  `json:"top_p,omitempty"`
     StopSequences []string `json:"stop_sequences,omitempty"`
 
-    TokenEfficientToolUse bool `json:"-"` // DO NOT USE, broken on Anthropic's side as of 2025-02-28
+
 }
 
 func mapped[Slice ~[]E, E, T any](s Slice, f func(E) T) []T {
@@ -481,8 +481,6 @@ func (s *Service) Do(ctx context.Context, ir *llm.Request) (*llm.Response, error
     }
     backoff := []time.Duration{15 * time.Second, 30 * time.Second, time.Minute}
 
-    largerMaxTokens := false
-    var partialUsage usage
     url := cmp.Or(s.URL, DefaultURL)
     httpc := cmp.Or(s.HTTPC, http.DefaultClient)
 
@@ -522,18 +520,6 @@ func (s *Service) Do(ctx context.Context, ir *llm.Request) (*llm.Response, error
         req.Header.Set("X-API-Key", s.APIKey)
         req.Header.Set("Anthropic-Version", "2023-06-01")
 
-        var features []string
-        if request.TokenEfficientToolUse {
-            features = append(features, "token-efficient-tool-use-2025-02-19")
-        }
-        if largerMaxTokens {
-            features = append(features, "output-128k-2025-02-19")
-            request.MaxTokens = 128 * 1024
-        }
-        if len(features) > 0 {
-            req.Header.Set("anthropic-beta", strings.Join(features, ","))
-        }
-
         resp, err := httpc.Do(req)
         if err != nil {
             // Don't retry httprr cache misses
@@ -566,19 +552,7 @@ func (s *Service) Do(ctx context.Context, ir *llm.Request) (*llm.Response, error
         if err != nil {
             return nil, errors.Join(errs, err)
         }
-        if response.StopReason == "max_tokens" && !largerMaxTokens {
-            slog.InfoContext(ctx, "anthropic_retrying_with_larger_tokens", "message", "Retrying Anthropic API call with larger max tokens size")
-            // Retry with more output tokens.
-            largerMaxTokens = true
-            response.Usage.CostUSD = llm.CostUSDFromResponse(resp.Header)
-            partialUsage = response.Usage
-            continue
-        }
-
         // Calculate and set the cost_usd field
-        if largerMaxTokens {
-            response.Usage.Add(partialUsage)
-        }
         response.Usage.CostUSD = llm.CostUSDFromResponse(resp.Header)
 
         endTime := time.Now()
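Note: after this change ant.go returns a truncated response as-is, with
StopReason preserved, and the caller owns the recovery policy (see loop.go
below). A minimal sketch of that contract, using stand-in types since the llm
package internals are not shown in this patch:

    package example

    import "context"

    // Stand-ins for the real llm types; shapes assumed from this patch.
    type Request struct{}
    type Response struct{ StopReason string }

    // "max_tokens" is the value the removed ant.go retry checked for.
    const stopReasonMaxTokens = "max_tokens"

    // doFunc mirrors Service.Do's signature from the hunks above.
    type doFunc func(ctx context.Context, req *Request) (*Response, error)

    // callOnce shows the new division of labor: the transport makes one
    // attempt and reports truncation via StopReason instead of silently
    // retrying with a larger output budget.
    func callOnce(ctx context.Context, do doFunc, req *Request) (*Response, error) {
        resp, err := do(ctx, req)
        if err != nil {
            return nil, err
        }
        if resp.StopReason == stopReasonMaxTokens {
            // Truncated: leave recovery to the loop, which records an
            // error message and ends the turn (handleMaxTokensTruncation).
        }
        return resp, nil
    }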
" + + "The user can ask you to continue if needed.]", + }, + }, + } + + l.mu.Lock() + l.history = append(l.history, errorMessage) + l.mu.Unlock() + + // Record the error message + if err := l.recordMessage(ctx, errorMessage, llm.Usage{}); err != nil { + l.logger.Error("failed to record truncation error message", "error", err) + } + + // End the turn - don't automatically continue + l.checkGitStateChange(ctx) + return nil +} + // handleToolCalls processes tool calls from the LLM response func (l *Loop) handleToolCalls(ctx context.Context, content []llm.Content) error { var toolResults []llm.Content diff --git a/loop/loop_test.go b/loop/loop_test.go index 6518af77c3dc5c1f2019078f00d77db231ceb8d6..9a9e8d8c69b1098fd72e436cd3eff66b0e3c2650 100644 --- a/loop/loop_test.go +++ b/loop/loop_test.go @@ -7,6 +7,7 @@ import ( "os" "os/exec" "path/filepath" + "strings" "sync" "testing" "time" @@ -1164,3 +1165,77 @@ func runGit(t *testing.T, dir string, args ...string) { t.Fatalf("git %v failed: %v\n%s", args, err, output) } } + +func TestMaxTokensTruncation(t *testing.T) { + var recordedMessages []llm.Message + var mu sync.Mutex + + recordFunc := func(ctx context.Context, message llm.Message, usage llm.Usage) error { + mu.Lock() + defer mu.Unlock() + recordedMessages = append(recordedMessages, message) + return nil + } + + service := NewPredictableService() + loop := NewLoop(Config{ + LLM: service, + History: []llm.Message{}, + Tools: []*llm.Tool{}, + RecordMessage: recordFunc, + }) + + // Queue a user message that triggers max_tokens response + userMessage := llm.Message{ + Role: llm.MessageRoleUser, + Content: []llm.Content{{Type: llm.ContentTypeText, Text: "maxTokens"}}, + } + loop.QueueUserMessage(userMessage) + + // Process the turn - should end with error message about truncation + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + err := loop.ProcessOneTurn(ctx) + if err != nil { + t.Fatalf("ProcessOneTurn failed: %v", err) + } + + // Check that messages were recorded: + // 1. First assistant message (truncated) + // 2. 
diff --git a/loop/loop_test.go b/loop/loop_test.go
index 6518af77c3dc5c1f2019078f00d77db231ceb8d6..9a9e8d8c69b1098fd72e436cd3eff66b0e3c2650 100644
--- a/loop/loop_test.go
+++ b/loop/loop_test.go
@@ -7,6 +7,7 @@ import (
     "os"
     "os/exec"
     "path/filepath"
+    "strings"
     "sync"
     "testing"
     "time"
@@ -1164,3 +1165,77 @@ func runGit(t *testing.T, dir string, args ...string) {
         t.Fatalf("git %v failed: %v\n%s", args, err, output)
     }
 }
+
+func TestMaxTokensTruncation(t *testing.T) {
+    var recordedMessages []llm.Message
+    var mu sync.Mutex
+
+    recordFunc := func(ctx context.Context, message llm.Message, usage llm.Usage) error {
+        mu.Lock()
+        defer mu.Unlock()
+        recordedMessages = append(recordedMessages, message)
+        return nil
+    }
+
+    service := NewPredictableService()
+    loop := NewLoop(Config{
+        LLM:           service,
+        History:       []llm.Message{},
+        Tools:         []*llm.Tool{},
+        RecordMessage: recordFunc,
+    })
+
+    // Queue a user message that triggers max_tokens response
+    userMessage := llm.Message{
+        Role:    llm.MessageRoleUser,
+        Content: []llm.Content{{Type: llm.ContentTypeText, Text: "maxTokens"}},
+    }
+    loop.QueueUserMessage(userMessage)
+
+    // Process the turn - should end with error message about truncation
+    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+    defer cancel()
+
+    err := loop.ProcessOneTurn(ctx)
+    if err != nil {
+        t.Fatalf("ProcessOneTurn failed: %v", err)
+    }
+
+    // Check that messages were recorded:
+    // 1. First assistant message (truncated)
+    // 2. User error message about truncation
+    mu.Lock()
+    numMessages := len(recordedMessages)
+    mu.Unlock()
+
+    if numMessages != 2 {
+        mu.Lock()
+        for i, msg := range recordedMessages {
+            t.Logf("Message %d: role=%v, content=%v", i, msg.Role, msg.Content)
+        }
+        mu.Unlock()
+        t.Fatalf("expected 2 recorded messages (truncated response, error message), got %d", numMessages)
+    }
+
+    // Verify the first message was the truncated assistant response
+    mu.Lock()
+    firstMsg := recordedMessages[0]
+    mu.Unlock()
+    if firstMsg.Role != llm.MessageRoleAssistant {
+        t.Errorf("expected first message to be assistant, got %v", firstMsg.Role)
+    }
+
+    // Verify the second message is the error/system message about truncation
+    mu.Lock()
+    secondMsg := recordedMessages[1]
+    mu.Unlock()
+    if secondMsg.Role != llm.MessageRoleUser {
+        t.Errorf("expected second message to be user (system error), got %v", secondMsg.Role)
+    }
+    if !strings.Contains(secondMsg.Content[0].Text, "truncated") {
+        t.Errorf("expected error message to mention truncation, got %q", secondMsg.Content[0].Text)
+    }
+    if !strings.Contains(secondMsg.Content[0].Text, "smaller") {
+        t.Errorf("expected error message to suggest smaller changes, got %q", secondMsg.Content[0].Text)
+    }
+}
diff --git a/loop/predictable.go b/loop/predictable.go
index 2f51331deb73fd3834f674a52cfcf34efd880a49..b93c2cf23496f3390cceee73e2d6cd5baf7c362b 100644
--- a/loop/predictable.go
+++ b/loop/predictable.go
@@ -124,6 +124,10 @@ func (s *PredictableService) Do(ctx context.Context, req *llm.Request) (*llm.Res
         // Trigger a patch with malformed JSON (simulates Anthropic sending invalid JSON)
         return s.makeMalformedPatchToolResponse(inputTokens), nil
 
+    case "maxTokens":
+        // Simulate a max_tokens truncation
+        return s.makeMaxTokensResponse("This is a truncated response that was cut off mid-sentence because the output token limit was", inputTokens), nil
+
     default:
         // Handle pattern-based inputs
         if strings.HasPrefix(inputText, "echo: ") {
@@ -175,6 +179,29 @@ func (s *PredictableService) Do(ctx context.Context, req *llm.Res
     }
 }
 
+// makeMaxTokensResponse creates a response that simulates hitting max_tokens limit
+func (s *PredictableService) makeMaxTokensResponse(text string, inputTokens uint64) *llm.Response {
+    outputTokens := uint64(len(text) / 4)
+    if outputTokens == 0 {
+        outputTokens = 1
+    }
+    return &llm.Response{
+        ID:    fmt.Sprintf("pred-%d", time.Now().UnixNano()),
+        Type:  "message",
+        Role:  llm.MessageRoleAssistant,
+        Model: "predictable-v1",
+        Content: []llm.Content{
+            {Type: llm.ContentTypeText, Text: text},
+        },
+        StopReason: llm.StopReasonMaxTokens,
+        Usage: llm.Usage{
+            InputTokens:  inputTokens,
+            OutputTokens: outputTokens,
+            CostUSD:      0.001,
+        },
+    }
+}
+
 // makeResponse creates a simple text response
 func (s *PredictableService) makeResponse(text string, inputTokens uint64) *llm.Response {
     outputTokens := uint64(len(text) / 4) // ~4 chars per token
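Note: a possible follow-up, not part of this patch: the 60k-token limits
quoted in the bash and patch tool descriptions could also be enforced before
dispatch, reusing the rough 4-characters-per-token estimate that
PredictableService uses above. Sketch only; the constant and helpers are
invented:

    package example

    // maxToolInputTokens mirrors the 60k limit cited in the tool descriptions.
    const maxToolInputTokens = 60_000

    // estimateTokens uses the same ~4 chars per token heuristic as
    // PredictableService's usage accounting.
    func estimateTokens(s string) uint64 {
        n := uint64(len(s) / 4)
        if n == 0 {
            n = 1
        }
        return n
    }

    // tooLarge reports whether a raw tool input likely exceeds the limit,
    // letting the tool fail fast with a helpful error instead of relying
    // on the provider to truncate mid-response.
    func tooLarge(input string) bool {
        return estimateTokens(input) >= maxToolInputTokens
    }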