fix: support local models with unknown max_tokens and context window (#2554)

Kartik33 created

Two fixes for local/custom model compatibility (LM Studio, Ollama, llama.cpp):

1. Don't send MaxOutputTokens when it's 0. Custom models not in the
   catwalk providers list have DefaultMaxTokens=0, which gets sent as
   max_tokens:0 in the API request. LM Studio rejects this with
   "maxPredictedTokens does not satisfy the schema". Fix: only send
   the field when the value is positive.

2. Skip auto-summarize when ContextWindow is 0. Custom models have
   ContextWindow=0, which makes the remaining token count negative and
   immediately triggers auto-summarize after the first response. The
   session resets with
   "previous session was interrupted because it got too long" even for
   short conversations. Fix: skip the check when context window is
   unknown.

Fixes #1218 (regression), relates to #1583, #1591

Change summary

internal/agent/agent.go | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)

Detailed changes

internal/agent/agent.go 🔗

@@ -249,12 +249,17 @@ func (a *sessionAgent) Run(ctx context.Context, call SessionAgentCall) (*fantasy
 
 	var currentAssistant *message.Message
 	var shouldSummarize bool
+	// Don't send MaxOutputTokens if 0 — some providers (e.g. LM Studio) reject it
+	var maxOutputTokens *int64
+	if call.MaxOutputTokens > 0 {
+		maxOutputTokens = &call.MaxOutputTokens
+	}
 	result, err := agent.Stream(genCtx, fantasy.AgentStreamCall{
 		Prompt:           message.PromptWithTextAttachments(call.Prompt, call.Attachments),
 		Files:            files,
 		Messages:         history,
 		ProviderOptions:  call.ProviderOptions,
-		MaxOutputTokens:  &call.MaxOutputTokens,
+		MaxOutputTokens:  maxOutputTokens,
 		TopP:             call.TopP,
 		Temperature:      call.Temperature,
 		PresencePenalty:  call.PresencePenalty,
@@ -425,6 +430,11 @@ func (a *sessionAgent) Run(ctx context.Context, call SessionAgentCall) (*fantasy
 		StopWhen: []fantasy.StopCondition{
 			func(_ []fantasy.StepResult) bool {
 				cw := int64(largeModel.CatwalkCfg.ContextWindow)
+				// If context window is unknown (0), skip auto-summarize
+				// to avoid immediately truncating custom/local models.
+				if cw == 0 {
+					return false
+				}
 				tokens := currentSession.CompletionTokens + currentSession.PromptTokens
 				remaining := cw - tokens
 				var threshold int64