From 0cab8bfed4df7148dcbb0ed01c02208b401abea5 Mon Sep 17 00:00:00 2001 From: Hugo Dutka Date: Tue, 24 Mar 2026 21:29:02 +0100 Subject: [PATCH] feat: anthropic computer use (#185) --- agent.go | 133 ++- agent_test.go | 463 ++++++++- content.go | 62 +- examples/computer-use/main.go | 103 ++ providers/anthropic/anthropic.go | 139 ++- providers/anthropic/anthropic_test.go | 924 ++++++++++++++++++ providers/anthropic/computer_use.go | 427 ++++++++ providers/anthropic/computer_use_test.go | 303 ++++++ providertests/anthropic_test.go | 185 ++++ .../claude-opus-4-6/computer_use.yaml | 69 ++ .../computer_use_streaming.yaml | 132 +++ .../claude-sonnet-4/computer_use.yaml | 69 ++ .../computer_use_streaming.yaml | 144 +++ 13 files changed, 3100 insertions(+), 53 deletions(-) create mode 100644 examples/computer-use/main.go create mode 100644 providers/anthropic/computer_use.go create mode 100644 providers/anthropic/computer_use_test.go create mode 100644 providertests/testdata/TestAnthropicComputerUse/claude-opus-4-6/computer_use.yaml create mode 100644 providertests/testdata/TestAnthropicComputerUse/claude-opus-4-6/computer_use_streaming.yaml create mode 100644 providertests/testdata/TestAnthropicComputerUse/claude-sonnet-4/computer_use.yaml create mode 100644 providertests/testdata/TestAnthropicComputerUse/claude-sonnet-4/computer_use_streaming.yaml diff --git a/agent.go b/agent.go index 08d488770354ef3c1af6685bba1bd04d4e453533..34d4f8a1fb6efa1c6d0028bba905b1d52896ed1e 100644 --- a/agent.go +++ b/agent.go @@ -3,6 +3,7 @@ package fantasy import ( "cmp" "context" + "encoding/base64" "encoding/json" "errors" "fmt" @@ -142,9 +143,10 @@ type agentSettings struct { userAgent string providerOptions ProviderOptions - providerDefinedTools []ProviderDefinedTool - tools []AgentTool - maxRetries *int + providerDefinedTools []ProviderDefinedTool + executableProviderTools []ExecutableProviderTool + tools []AgentTool + maxRetries *int model LanguageModel @@ -432,13 +434,17 @@ func (a *agent) Generate(ctx context.Context, opts AgentCall) (*AgentResult, err preparedTools := a.prepareTools(stepTools, a.settings.providerDefinedTools, stepActiveTools, disableAllTools) + // Filter executable provider tools by activeTools at the + // step level, consistent with how stepTools (AgentTools) + // are scoped before being passed to inner functions. + stepExecProviderTools := a.filterExecProviderTools(stepActiveTools) + retryOptions := DefaultRetryOptions() if opts.MaxRetries != nil { retryOptions.MaxRetries = *opts.MaxRetries } retryOptions.OnRetry = opts.OnRetry retry := RetryWithExponentialBackoffRespectingRetryHeaders[*Response](retryOptions) - result, err := retry(ctx, func() (*Response, error) { return stepModel.Generate(ctx, Call{ Prompt: stepInputMessages, @@ -472,15 +478,14 @@ func (a *agent) Generate(ctx context.Context, opts AgentCall) (*AgentResult, err continue } // Validate and potentially repair the tool call - validatedToolCall := a.validateAndRepairToolCall(ctx, toolCall, stepTools, stepSystemPrompt, stepInputMessages, a.settings.repairToolCall) + validatedToolCall := a.validateAndRepairToolCall(ctx, toolCall, stepTools, stepExecProviderTools, stepSystemPrompt, stepInputMessages, a.settings.repairToolCall) stepToolCalls = append(stepToolCalls, validatedToolCall) } } - toolResults, err := a.executeTools(ctx, stepTools, stepToolCalls, nil) + toolResults, err := a.executeTools(ctx, stepTools, stepExecProviderTools, stepToolCalls, nil) - // Build step content with validated tool calls and tool results. - // Provider-executed tool calls are kept as-is. + // Build step content with validated tool calls and tool results. // Provider-executed tool calls are kept as-is. stepContent := []Content{} toolCallIndex := 0 for _, content := range result.Content { @@ -644,7 +649,7 @@ func toResponseMessages(content []Content) []Message { return messages } -func (a *agent) executeTools(ctx context.Context, allTools []AgentTool, toolCalls []ToolCallContent, toolResultCallback func(result ToolResultContent) error) ([]ToolResultContent, error) { +func (a *agent) executeTools(ctx context.Context, allTools []AgentTool, execProviderTools []ExecutableProviderTool, toolCalls []ToolCallContent, toolResultCallback func(result ToolResultContent) error) ([]ToolResultContent, error) { if len(toolCalls) == 0 { return nil, nil } @@ -655,11 +660,16 @@ func (a *agent) executeTools(ctx context.Context, allTools []AgentTool, toolCall toolMap[tool.Info().Name] = tool } + execProviderToolMap := make(map[string]ExecutableProviderTool, len(execProviderTools)) + for _, ept := range execProviderTools { + execProviderToolMap[ept.GetName()] = ept + } + // Execute all tool calls sequentially in order results := make([]ToolResultContent, 0, len(toolCalls)) for _, toolCall := range toolCalls { - result, isCriticalError := a.executeSingleTool(ctx, toolMap, toolCall, toolResultCallback) + result, isCriticalError := a.executeSingleTool(ctx, toolMap, execProviderToolMap, toolCall, toolResultCallback) results = append(results, result) if isCriticalError { if errorResult, ok := result.Result.(ToolResultOutputContentError); ok && errorResult.Error != nil { @@ -672,7 +682,7 @@ func (a *agent) executeTools(ctx context.Context, allTools []AgentTool, toolCall } // executeSingleTool executes a single tool and returns its result and a critical error flag. -func (a *agent) executeSingleTool(ctx context.Context, toolMap map[string]AgentTool, toolCall ToolCallContent, toolResultCallback func(result ToolResultContent) error) (ToolResultContent, bool) { +func (a *agent) executeSingleTool(ctx context.Context, toolMap map[string]AgentTool, execProviderToolMap map[string]ExecutableProviderTool, toolCall ToolCallContent, toolResultCallback func(result ToolResultContent) error) (ToolResultContent, bool) { result := ToolResultContent{ ToolCallID: toolCall.ToolCallID, ToolName: toolCall.ToolName, @@ -690,10 +700,17 @@ func (a *agent) executeSingleTool(ctx context.Context, toolMap map[string]AgentT return result, false } - tool, exists := toolMap[toolCall.ToolName] - if !exists { + // Find the run function — either from a regular AgentTool or an + // executable provider tool. + var runTool func(ctx context.Context, call ToolCall) (ToolResponse, error) + if tool, exists := toolMap[toolCall.ToolName]; exists { + runTool = tool.Run + } else if ept, ok := execProviderToolMap[toolCall.ToolName]; ok { + runTool = ept.Run + } + if runTool == nil { result.Result = ToolResultOutputContentError{ - Error: errors.New("Error: Tool not found: " + toolCall.ToolName), + Error: errors.New("tool not found: " + toolCall.ToolName), } if toolResultCallback != nil { _ = toolResultCallback(result) @@ -702,7 +719,7 @@ func (a *agent) executeSingleTool(ctx context.Context, toolMap map[string]AgentT } // Execute the tool - toolResult, err := tool.Run(ctx, ToolCall{ + toolResult, err := runTool(ctx, ToolCall{ ID: toolCall.ToolCallID, Name: toolCall.ToolName, Input: toolCall.Input, @@ -725,7 +742,7 @@ func (a *agent) executeSingleTool(ctx context.Context, toolMap map[string]AgentT } } else if toolResult.Type == "image" || toolResult.Type == "media" { result.Result = ToolResultOutputContentMedia{ - Data: string(toolResult.Data), + Data: base64.StdEncoding.EncodeToString(toolResult.Data), MediaType: toolResult.MediaType, Text: toolResult.Content, } @@ -834,11 +851,15 @@ func (a *agent) Stream(ctx context.Context, opts AgentStreamCall) (*AgentResult, preparedTools := a.prepareTools(stepTools, a.settings.providerDefinedTools, stepActiveTools, disableAllTools) + // Filter executable provider tools by activeTools at the + // step level, consistent with how stepTools (AgentTools) + // are scoped before being passed to inner functions. + stepExecProviderTools := a.filterExecProviderTools(stepActiveTools) + // Start step stream if opts.OnStepStart != nil { _ = opts.OnStepStart(stepNumber) } - // Create streaming call streamCall := Call{ Prompt: stepInputMessages, @@ -870,11 +891,10 @@ func (a *agent) Stream(ctx context.Context, opts AgentStreamCall) (*AgentResult, } // Process the stream - result, err := a.processStepStream(ctx, stream, opts, steps, stepTools) + result, err := a.processStepStream(ctx, stream, opts, steps, stepTools, stepExecProviderTools) if err != nil { return stepExecutionResult{}, err } - return result, nil }) if err != nil { @@ -921,6 +941,22 @@ func (a *agent) Stream(ctx context.Context, opts AgentStreamCall) (*AgentResult, return agentResult, nil } +// filterExecProviderTools returns the subset of executable provider +// tools permitted by activeTools. When activeTools is empty every +// tool is included (no filtering). +func (a *agent) filterExecProviderTools(activeTools []string) []ExecutableProviderTool { + if len(activeTools) == 0 { + return a.settings.executableProviderTools + } + filtered := make([]ExecutableProviderTool, 0, len(a.settings.executableProviderTools)) + for _, ept := range a.settings.executableProviderTools { + if slices.Contains(activeTools, ept.GetName()) { + filtered = append(filtered, ept) + } + } + return filtered +} + func (a *agent) prepareTools(tools []AgentTool, providerDefinedTools []ProviderDefinedTool, activeTools []string, disableAllTools bool) []Tool { preparedTools := make([]Tool, 0, len(tools)+len(providerDefinedTools)) @@ -961,8 +997,8 @@ func (a *agent) prepareTools(tools []AgentTool, providerDefinedTools []ProviderD } // validateAndRepairToolCall validates a tool call and attempts repair if validation fails. -func (a *agent) validateAndRepairToolCall(ctx context.Context, toolCall ToolCallContent, availableTools []AgentTool, systemPrompt string, messages []Message, repairFunc RepairToolCallFunction) ToolCallContent { - if err := a.validateToolCall(toolCall, availableTools); err == nil { +func (a *agent) validateAndRepairToolCall(ctx context.Context, toolCall ToolCallContent, availableTools []AgentTool, execProviderTools []ExecutableProviderTool, systemPrompt string, messages []Message, repairFunc RepairToolCallFunction) ToolCallContent { + if err := a.validateToolCall(toolCall, availableTools, execProviderTools); err == nil { return toolCall } else { //nolint: revive if repairFunc != nil { @@ -975,7 +1011,7 @@ func (a *agent) validateAndRepairToolCall(ctx context.Context, toolCall ToolCall } if repairedToolCall, repairErr := repairFunc(ctx, repairOptions); repairErr == nil && repairedToolCall != nil { - if validateErr := a.validateToolCall(*repairedToolCall, availableTools); validateErr == nil { + if validateErr := a.validateToolCall(*repairedToolCall, availableTools, execProviderTools); validateErr == nil { return *repairedToolCall } } @@ -989,7 +1025,10 @@ func (a *agent) validateAndRepairToolCall(ctx context.Context, toolCall ToolCall } // validateToolCall validates a tool call against available tools and their schemas. -func (a *agent) validateToolCall(toolCall ToolCallContent, availableTools []AgentTool) error { +// Both availableTools and execProviderTools must already be filtered by the +// caller (e.g. via activeTools); this function trusts that the slices +// represent exactly the tools permitted for the current step. +func (a *agent) validateToolCall(toolCall ToolCallContent, availableTools []AgentTool, execProviderTools []ExecutableProviderTool) error { var tool AgentTool for _, t := range availableTools { if t.Info().Name == toolCall.ToolName { @@ -999,6 +1038,18 @@ func (a *agent) validateToolCall(toolCall ToolCallContent, availableTools []Agen } if tool == nil { + // Check if this is an executable provider tool. Provider- + // defined tools have their schema enforced server-side, so + // we only validate that the input is parseable JSON. + for _, ept := range execProviderTools { + if ept.GetName() == toolCall.ToolName { + var input map[string]any + if err := json.Unmarshal([]byte(toolCall.Input), &input); err != nil { + return fmt.Errorf("invalid JSON input: %w", err) + } + return nil + } + } return fmt.Errorf("tool not found: %s", toolCall.ToolName) } @@ -1117,12 +1168,25 @@ func WithTools(tools ...AgentTool) AgentOption { } } -// WithProviderDefinedTools sets the provider-defined tools for the agent. -// These tools are executed by the provider (e.g. web search) rather -// than by the client. -func WithProviderDefinedTools(tools ...ProviderDefinedTool) AgentOption { +// WithProviderDefinedTools registers provider-defined tools with the +// agent. Provider-executed tools (e.g. web search) are passed through +// to the API. Client-executed tools (ExecutableProviderTool) are also +// registered for local execution. +func WithProviderDefinedTools(tools ...ProviderTool) AgentOption { return func(s *agentSettings) { - s.providerDefinedTools = append(s.providerDefinedTools, tools...) + for _, t := range tools { + // Every provider tool goes into providerDefinedTools + // for wire formatting. + s.providerDefinedTools = append( + s.providerDefinedTools, t.providerDefinedTool(), + ) + // Executable ones also register for local execution. + if exec, ok := t.(ExecutableProviderTool); ok { + s.executableProviderTools = append( + s.executableProviderTools, exec, + ) + } + } } } @@ -1162,7 +1226,7 @@ func WithOnRetry(callback OnRetryCallback) AgentOption { } // processStepStream processes a single step's stream and returns the step result. -func (a *agent) processStepStream(ctx context.Context, stream StreamResponse, opts AgentStreamCall, _ []StepResult, stepTools []AgentTool) (stepExecutionResult, error) { +func (a *agent) processStepStream(ctx context.Context, stream StreamResponse, opts AgentStreamCall, _ []StepResult, stepTools []AgentTool, execProviderTools []ExecutableProviderTool) (stepExecutionResult, error) { var stepContent []Content var stepToolCalls []ToolCallContent var stepUsage Usage @@ -1195,6 +1259,11 @@ func (a *agent) processStepStream(ctx context.Context, stream StreamResponse, op toolMap[tool.Info().Name] = tool } + execProviderToolMap := make(map[string]ExecutableProviderTool, len(execProviderTools)) + for _, ept := range execProviderTools { + execProviderToolMap[ept.GetName()] = ept + } + // Semaphores for controlling parallelism parallelSem := make(chan struct{}, 5) var sequentialMu sync.Mutex @@ -1206,7 +1275,7 @@ func (a *agent) processStepStream(ctx context.Context, stream StreamResponse, op parallelSem <- struct{}{} toolExecutionWg.Go(func() { defer func() { <-parallelSem }() - result, isCriticalError := a.executeSingleTool(ctx, toolMap, req.toolCall, opts.OnToolResult) + result, isCriticalError := a.executeSingleTool(ctx, toolMap, execProviderToolMap, req.toolCall, opts.OnToolResult) toolStateMu.Lock() toolResults = append(toolResults, result) if isCriticalError && toolExecutionErr == nil { @@ -1218,7 +1287,7 @@ func (a *agent) processStepStream(ctx context.Context, stream StreamResponse, op }) } else { sequentialMu.Lock() - result, isCriticalError := a.executeSingleTool(ctx, toolMap, req.toolCall, opts.OnToolResult) + result, isCriticalError := a.executeSingleTool(ctx, toolMap, execProviderToolMap, req.toolCall, opts.OnToolResult) toolStateMu.Lock() toolResults = append(toolResults, result) if isCriticalError && toolExecutionErr == nil { @@ -1389,7 +1458,7 @@ func (a *agent) processStepStream(ctx context.Context, stream StreamResponse, op delete(activeToolCalls, part.ID) } else { // Validate and potentially repair the tool call - validatedToolCall := a.validateAndRepairToolCall(ctx, toolCall, stepTools, a.settings.systemPrompt, nil, opts.RepairToolCall) + validatedToolCall := a.validateAndRepairToolCall(ctx, toolCall, stepTools, execProviderTools, a.settings.systemPrompt, nil, opts.RepairToolCall) stepToolCalls = append(stepToolCalls, validatedToolCall) stepContent = append(stepContent, validatedToolCall) diff --git a/agent_test.go b/agent_test.go index 929094b21899bb14f8a519e8d52a29e29040a355..8d45429d284e88dac4fa7e16f9793ff7a375314c 100644 --- a/agent_test.go +++ b/agent_test.go @@ -2,6 +2,7 @@ package fantasy import ( "context" + "encoding/base64" "encoding/json" "errors" "fmt" @@ -1717,7 +1718,7 @@ func TestAgent_MediaToolResponses(t *testing.T) { mediaResult, ok := toolResults[0].Result.(ToolResultOutputContentMedia) require.True(t, ok, "Expected media result") - require.Equal(t, string(imageData), mediaResult.Data) + require.Equal(t, base64.StdEncoding.EncodeToString(imageData), mediaResult.Data) require.Equal(t, "image/png", mediaResult.MediaType) }) @@ -1769,7 +1770,7 @@ func TestAgent_MediaToolResponses(t *testing.T) { mediaResult, ok := toolResults[0].Result.(ToolResultOutputContentMedia) require.True(t, ok, "Expected media result") - require.Equal(t, string(audioData), mediaResult.Data) + require.Equal(t, base64.StdEncoding.EncodeToString(audioData), mediaResult.Data) require.Equal(t, "audio/wav", mediaResult.MediaType) }) @@ -1823,7 +1824,7 @@ func TestAgent_MediaToolResponses(t *testing.T) { mediaResult, ok := toolResults[0].Result.(ToolResultOutputContentMedia) require.True(t, ok, "Expected media result") - require.Equal(t, string(imageData), mediaResult.Data) + require.Equal(t, base64.StdEncoding.EncodeToString(imageData), mediaResult.Data) require.Equal(t, "image/png", mediaResult.MediaType) require.Equal(t, "Screenshot captured successfully", mediaResult.Text) }) @@ -1971,3 +1972,459 @@ func TestToResponseMessages_ProviderExecutedRouting(t *testing.T) { require.Equal(t, "toolu_02", tr2.ToolCallID) require.False(t, tr2.ProviderExecuted) } + +// TestAgent_Generate_ExecutableProviderTool verifies that an +// ExecutableProviderTool registered via WithProviderDefinedTools is +// executed by the agent when the model returns a matching tool call. +func TestAgent_Generate_ExecutableProviderTool(t *testing.T) { + t.Parallel() + + runCalled := false + execTool := NewExecutableProviderTool( + ProviderDefinedTool{ + ID: "test.computer", + Name: "computer", + Args: map[string]any{"display_width_px": 1920}, + }, + func(ctx context.Context, call ToolCall) (ToolResponse, error) { + runCalled = true + return NewTextResponse("screenshot taken"), nil + }, + ) + + model := &mockLanguageModel{ + generateFunc: func(ctx context.Context, call Call) (*Response, error) { + return &Response{ + Content: []Content{ + ToolCallContent{ + ToolCallID: "call-1", + ToolName: "computer", + Input: `{"action":"screenshot"}`, + }, + }, + Usage: Usage{TotalTokens: 10}, + FinishReason: FinishReasonStop, + }, nil + }, + } + + agent := NewAgent(model, WithProviderDefinedTools(execTool)) + result, err := agent.Generate(context.Background(), AgentCall{ + Prompt: "take a screenshot", + }) + + require.NoError(t, err) + require.NotNil(t, result) + require.True(t, runCalled, "expected Run func to be called") + require.Len(t, result.Steps, 1) + + // Verify tool result is in the response. + var toolResults []ToolResultContent + for _, c := range result.Response.Content { + if tr, ok := AsContentType[ToolResultContent](c); ok { + toolResults = append(toolResults, tr) + } + } + require.Len(t, toolResults, 1) + require.Equal(t, "call-1", toolResults[0].ToolCallID) + require.Equal(t, "computer", toolResults[0].ToolName) + + textResult, ok := toolResults[0].Result.(ToolResultOutputContentText) + require.True(t, ok) + require.Equal(t, "screenshot taken", textResult.Text) +} + +// TestAgent_Generate_ExecutableProviderTool_ActiveTools verifies that +// active tool filtering works for ExecutableProviderTool. +func TestAgent_Generate_ExecutableProviderTool_ActiveTools(t *testing.T) { + t.Parallel() + + execTool := NewExecutableProviderTool( + ProviderDefinedTool{ + ID: "test.computer", + Name: "computer", + Args: map[string]any{"display_width_px": 1920}, + }, + func(ctx context.Context, call ToolCall) (ToolResponse, error) { + return NewTextResponse("ok"), nil + }, + ) + + model := &mockLanguageModel{ + generateFunc: func(ctx context.Context, call Call) (*Response, error) { + // With ActiveTools=["other"], computer should be filtered out. + require.Empty(t, call.Tools) + + return &Response{ + Content: []Content{TextContent{Text: "no tools"}}, + Usage: Usage{TotalTokens: 5}, + FinishReason: FinishReasonStop, + }, nil + }, + } + + agent := NewAgent(model, WithProviderDefinedTools(execTool)) + result, err := agent.Generate(context.Background(), AgentCall{ + Prompt: "test", + ActiveTools: []string{"other"}, + }) + + require.NoError(t, err) + require.NotNil(t, result) +} + +// TestAgent_Generate_ExecutableProviderTool_ActiveTools_Rejected +// verifies that a hallucinated tool call for an EPT excluded by +// activeTools is rejected at validation and execution time. +func TestAgent_Generate_ExecutableProviderTool_ActiveTools_Rejected(t *testing.T) { + t.Parallel() + + runCalled := false + execTool := NewExecutableProviderTool( + ProviderDefinedTool{ + ID: "test.computer", + Name: "computer", + Args: map[string]any{"display_width_px": 1920}, + }, + func(ctx context.Context, call ToolCall) (ToolResponse, error) { + runCalled = true + return NewTextResponse("ok"), nil + }, + ) + + callCount := 0 + model := &mockLanguageModel{ + generateFunc: func(ctx context.Context, call Call) (*Response, error) { + callCount++ + if callCount == 1 { + // Model hallucinates a call to the excluded tool. + return &Response{ + Content: []Content{ToolCallContent{ + ToolCallID: "call-1", + ToolName: "computer", + Input: `{"action":"screenshot"}`, + }}, + Usage: Usage{TotalTokens: 5}, + FinishReason: FinishReasonToolCalls, + }, nil + } + // Second call: model stops. + return &Response{ + Content: []Content{TextContent{Text: "done"}}, + Usage: Usage{TotalTokens: 3}, + FinishReason: FinishReasonStop, + }, nil + }, + } + + agent := NewAgent(model, WithProviderDefinedTools(execTool)) + result, err := agent.Generate(context.Background(), AgentCall{ + Prompt: "test", + ActiveTools: []string{"other"}, + }) + + require.NoError(t, err) + require.NotNil(t, result) + require.False(t, runCalled, "excluded EPT should not have been executed") + + // The tool call should have been marked invalid. + var foundInvalidToolResult bool + for _, step := range result.Steps { + for _, content := range step.Content { + if tr, ok := AsContentType[ToolResultContent](content); ok { + if errResult, ok := tr.Result.(ToolResultOutputContentError); ok { + require.Contains(t, errResult.Error.Error(), "tool not found") + foundInvalidToolResult = true + } + } + } + } + require.True(t, foundInvalidToolResult, "expected an error result for the excluded tool call") +} + +// TestAgent_Stream_ExecutableProviderTool verifies that an +// ExecutableProviderTool works through the Stream path. +func TestAgent_Stream_ExecutableProviderTool(t *testing.T) { + t.Parallel() + + runCalled := false + execTool := NewExecutableProviderTool( + ProviderDefinedTool{ + ID: "test.computer", + Name: "computer", + Args: map[string]any{"display_width_px": 1920}, + }, + func(ctx context.Context, call ToolCall) (ToolResponse, error) { + runCalled = true + return NewTextResponse("screenshot taken"), nil + }, + ) + + model := &mockLanguageModel{ + streamFunc: func(ctx context.Context, call Call) (StreamResponse, error) { + return func(yield func(StreamPart) bool) { + if !yield(StreamPart{ + Type: StreamPartTypeToolCall, + ID: "call-1", + ToolCallName: "computer", + ToolCallInput: `{"action":"screenshot"}`, + }) { + return + } + yield(StreamPart{ + Type: StreamPartTypeFinish, + FinishReason: FinishReasonStop, + Usage: Usage{TotalTokens: 10}, + }) + }, nil + }, + } + + agent := NewAgent(model, WithProviderDefinedTools(execTool)) + result, err := agent.Stream(context.Background(), AgentStreamCall{ + Prompt: "take a screenshot", + }) + + require.NoError(t, err) + require.NotNil(t, result) + require.True(t, runCalled, "expected Run func to be called") + require.Len(t, result.Steps, 1) + + // Verify tool result is in the step content. + var toolResults []ToolResultContent + for _, c := range result.Steps[0].Content { + if tr, ok := AsContentType[ToolResultContent](c); ok { + toolResults = append(toolResults, tr) + } + } + require.Len(t, toolResults, 1) + require.Equal(t, "call-1", toolResults[0].ToolCallID) +} + +// TestAgent_PrepareTools_ExecutableProviderTool verifies that +// prepareTools emits a ProviderDefinedTool (not a FunctionTool) when +// an ExecutableProviderTool is registered via WithProviderDefinedTools. +func TestAgent_PrepareTools_ExecutableProviderTool(t *testing.T) { + t.Parallel() + + execTool := NewExecutableProviderTool( + ProviderDefinedTool{ + ID: "test.computer", + Name: "computer", + Args: map[string]any{"display_width_px": 1920}, + }, + func(ctx context.Context, call ToolCall) (ToolResponse, error) { + return NewTextResponse("ok"), nil + }, + ) + + model := &mockLanguageModel{ + generateFunc: func(ctx context.Context, call Call) (*Response, error) { + // Verify the tool is emitted as a ProviderDefinedTool. + require.Len(t, call.Tools, 1) + pdt, ok := call.Tools[0].(ProviderDefinedTool) + require.True(t, ok, "expected ProviderDefinedTool, got %T", call.Tools[0]) + require.Equal(t, "computer", pdt.Name) + require.Equal(t, "test.computer", pdt.ID) + + return &Response{ + Content: []Content{TextContent{Text: "done"}}, + Usage: Usage{TotalTokens: 5}, + FinishReason: FinishReasonStop, + }, nil + }, + } + + agent := NewAgent(model, WithProviderDefinedTools(execTool)) + _, err := agent.Generate(context.Background(), AgentCall{ + Prompt: "test", + }) + require.NoError(t, err) +} + +// TestAgent_ValidateToolCall_ExecutableProviderTool verifies that +// schema validation is skipped for executable provider tools, but +// JSON parsing is still checked. +func TestAgent_ValidateToolCall_ExecutableProviderTool(t *testing.T) { + t.Parallel() + + execTool := NewExecutableProviderTool( + ProviderDefinedTool{ + ID: "test.computer", + Name: "computer", + }, + func(ctx context.Context, call ToolCall) (ToolResponse, error) { + return NewTextResponse("ok"), nil + }, + ) + + a := &agent{ + settings: agentSettings{ + executableProviderTools: []ExecutableProviderTool{execTool}, + }, + } + + // Valid JSON should pass even without required fields. + err := a.validateToolCall(ToolCallContent{ + ToolName: "computer", + Input: `{"action":"screenshot"}`, + }, []AgentTool{}, []ExecutableProviderTool{execTool}) + require.NoError(t, err) + + // Invalid JSON should still fail. + err = a.validateToolCall(ToolCallContent{ + ToolName: "computer", + Input: `not-json`, + }, []AgentTool{}, []ExecutableProviderTool{execTool}) + require.Error(t, err) + require.Contains(t, err.Error(), "invalid JSON") +} + +// TestAgent_WithProviderDefinedTools_BackwardCompat verifies that +// passing a plain ProviderDefinedTool to WithProviderDefinedTools +// still works (web search path). +func TestAgent_WithProviderDefinedTools_BackwardCompat(t *testing.T) { + t.Parallel() + + webSearch := ProviderDefinedTool{ + ID: "anthropic.web_search", + Name: "web_search", + Args: map[string]any{"max_results": 5}, + } + + model := &mockLanguageModel{ + generateFunc: func(ctx context.Context, call Call) (*Response, error) { + require.Len(t, call.Tools, 1) + pdt, ok := call.Tools[0].(ProviderDefinedTool) + require.True(t, ok, "expected ProviderDefinedTool, got %T", call.Tools[0]) + require.Equal(t, "web_search", pdt.Name) + require.Equal(t, "anthropic.web_search", pdt.ID) + + return &Response{ + Content: []Content{TextContent{Text: "search results"}}, + Usage: Usage{TotalTokens: 5}, + FinishReason: FinishReasonStop, + }, nil + }, + } + + agent := NewAgent(model, WithProviderDefinedTools(webSearch)) + result, err := agent.Generate(context.Background(), AgentCall{ + Prompt: "search for something", + }) + + require.NoError(t, err) + require.NotNil(t, result) + require.Equal(t, "search results", result.Response.Content.Text()) +} + +// TestAgent_Generate_ExecutableProviderTool_ImageBase64 verifies that +// image data returned by an ExecutableProviderTool's run function is +// base64-encoded when stored in ToolResultOutputContentMedia.Data. +func TestAgent_Generate_ExecutableProviderTool_ImageBase64(t *testing.T) { + t.Parallel() + + rawPNG := []byte{0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A} + + execTool := NewExecutableProviderTool( + ProviderDefinedTool{ + ID: "test.computer", + Name: "computer", + Args: map[string]any{"display_width_px": 1920}, + }, + func(ctx context.Context, call ToolCall) (ToolResponse, error) { + return NewImageResponse(rawPNG, "image/png"), nil + }, + ) + + callCount := 0 + model := &mockLanguageModel{ + generateFunc: func(ctx context.Context, call Call) (*Response, error) { + callCount++ + if callCount == 1 { + return &Response{ + Content: []Content{ + ToolCallContent{ + ToolCallID: "call-1", + ToolName: "computer", + Input: `{"action":"screenshot"}`, + }, + }, + Usage: Usage{TotalTokens: 10}, + FinishReason: FinishReasonToolCalls, + }, nil + } + return &Response{ + Content: []Content{TextContent{Text: "done"}}, + Usage: Usage{TotalTokens: 5}, + FinishReason: FinishReasonStop, + }, nil + }, + } + + agent := NewAgent(model, WithProviderDefinedTools(execTool)) + result, err := agent.Generate(context.Background(), AgentCall{ + Prompt: "take a screenshot", + }) + + require.NoError(t, err) + require.NotNil(t, result) + require.Len(t, result.Steps, 2) + + // The tool result in the first step must have base64-encoded data. + toolResults := result.Steps[0].Content.ToolResults() + require.Len(t, toolResults, 1) + + mediaResult, ok := toolResults[0].Result.(ToolResultOutputContentMedia) + require.True(t, ok, "expected media result") + require.Equal(t, base64.StdEncoding.EncodeToString(rawPNG), mediaResult.Data) + require.Equal(t, "image/png", mediaResult.MediaType) +} + +// TestAgent_Generate_ExecutableProviderTool_CriticalError verifies +// that a Go error returned from an ExecutableProviderTool's run +// function is treated as a critical error, stopping the agent loop. +func TestAgent_Generate_ExecutableProviderTool_CriticalError(t *testing.T) { + t.Parallel() + + execTool := NewExecutableProviderTool( + ProviderDefinedTool{ + ID: "test.computer", + Name: "computer", + Args: map[string]any{"display_width_px": 1920}, + }, + func(ctx context.Context, call ToolCall) (ToolResponse, error) { + return ToolResponse{}, fmt.Errorf("vnc connection lost") + }, + ) + + callCount := 0 + model := &mockLanguageModel{ + generateFunc: func(ctx context.Context, call Call) (*Response, error) { + callCount++ + return &Response{ + Content: []Content{ + ToolCallContent{ + ToolCallID: "call-1", + ToolName: "computer", + Input: `{"action":"screenshot"}`, + }, + }, + Usage: Usage{TotalTokens: 10}, + FinishReason: FinishReasonToolCalls, + }, nil + }, + } + + agent := NewAgent(model, WithProviderDefinedTools(execTool), WithStopConditions(StepCountIs(5))) + result, err := agent.Generate(context.Background(), AgentCall{ + Prompt: "take a screenshot", + }) + + require.NoError(t, err) + require.NotNil(t, result) + // The model should only be called once — the critical error stops + // the loop before a second model call. + require.Equal(t, 1, callCount) + require.Len(t, result.Steps, 1) +} diff --git a/content.go b/content.go index 9cbe67ab4f31555c6f0ccbb5667f78d9fd73ff2b..8787f7cd0bd9b44a35903c674ea0e3b09dda9e78 100644 --- a/content.go +++ b/content.go @@ -1,6 +1,9 @@ package fantasy -import "encoding/json" +import ( + "context" + "encoding/json" +) // ProviderOptionsData is an interface for provider-specific options data. // All implementations MUST also implement encoding/json.Marshaler and @@ -512,6 +515,16 @@ func (f FunctionTool) GetName() string { return f.Name } +// ProviderTool is a tool whose schema and wire format are defined by +// the model provider. Both pure provider-executed tools +// (ProviderDefinedTool) and client-executed provider tools +// (ExecutableProviderTool) implement this interface. The unexported +// method seals this interface to the types in this package. +// External packages should use NewExecutableProviderTool instead. +type ProviderTool interface { + providerDefinedTool() ProviderDefinedTool +} + // ProviderDefinedTool represents the configuration of a tool that is defined by the provider. type ProviderDefinedTool struct { // ID of the tool. Should follow the format `.`. @@ -532,6 +545,53 @@ func (p ProviderDefinedTool) GetName() string { return p.Name } +func (p ProviderDefinedTool) providerDefinedTool() ProviderDefinedTool { + return p +} + +// ExecutableProviderTool pairs a ProviderDefinedTool with a +// client-side execution function. Use this for provider-defined tools +// that require local execution (e.g. Anthropic computer use). Register +// it via WithProviderDefinedTools. +type ExecutableProviderTool struct { + pdt ProviderDefinedTool + run func(ctx context.Context, call ToolCall) (ToolResponse, error) +} + +func (e ExecutableProviderTool) providerDefinedTool() ProviderDefinedTool { + return e.pdt +} + +// GetType returns the type of the underlying ProviderDefinedTool. +func (e ExecutableProviderTool) GetType() ToolType { + return e.pdt.GetType() +} + +// GetName returns the name of the underlying ProviderDefinedTool. +func (e ExecutableProviderTool) GetName() string { + return e.pdt.GetName() +} + +// Definition returns the underlying ProviderDefinedTool. +func (e ExecutableProviderTool) Definition() ProviderDefinedTool { + return e.pdt +} + +// Run executes the tool's client-side function. +func (e ExecutableProviderTool) Run(ctx context.Context, call ToolCall) (ToolResponse, error) { + return e.run(ctx, call) +} + +// NewExecutableProviderTool creates a provider-defined tool with +// client-side execution. The tool is sent to the API using the +// provider's native wire format, but executed locally by run. +func NewExecutableProviderTool( + pdt ProviderDefinedTool, + run func(ctx context.Context, call ToolCall) (ToolResponse, error), +) ExecutableProviderTool { + return ExecutableProviderTool{pdt: pdt, run: run} +} + // NewUserMessage creates a new user message with the given prompt and optional files. func NewUserMessage(prompt string, files ...FilePart) Message { content := make([]MessagePart, 0, len(files)+1) diff --git a/examples/computer-use/main.go b/examples/computer-use/main.go new file mode 100644 index 0000000000000000000000000000000000000000..d36a7bdcdcb21e5c66faa468ab5b32dd4c91a08b --- /dev/null +++ b/examples/computer-use/main.go @@ -0,0 +1,103 @@ +package main + +// This example demonstrates Anthropic computer use with the agent +// helper. It shows how to: +// +// 1. Wire up the provider, model, and computer use tool. +// 2. Register the tool via WithProviderDefinedTools so the agent +// handles the tool-call loop automatically. +// 3. Parse incoming tool calls with ParseComputerUseInput inside +// the Run function. +// 4. Return results (screenshots, errors) back to the agent. + +import ( + "bytes" + "context" + "fmt" + "image" + "image/color" + "image/png" + "os" + + "charm.land/fantasy" + "charm.land/fantasy/providers/anthropic" +) + +// takeScreenshot is a stub that simulates capturing a screenshot. +// In a real implementation this would capture the virtual display +// and return raw PNG bytes. +func takeScreenshot() ([]byte, error) { + // Generate a valid 1x1 black PNG as a placeholder. + img := image.NewRGBA(image.Rect(0, 0, 1, 1)) + img.Set(0, 0, color.Black) + var buf bytes.Buffer + if err := png.Encode(&buf, img); err != nil { + return nil, err + } + return buf.Bytes(), nil +} + +func main() { + // Set up the Anthropic provider. + provider, err := anthropic.New(anthropic.WithAPIKey(os.Getenv("ANTHROPIC_API_KEY"))) + if err != nil { + fmt.Fprintln(os.Stderr, "could not create provider:", err) + os.Exit(1) + } + + ctx := context.Background() + + // Pick the model. + model, err := provider.LanguageModel(ctx, "claude-opus-4-6") + if err != nil { + fmt.Fprintln(os.Stderr, "could not get language model:", err) + os.Exit(1) + } + + // Create a computer use tool with a Run function that executes + // actions and returns screenshots. + computerTool := anthropic.NewComputerUseTool(anthropic.ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: anthropic.ComputerUse20251124, + }, func(ctx context.Context, call fantasy.ToolCall) (fantasy.ToolResponse, error) { + action, err := anthropic.ParseComputerUseInput(call.Input) + if err != nil { + return fantasy.ToolResponse{}, fmt.Errorf("parse computer use input: %w", err) + } + + fmt.Printf("Action: %s\n", action.Action) + + // In production you would execute the action (click, + // type, scroll, etc.) against the virtual display and + // then capture a screenshot. + png, err := takeScreenshot() + if err != nil { + return fantasy.ToolResponse{}, fmt.Errorf("take screenshot: %w", err) + } + return fantasy.NewImageResponse(png, "image/png"), nil + }) + + // Build an agent with the computer use tool. The agent handles + // the tool-call loop: it sends the prompt, executes any tool + // calls the model returns, feeds the results back, and repeats + // until the model stops requesting tools. + agent := fantasy.NewAgent(model, + fantasy.WithProviderDefinedTools(computerTool), + fantasy.WithStopConditions(fantasy.StepCountIs(10)), + ) + + result, err := agent.Generate(ctx, fantasy.AgentCall{ + Prompt: "Take a screenshot of the desktop", + }) + if err != nil { + fmt.Fprintln(os.Stderr, "agent error:", err) + os.Exit(1) + } + + fmt.Println("Agent finished.") + fmt.Printf("Steps: %d\n", len(result.Steps)) + if text := result.Response.Content.Text(); text != "" { + fmt.Println("Claude said:", text) + } +} diff --git a/providers/anthropic/anthropic.go b/providers/anthropic/anthropic.go index 330c1ac5fbd67daa694899f048f0d88ef77d63de..2f05b2f1289a12b3577357ed7c5a584ef17ca932 100644 --- a/providers/anthropic/anthropic.go +++ b/providers/anthropic/anthropic.go @@ -25,6 +25,37 @@ import ( "golang.org/x/oauth2/google" ) +// betaRequestOptions converts beta flag strings into request +// options that enable the corresponding Anthropic beta APIs. +func betaRequestOptions(flags []string) []option.RequestOption { + if len(flags) == 0 { + return nil + } + opts := []option.RequestOption{option.WithQuery("beta", "true")} + for _, flag := range flags { + opts = append(opts, option.WithHeaderAdd("anthropic-beta", flag)) + } + return opts +} + +// buildRequestOptions constructs the common request options shared +// by Generate and Stream: user-agent, raw tool injection, and any +// beta API flags. +func buildRequestOptions(call fantasy.Call, rawTools []json.RawMessage, betaFlags []string) []option.RequestOption { + reqOpts := callUARequestOptions(call) + if len(rawTools) > 0 { + // Tools are injected as raw JSON rather than via params.Tools + // because the SDK doesn't model beta tool types (e.g. computer + // use). If the SDK adds validation that reads params.Tools, + // this will need updating. + reqOpts = append(reqOpts, option.WithJSONSet("tools", rawTools)) + } + if len(betaFlags) > 0 { + reqOpts = append(reqOpts, betaRequestOptions(betaFlags)...) + } + return reqOpts +} + const ( // Name is the name of the Anthropic provider. Name = "anthropic" @@ -236,13 +267,19 @@ func (a languageModel) Provider() string { return a.provider } -func (a languageModel) prepareParams(call fantasy.Call) (*anthropic.MessageNewParams, []fantasy.CallWarning, error) { - params := &anthropic.MessageNewParams{} +func (a languageModel) prepareParams(call fantasy.Call) ( + params *anthropic.MessageNewParams, + rawTools []json.RawMessage, + warnings []fantasy.CallWarning, + betaFlags []string, + err error, +) { + params = &anthropic.MessageNewParams{} providerOptions := &ProviderOptions{} if v, ok := call.ProviderOptions[Name]; ok { providerOptions, ok = v.(*ProviderOptions) if !ok { - return nil, nil, &fantasy.Error{Title: "invalid argument", Message: "anthropic provider options should be *anthropic.ProviderOptions"} + return nil, nil, nil, nil, &fantasy.Error{Title: "invalid argument", Message: "anthropic provider options should be *anthropic.ProviderOptions"} } } sendReasoning := true @@ -293,7 +330,7 @@ func (a languageModel) prepareParams(call fantasy.Call) (*anthropic.MessageNewPa params.Thinking.OfAdaptive = &adaptive case providerOptions.Thinking != nil: if providerOptions.Thinking.BudgetTokens == 0 { - return nil, nil, &fantasy.Error{Title: "no budget", Message: "thinking requires budget"} + return nil, nil, nil, nil, &fantasy.Error{Title: "no budget", Message: "thinking requires budget"} } params.Thinking = anthropic.ThinkingConfigParamOfEnabled(providerOptions.Thinking.BudgetTokens) if call.Temperature != nil { @@ -327,15 +364,16 @@ func (a languageModel) prepareParams(call fantasy.Call) (*anthropic.MessageNewPa if providerOptions.DisableParallelToolUse != nil { disableParallelToolUse = *providerOptions.DisableParallelToolUse } - tools, toolChoice, toolWarnings := a.toTools(call.Tools, call.ToolChoice, disableParallelToolUse) - params.Tools = tools + var toolChoice *anthropic.ToolChoiceUnionParam + var toolWarnings []fantasy.CallWarning + rawTools, toolChoice, toolWarnings, betaFlags = a.toTools(call.Tools, call.ToolChoice, disableParallelToolUse) if toolChoice != nil { params.ToolChoice = *toolChoice } warnings = append(warnings, toolWarnings...) } - return params, warnings, nil + return params, rawTools, warnings, betaFlags, nil } func (a *provider) Name() string { @@ -447,6 +485,19 @@ func anyToStringSlice(v any) []string { const maxExactIntFloat64 = float64(1<<53 - 1) +// asProviderDefinedTool extracts the ProviderDefinedTool from a +// Tool, handling both ProviderDefinedTool and +// ExecutableProviderTool. +func asProviderDefinedTool(tool fantasy.Tool) (fantasy.ProviderDefinedTool, bool) { + if pdt, ok := tool.(fantasy.ProviderDefinedTool); ok { + return pdt, true + } + if ept, ok := tool.(fantasy.ExecutableProviderTool); ok { + return ept.Definition(), true + } + return fantasy.ProviderDefinedTool{}, false +} + func anyToInt64(v any) (int64, bool) { switch typed := v.(type) { case int: @@ -528,7 +579,7 @@ func anyToUserLocation(v any) *UserLocation { } } -func (a languageModel) toTools(tools []fantasy.Tool, toolChoice *fantasy.ToolChoice, disableParallelToolCalls bool) (anthropicTools []anthropic.ToolUnionParam, anthropicToolChoice *anthropic.ToolChoiceUnionParam, warnings []fantasy.CallWarning) { +func (a languageModel) toTools(tools []fantasy.Tool, toolChoice *fantasy.ToolChoice, disableParallelToolCalls bool) (rawTools []json.RawMessage, anthropicToolChoice *anthropic.ToolChoiceUnionParam, warnings []fantasy.CallWarning, betaFlags []string) { for _, tool := range tools { if tool.GetType() == fantasy.ToolTypeFunction { ft, ok := tool.(fantasy.FunctionTool) @@ -558,11 +609,20 @@ func (a languageModel) toTools(tools []fantasy.Tool, toolChoice *fantasy.ToolCho if cacheControl != nil { anthropicTool.CacheControl = anthropic.NewCacheControlEphemeralParam() } - anthropicTools = append(anthropicTools, anthropic.ToolUnionParam{OfTool: &anthropicTool}) + raw, err := json.Marshal(anthropic.ToolUnionParam{OfTool: &anthropicTool}) + if err != nil { + warnings = append(warnings, fantasy.CallWarning{ + Type: fantasy.CallWarningTypeOther, + Tool: tool, + Message: fmt.Sprintf("failed to marshal function tool: %v", err), + }) + continue + } + rawTools = append(rawTools, raw) continue } if tool.GetType() == fantasy.ToolTypeProviderDefined { - pt, ok := tool.(fantasy.ProviderDefinedTool) + pt, ok := asProviderDefinedTool(tool) if !ok { continue } @@ -596,11 +656,52 @@ func (a languageModel) toTools(tools []fantasy.Tool, toolChoice *fantasy.ToolCho webSearchTool.UserLocation = ulp } } - anthropicTools = append(anthropicTools, anthropic.ToolUnionParam{ + raw, err := json.Marshal(anthropic.ToolUnionParam{ OfWebSearchTool20250305: &webSearchTool, }) + if err != nil { + warnings = append(warnings, fantasy.CallWarning{ + Type: fantasy.CallWarningTypeOther, + Tool: tool, + Message: fmt.Sprintf("failed to marshal web search tool: %v", err), + }) + continue + } + rawTools = append(rawTools, raw) + continue + } + if IsComputerUseTool(tool) { + raw, err := computerUseToolJSON(pt) + if err != nil { + warnings = append(warnings, fantasy.CallWarning{ + Type: fantasy.CallWarningTypeOther, + Tool: tool, + Message: fmt.Sprintf("failed to build computer use tool: %v", err), + }) + continue + } + version, ok := getComputerUseVersion(pt) + if ok { + flag, err := computerUseBetaFlag(version) + if err != nil { + warnings = append(warnings, fantasy.CallWarning{ + Type: fantasy.CallWarningTypeOther, + Tool: tool, + Message: fmt.Sprintf("unsupported computer use version: %v", err), + }) + continue + } + betaFlags = append(betaFlags, flag) + } + rawTools = append(rawTools, raw) continue } + warnings = append(warnings, fantasy.CallWarning{ + Type: fantasy.CallWarningTypeUnsupportedTool, + Tool: tool, + Message: "tool is not supported", + }) + continue } warnings = append(warnings, fantasy.CallWarning{ Type: fantasy.CallWarningTypeUnsupportedTool, @@ -624,7 +725,7 @@ func (a languageModel) toTools(tools []fantasy.Tool, toolChoice *fantasy.ToolCho }, } } - return anthropicTools, anthropicToolChoice, warnings + return rawTools, anthropicToolChoice, warnings, betaFlags } switch *toolChoice { @@ -656,7 +757,7 @@ func (a languageModel) toTools(tools []fantasy.Tool, toolChoice *fantasy.ToolCho }, } } - return anthropicTools, anthropicToolChoice, warnings + return rawTools, anthropicToolChoice, warnings, betaFlags } func toPrompt(prompt fantasy.Prompt, sendReasoningData bool) ([]anthropic.TextBlockParam, []anthropic.MessageParam, []fantasy.CallWarning) { @@ -1005,11 +1106,13 @@ func mapFinishReason(finishReason string) fantasy.FinishReason { // Generate implements fantasy.LanguageModel. func (a languageModel) Generate(ctx context.Context, call fantasy.Call) (*fantasy.Response, error) { - params, warnings, err := a.prepareParams(call) + params, rawTools, warnings, betaFlags, err := a.prepareParams(call) if err != nil { return nil, err } - response, err := a.client.Messages.New(ctx, *params, callUARequestOptions(call)...) + reqOpts := buildRequestOptions(call, rawTools, betaFlags) + + response, err := a.client.Messages.New(ctx, *params, reqOpts...) if err != nil { return nil, toProviderErr(err) } @@ -1132,12 +1235,14 @@ func (a languageModel) Generate(ctx context.Context, call fantasy.Call) (*fantas // Stream implements fantasy.LanguageModel. func (a languageModel) Stream(ctx context.Context, call fantasy.Call) (fantasy.StreamResponse, error) { - params, warnings, err := a.prepareParams(call) + params, rawTools, warnings, betaFlags, err := a.prepareParams(call) if err != nil { return nil, err } - stream := a.client.Messages.NewStreaming(ctx, *params, callUARequestOptions(call)...) + reqOpts := buildRequestOptions(call, rawTools, betaFlags) + + stream := a.client.Messages.NewStreaming(ctx, *params, reqOpts...) acc := anthropic.Message{} return func(yield func(fantasy.StreamPart) bool) { if len(warnings) > 0 { diff --git a/providers/anthropic/anthropic_test.go b/providers/anthropic/anthropic_test.go index 5f137a78be7a0b2f6fcfbad8403474ac5ece376a..4387a34faf7fe8900383e26859de13d306505664 100644 --- a/providers/anthropic/anthropic_test.go +++ b/providers/anthropic/anthropic_test.go @@ -8,6 +8,7 @@ import ( "math" "net/http" "net/http/httptest" + "strings" "testing" "time" @@ -16,6 +17,12 @@ import ( "github.com/stretchr/testify/require" ) +// noopComputerRun is a no-op run function for tests that only need +// to inspect the tool definition, not execute it. +var noopComputerRun = func(_ context.Context, _ fantasy.ToolCall) (fantasy.ToolResponse, error) { + return fantasy.ToolResponse{}, nil +} + func TestToPrompt_DropsEmptyMessages(t *testing.T) { t.Parallel() @@ -1364,3 +1371,920 @@ func TestGenerate_ToolChoiceNone(t *testing.T) { require.True(t, ok, "request body should have tool_choice") require.Equal(t, "none", toolChoice["type"], "tool_choice should be 'none'") } + +// --- Computer Use Tests --- + +// jsonRoundTripTool simulates a JSON round-trip on a +// ProviderDefinedTool so that its Args map contains float64 +// values (as json.Unmarshal produces) rather than the int64 +// values that NewComputerUseTool stores directly. The +// production toBetaTools code asserts float64. +func jsonRoundTripTool(t *testing.T, tool fantasy.ExecutableProviderTool) fantasy.ProviderDefinedTool { + t.Helper() + pdt := tool.Definition() + data, err := json.Marshal(pdt.Args) + require.NoError(t, err) + var args map[string]any + require.NoError(t, json.Unmarshal(data, &args)) + pdt.Args = args + return pdt +} + +func TestNewComputerUseTool(t *testing.T) { + t.Parallel() + + t.Run("creates tool with correct ID and name", func(t *testing.T) { + t.Parallel() + tool := NewComputerUseTool(ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: ComputerUse20250124, + }, noopComputerRun).Definition() + require.Equal(t, "anthropic.computer", tool.ID) + require.Equal(t, "computer", tool.Name) + require.Equal(t, int64(1920), tool.Args["display_width_px"]) + require.Equal(t, int64(1080), tool.Args["display_height_px"]) + require.Equal(t, string(ComputerUse20250124), tool.Args["tool_version"]) + }) + + t.Run("includes optional fields when set", func(t *testing.T) { + t.Parallel() + displayNum := int64(1) + enableZoom := true + tool := NewComputerUseTool(ComputerUseToolOptions{ + DisplayWidthPx: 1024, + DisplayHeightPx: 768, + DisplayNumber: &displayNum, + EnableZoom: &enableZoom, + ToolVersion: ComputerUse20251124, + CacheControl: &CacheControl{Type: "ephemeral"}, + }, noopComputerRun).Definition() + require.Equal(t, int64(1), tool.Args["display_number"]) + require.Equal(t, true, tool.Args["enable_zoom"]) + require.NotNil(t, tool.Args["cache_control"]) + }) + + t.Run("omits optional fields when nil", func(t *testing.T) { + t.Parallel() + tool := NewComputerUseTool(ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: ComputerUse20250124, + }, noopComputerRun).Definition() + _, hasDisplayNum := tool.Args["display_number"] + _, hasEnableZoom := tool.Args["enable_zoom"] + _, hasCacheControl := tool.Args["cache_control"] + require.False(t, hasDisplayNum) + require.False(t, hasEnableZoom) + require.False(t, hasCacheControl) + }) +} + +func TestIsComputerUseTool(t *testing.T) { + t.Parallel() + + t.Run("returns true for computer use tool", func(t *testing.T) { + t.Parallel() + tool := NewComputerUseTool(ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: ComputerUse20250124, + }, noopComputerRun) + require.True(t, IsComputerUseTool(tool.Definition())) + }) + + t.Run("returns false for function tool", func(t *testing.T) { + t.Parallel() + tool := fantasy.FunctionTool{ + Name: "test", + Description: "test tool", + } + require.False(t, IsComputerUseTool(tool)) + }) + + t.Run("returns false for other provider defined tool", func(t *testing.T) { + t.Parallel() + tool := fantasy.ProviderDefinedTool{ + ID: "other.tool", + Name: "other", + } + require.False(t, IsComputerUseTool(tool)) + }) +} + +func TestNeedsBetaAPI(t *testing.T) { + t.Parallel() + + lm := languageModel{options: options{}} + + t.Run("returns false for empty tools", func(t *testing.T) { + t.Parallel() + _, _, _, betaFlags := lm.toTools(nil, nil, false) + require.Empty(t, betaFlags) + _, _, _, betaFlags = lm.toTools([]fantasy.Tool{}, nil, false) + require.Empty(t, betaFlags) + }) + + t.Run("returns false for only function tools", func(t *testing.T) { + t.Parallel() + tools := []fantasy.Tool{ + fantasy.FunctionTool{Name: "test"}, + } + _, _, _, betaFlags := lm.toTools(tools, nil, false) + require.Empty(t, betaFlags) + }) + + t.Run("returns beta flags when computer use tool present", func(t *testing.T) { + t.Parallel() + cuTool := jsonRoundTripTool(t, NewComputerUseTool(ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: ComputerUse20250124, + }, noopComputerRun)) + tools := []fantasy.Tool{ + fantasy.FunctionTool{Name: "test"}, + cuTool, + } + _, _, _, betaFlags := lm.toTools(tools, nil, false) + require.NotEmpty(t, betaFlags) + }) +} + +func TestComputerUseToolJSON(t *testing.T) { + t.Parallel() + + t.Run("builds JSON for version 20250124", func(t *testing.T) { + t.Parallel() + cuTool := jsonRoundTripTool(t, NewComputerUseTool(ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: ComputerUse20250124, + }, noopComputerRun)) + data, err := computerUseToolJSON(cuTool) + require.NoError(t, err) + var m map[string]any + require.NoError(t, json.Unmarshal(data, &m)) + require.Equal(t, "computer_20250124", m["type"]) + require.Equal(t, "computer", m["name"]) + require.InDelta(t, 1920, m["display_width_px"], 0) + require.InDelta(t, 1080, m["display_height_px"], 0) + }) + + t.Run("builds JSON for version 20251124 with enable_zoom", func(t *testing.T) { + t.Parallel() + enableZoom := true + cuTool := jsonRoundTripTool(t, NewComputerUseTool(ComputerUseToolOptions{ + DisplayWidthPx: 1024, + DisplayHeightPx: 768, + EnableZoom: &enableZoom, + ToolVersion: ComputerUse20251124, + }, noopComputerRun)) + data, err := computerUseToolJSON(cuTool) + require.NoError(t, err) + var m map[string]any + require.NoError(t, json.Unmarshal(data, &m)) + require.Equal(t, "computer_20251124", m["type"]) + require.Equal(t, true, m["enable_zoom"]) + }) + + t.Run("handles int64 args without JSON round-trip", func(t *testing.T) { + t.Parallel() + // Direct construction stores int64 values. + cuTool := NewComputerUseTool(ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: ComputerUse20250124, + }, noopComputerRun) + data, err := computerUseToolJSON(cuTool.Definition()) + require.NoError(t, err) + var m map[string]any + require.NoError(t, json.Unmarshal(data, &m)) + require.InDelta(t, 1920, m["display_width_px"], 0) + }) + + t.Run("returns error when version is missing", func(t *testing.T) { + t.Parallel() + pdt := fantasy.ProviderDefinedTool{ + ID: "anthropic.computer", + Name: "computer", + Args: map[string]any{ + "display_width_px": float64(1920), + "display_height_px": float64(1080), + }, + } + _, err := computerUseToolJSON(pdt) + require.Error(t, err) + require.Contains(t, err.Error(), "tool_version arg is missing") }) + + t.Run("returns error for unsupported version", func(t *testing.T) { + t.Parallel() + pdt := fantasy.ProviderDefinedTool{ + ID: "anthropic.computer", + Name: "computer", + Args: map[string]any{ + "display_width_px": float64(1920), + "display_height_px": float64(1080), + "tool_version": "computer_99991231", + }, + } + _, err := computerUseToolJSON(pdt) + require.Error(t, err) + require.Contains(t, err.Error(), "unsupported") + }) +} + +func TestParseComputerUseInput_CoordinateValidation(t *testing.T) { + t.Parallel() + + t.Run("rejects coordinate with 1 element", func(t *testing.T) { + t.Parallel() + _, err := ParseComputerUseInput(`{"action":"left_click","coordinate":[100]}`) + require.Error(t, err) + require.Contains(t, err.Error(), "coordinate") + }) + + t.Run("rejects coordinate with 3 elements", func(t *testing.T) { + t.Parallel() + _, err := ParseComputerUseInput(`{"action":"left_click","coordinate":[100,200,300]}`) + require.Error(t, err) + require.Contains(t, err.Error(), "coordinate") + }) + + t.Run("rejects start_coordinate with 1 element", func(t *testing.T) { + t.Parallel() + _, err := ParseComputerUseInput(`{"action":"left_click_drag","coordinate":[100,200],"start_coordinate":[50]}`) + require.Error(t, err) + require.Contains(t, err.Error(), "start_coordinate") + }) + + t.Run("rejects region with 3 elements", func(t *testing.T) { + t.Parallel() + _, err := ParseComputerUseInput(`{"action":"zoom","region":[10,20,30]}`) + require.Error(t, err) + require.Contains(t, err.Error(), "region") + }) + + t.Run("accepts valid coordinate", func(t *testing.T) { + t.Parallel() + result, err := ParseComputerUseInput(`{"action":"left_click","coordinate":[100,200]}`) + require.NoError(t, err) + require.Equal(t, [2]int64{100, 200}, result.Coordinate) + }) + + t.Run("accepts absent optional arrays", func(t *testing.T) { + t.Parallel() + result, err := ParseComputerUseInput(`{"action":"screenshot"}`) + require.NoError(t, err) + require.Equal(t, ActionScreenshot, result.Action) + }) +} + +func TestToTools_RawJSON(t *testing.T) { + t.Parallel() + + lm := languageModel{options: options{}} + + cuTool := jsonRoundTripTool(t, NewComputerUseTool(ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: ComputerUse20250124, + }, noopComputerRun)) + + tools := []fantasy.Tool{ + fantasy.FunctionTool{ + Name: "weather", + Description: "Get weather", + InputSchema: map[string]any{ + "properties": map[string]any{ + "location": map[string]any{"type": "string"}, + }, + "required": []string{"location"}, + }, + }, + WebSearchTool(nil), + cuTool, + } + + rawTools, toolChoice, warnings, betaFlags := lm.toTools(tools, nil, false) + + require.Len(t, rawTools, 3) + require.Nil(t, toolChoice) + require.Empty(t, warnings) + require.NotEmpty(t, betaFlags) + + // Verify each raw tool is valid JSON. + for i, raw := range rawTools { + var m map[string]any + require.NoError(t, json.Unmarshal(raw, &m), "tool %d should be valid JSON", i) + } + + // Check function tool. + var funcTool map[string]any + require.NoError(t, json.Unmarshal(rawTools[0], &funcTool)) + require.Equal(t, "weather", funcTool["name"]) + + // Check web search tool. + var webTool map[string]any + require.NoError(t, json.Unmarshal(rawTools[1], &webTool)) + require.Equal(t, "web_search_20250305", webTool["type"]) + + // Check computer use tool. + var cuToolJSON map[string]any + require.NoError(t, json.Unmarshal(rawTools[2], &cuToolJSON)) + require.Equal(t, "computer_20250124", cuToolJSON["type"]) + require.Equal(t, "computer", cuToolJSON["name"]) +} + +func TestGenerate_BetaAPI(t *testing.T) { + t.Parallel() + + t.Run("sends beta header for computer use", func(t *testing.T) { + t.Parallel() + + var capturedHeaders http.Header + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + capturedHeaders = r.Header.Clone() + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(mockAnthropicGenerateResponse()) + })) + defer server.Close() + + provider, err := New( + WithAPIKey("test-api-key"), + WithBaseURL(server.URL), + ) + require.NoError(t, err) + + model, err := provider.LanguageModel(context.Background(), "claude-sonnet-4-20250514") + require.NoError(t, err) + + cuTool := jsonRoundTripTool(t, NewComputerUseTool(ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: ComputerUse20250124, + }, noopComputerRun)) + + _, err = model.Generate(context.Background(), fantasy.Call{ + Prompt: testPrompt(), + Tools: []fantasy.Tool{cuTool}, + }) + require.NoError(t, err) + require.Contains(t, capturedHeaders.Get("Anthropic-Beta"), "computer-use-2025-01-24") + }) + + t.Run("sends beta header for computer use 20251124", func(t *testing.T) { + t.Parallel() + + var capturedHeaders http.Header + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + capturedHeaders = r.Header.Clone() + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(mockAnthropicGenerateResponse()) + })) + defer server.Close() + + provider, err := New( + WithAPIKey("test-api-key"), + WithBaseURL(server.URL), + ) + require.NoError(t, err) + + model, err := provider.LanguageModel(context.Background(), "claude-sonnet-4-20250514") + require.NoError(t, err) + + cuTool := jsonRoundTripTool(t, NewComputerUseTool(ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: ComputerUse20251124, + }, noopComputerRun)) + + _, err = model.Generate(context.Background(), fantasy.Call{ + Prompt: testPrompt(), + Tools: []fantasy.Tool{cuTool}, + }) + require.NoError(t, err) + require.Contains(t, capturedHeaders.Get("Anthropic-Beta"), "computer-use-2025-11-24") + }) + + t.Run("returns tool use from beta response", func(t *testing.T) { + t.Parallel() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(map[string]any{ + "id": "msg_01Test", + "type": "message", + "role": "assistant", + "model": "claude-sonnet-4-20250514", + "content": []any{ + map[string]any{ + "type": "tool_use", + "id": "toolu_01", + "name": "computer", + "input": map[string]any{"action": "screenshot"}, + }, + }, + "stop_reason": "tool_use", + "usage": map[string]any{ + "input_tokens": 10, + "output_tokens": 5, + "cache_creation": map[string]any{ + "ephemeral_1h_input_tokens": 0, + "ephemeral_5m_input_tokens": 0, + }, + "cache_creation_input_tokens": 0, + "cache_read_input_tokens": 0, + "server_tool_use": map[string]any{ + "web_search_requests": 0, + }, + "service_tier": "standard", + }, + }) + })) + defer server.Close() + + provider, err := New( + WithAPIKey("test-api-key"), + WithBaseURL(server.URL), + ) + require.NoError(t, err) + + model, err := provider.LanguageModel(context.Background(), "claude-sonnet-4-20250514") + require.NoError(t, err) + + cuTool := jsonRoundTripTool(t, NewComputerUseTool(ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: ComputerUse20250124, + }, noopComputerRun)) + + resp, err := model.Generate(context.Background(), fantasy.Call{ + Prompt: testPrompt(), + Tools: []fantasy.Tool{cuTool}, + }) + require.NoError(t, err) + + toolCalls := resp.Content.ToolCalls() + require.Len(t, toolCalls, 1) + require.Equal(t, "computer", toolCalls[0].ToolName) + require.Equal(t, "toolu_01", toolCalls[0].ToolCallID) + require.Contains(t, toolCalls[0].Input, "screenshot") + require.Equal(t, fantasy.FinishReasonToolCalls, resp.FinishReason) + + // Verify typed parsing works on the tool call input. + parsed, err := ParseComputerUseInput(toolCalls[0].Input) + require.NoError(t, err) + require.Equal(t, ActionScreenshot, parsed.Action) + }) +} + +func TestStream_BetaAPI(t *testing.T) { + t.Parallel() + + t.Run("streams via beta API for computer use", func(t *testing.T) { + t.Parallel() + + var capturedHeaders http.Header + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + capturedHeaders = r.Header.Clone() + w.Header().Set("Content-Type", "text/event-stream") + w.Header().Set("Cache-Control", "no-cache") + w.WriteHeader(http.StatusOK) + chunks := []string{ + "event: message_start\n", + "data: {\"type\":\"message_start\",\"message\":{}}\n\n", + "event: message_stop\n", + "data: {\"type\":\"message_stop\"}\n\n", + } + for _, chunk := range chunks { + _, _ = fmt.Fprint(w, chunk) + if flusher, ok := w.(http.Flusher); ok { + flusher.Flush() + } + } + })) + defer server.Close() + + provider, err := New( + WithAPIKey("test-api-key"), + WithBaseURL(server.URL), + ) + require.NoError(t, err) + + model, err := provider.LanguageModel(context.Background(), "claude-sonnet-4-20250514") + require.NoError(t, err) + + cuTool := jsonRoundTripTool(t, NewComputerUseTool(ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: ComputerUse20250124, + }, noopComputerRun)) + + stream, err := model.Stream(context.Background(), fantasy.Call{ + Prompt: testPrompt(), + Tools: []fantasy.Tool{cuTool}, + }) + require.NoError(t, err) + + stream(func(fantasy.StreamPart) bool { return true }) + + require.Contains(t, capturedHeaders.Get("Anthropic-Beta"), "computer-use-2025-01-24") + }) + + t.Run("streams via beta API for computer use 20251124", func(t *testing.T) { + t.Parallel() + + var capturedHeaders http.Header + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + capturedHeaders = r.Header.Clone() + w.Header().Set("Content-Type", "text/event-stream") + w.Header().Set("Cache-Control", "no-cache") + w.WriteHeader(http.StatusOK) + chunks := []string{ + "event: message_start\n", + "data: {\"type\":\"message_start\",\"message\":{}}\n\n", + "event: message_stop\n", + "data: {\"type\":\"message_stop\"}\n\n", + } + for _, chunk := range chunks { + _, _ = fmt.Fprint(w, chunk) + if flusher, ok := w.(http.Flusher); ok { + flusher.Flush() + } + } + })) + defer server.Close() + + provider, err := New( + WithAPIKey("test-api-key"), + WithBaseURL(server.URL), + ) + require.NoError(t, err) + + model, err := provider.LanguageModel(context.Background(), "claude-sonnet-4-20250514") + require.NoError(t, err) + + cuTool := jsonRoundTripTool(t, NewComputerUseTool(ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: ComputerUse20251124, + }, noopComputerRun)) + + stream, err := model.Stream(context.Background(), fantasy.Call{ + Prompt: testPrompt(), + Tools: []fantasy.Tool{cuTool}, + }) + require.NoError(t, err) + + stream(func(fantasy.StreamPart) bool { return true }) + + require.Contains(t, capturedHeaders.Get("Anthropic-Beta"), "computer-use-2025-11-24") + }) +} + +// TestGenerate_ComputerUseTool runs a multi-turn computer use session +// via model.Generate, passing the ExecutableProviderTool directly into +// Call.Tools (no .Definition(), no jsonRoundTripTool). The mock server +// walks through a scripted sequence of actions — screenshot, click, +// type, key, scroll — then finishes with a text reply. Each turn the +// test parses the tool call, builds a screenshot result, and appends +// both to the prompt for the next request. +func TestGenerate_ComputerUseTool(t *testing.T) { + t.Parallel() + + type actionStep struct { + input map[string]any + want ComputerUseInput + } + steps := []actionStep{ + { + input: map[string]any{"action": "screenshot"}, + want: ComputerUseInput{Action: ActionScreenshot}, + }, + { + input: map[string]any{"action": "left_click", "coordinate": []any{100, 200}}, + want: ComputerUseInput{Action: ActionLeftClick, Coordinate: [2]int64{100, 200}}, + }, + { + input: map[string]any{"action": "type", "text": "hello world"}, + want: ComputerUseInput{Action: ActionType, Text: "hello world"}, + }, + { + input: map[string]any{"action": "key", "text": "Return"}, + want: ComputerUseInput{Action: ActionKey, Text: "Return"}, + }, + { + input: map[string]any{ + "action": "scroll", + "coordinate": []any{500, 300}, + "scroll_direction": "down", + "scroll_amount": 3, + }, + want: ComputerUseInput{ + Action: ActionScroll, + Coordinate: [2]int64{500, 300}, + ScrollDirection: "down", + ScrollAmount: 3, + }, + }, + { + input: map[string]any{"action": "screenshot"}, + want: ComputerUseInput{Action: ActionScreenshot}, + }, + } + + var ( + requestIdx int + betaHeaders []string + ) + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + betaHeaders = append(betaHeaders, r.Header.Get("Anthropic-Beta")) + idx := requestIdx + requestIdx++ + + w.Header().Set("Content-Type", "application/json") + if idx < len(steps) { + _ = json.NewEncoder(w).Encode(map[string]any{ + "id": fmt.Sprintf("msg_%02d", idx), + "type": "message", + "role": "assistant", + "model": "claude-sonnet-4-20250514", + "content": []any{map[string]any{ + "type": "tool_use", + "id": fmt.Sprintf("toolu_%02d", idx), + "name": "computer", + "input": steps[idx].input, + }}, + "stop_reason": "tool_use", + "usage": map[string]any{"input_tokens": 10, "output_tokens": 5}, + }) + return + } + _ = json.NewEncoder(w).Encode(map[string]any{ + "id": "msg_final", + "type": "message", + "role": "assistant", + "model": "claude-sonnet-4-20250514", + "content": []any{map[string]any{ + "type": "text", + "text": "Done! I have completed all the requested actions.", + }}, + "stop_reason": "end_turn", + "usage": map[string]any{"input_tokens": 10, "output_tokens": 15}, + }) + })) + defer server.Close() + + provider, err := New(WithAPIKey("test-api-key"), WithBaseURL(server.URL)) + require.NoError(t, err) + + model, err := provider.LanguageModel(context.Background(), "claude-sonnet-4-20250514") + require.NoError(t, err) + + // Pass the ExecutableProviderTool directly — the whole point is + // to verify that the Tool interface works without unwrapping. + cuTool := NewComputerUseTool(ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: ComputerUse20250124, + }, noopComputerRun) + + var got []ComputerUseInput + prompt := testPrompt() + fakePNG := []byte("fake-screenshot-png") + + for turn := 0; turn <= len(steps); turn++ { + resp, err := model.Generate(context.Background(), fantasy.Call{ + Prompt: prompt, + Tools: []fantasy.Tool{cuTool}, + }) + require.NoError(t, err, "turn %d", turn) + + if resp.FinishReason != fantasy.FinishReasonToolCalls { + require.Equal(t, fantasy.FinishReasonStop, resp.FinishReason) + require.Contains(t, resp.Content.Text(), "Done") + break + } + + toolCalls := resp.Content.ToolCalls() + require.Len(t, toolCalls, 1, "turn %d", turn) + require.Equal(t, "computer", toolCalls[0].ToolName, "turn %d", turn) + + parsed, err := ParseComputerUseInput(toolCalls[0].Input) + require.NoError(t, err, "turn %d", turn) + got = append(got, parsed) + + // Build the next prompt: append the assistant tool-call turn + // and the user screenshot-result turn. + prompt = append(prompt, + fantasy.Message{ + Role: fantasy.MessageRoleAssistant, + Content: []fantasy.MessagePart{ + fantasy.ToolCallPart{ + ToolCallID: toolCalls[0].ToolCallID, + ToolName: toolCalls[0].ToolName, + Input: toolCalls[0].Input, + }, + }, + }, + fantasy.Message{ + // Use MessageRoleTool for tool results — this matches + // what the agent loop produces. + Role: fantasy.MessageRoleTool, + Content: []fantasy.MessagePart{ + NewComputerUseScreenshotResult(toolCalls[0].ToolCallID, fakePNG), + }, + }, + ) + } + + // Every scripted action was received and parsed correctly. + require.Len(t, got, len(steps)) + for i, step := range steps { + require.Equal(t, step.want.Action, got[i].Action, "step %d", i) + require.Equal(t, step.want.Coordinate, got[i].Coordinate, "step %d", i) + require.Equal(t, step.want.Text, got[i].Text, "step %d", i) + require.Equal(t, step.want.ScrollDirection, got[i].ScrollDirection, "step %d", i) + require.Equal(t, step.want.ScrollAmount, got[i].ScrollAmount, "step %d", i) + } + + // Beta header was sent on every request. + require.Len(t, betaHeaders, len(steps)+1) + for i, h := range betaHeaders { + require.Contains(t, h, "computer-use-2025-01-24", "request %d", i) + } +} + +// TestStream_ComputerUseTool runs a multi-turn computer use session +// via model.Stream, verifying that the ExecutableProviderTool works +// through the streaming path end-to-end. +func TestStream_ComputerUseTool(t *testing.T) { + t.Parallel() + + type streamStep struct { + input map[string]any + wantAction ComputerAction + } + steps := []streamStep{ + {input: map[string]any{"action": "screenshot"}, wantAction: ActionScreenshot}, + {input: map[string]any{"action": "left_click", "coordinate": []any{150, 250}}, wantAction: ActionLeftClick}, + {input: map[string]any{"action": "type", "text": "search query"}, wantAction: ActionType}, + } + + var ( + requestIdx int + betaHeaders []string + ) + + // streamToolUseChunks returns SSE chunks for a single + // computer-use tool_use content block. + streamToolUseChunks := func(id string, input map[string]any) []string { + inputJSON, _ := json.Marshal(input) + escaped := strings.ReplaceAll(string(inputJSON), `"`, `\"`) + return []string{ + "event: message_start\n", + `data: {"type":"message_start","message":{"id":"` + id + `","type":"message","role":"assistant","content":[],"model":"claude-sonnet-4-20250514","stop_reason":null,"usage":{"input_tokens":10,"output_tokens":0}}}` + "\n\n", + "event: content_block_start\n", + `data: {"type":"content_block_start","index":0,"content_block":{"type":"tool_use","id":"` + id + `","name":"computer","input":{}}}` + "\n\n", + "event: content_block_delta\n", + `data: {"type":"content_block_delta","index":0,"delta":{"type":"input_json_delta","partial_json":"` + escaped + `"}}` + "\n\n", + "event: content_block_stop\n", + `data: {"type":"content_block_stop","index":0}` + "\n\n", + "event: message_delta\n", + `data: {"type":"message_delta","delta":{"stop_reason":"tool_use"},"usage":{"output_tokens":5}}` + "\n\n", + "event: message_stop\n", + `data: {"type":"message_stop"}` + "\n\n", + } + } + + streamTextChunks := func() []string { + return []string{ + "event: message_start\n", + `data: {"type":"message_start","message":{"id":"msg_final","type":"message","role":"assistant","content":[],"model":"claude-sonnet-4-20250514","stop_reason":null,"usage":{"input_tokens":10,"output_tokens":0}}}` + "\n\n", + "event: content_block_start\n", + `data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""}}` + "\n\n", + "event: content_block_delta\n", + `data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"All done."}}` + "\n\n", + "event: content_block_stop\n", + `data: {"type":"content_block_stop","index":0}` + "\n\n", + "event: message_delta\n", + `data: {"type":"message_delta","delta":{"stop_reason":"end_turn"},"usage":{"output_tokens":10}}` + "\n\n", + "event: message_stop\n", + `data: {"type":"message_stop"}` + "\n\n", + } + } + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + betaHeaders = append(betaHeaders, r.Header.Get("Anthropic-Beta")) + idx := requestIdx + requestIdx++ + + w.Header().Set("Content-Type", "text/event-stream") + w.Header().Set("Cache-Control", "no-cache") + w.WriteHeader(http.StatusOK) + + var chunks []string + if idx < len(steps) { + chunks = streamToolUseChunks( + fmt.Sprintf("toolu_%02d", idx), + steps[idx].input, + ) + } else { + chunks = streamTextChunks() + } + for _, chunk := range chunks { + _, _ = fmt.Fprint(w, chunk) + if f, ok := w.(http.Flusher); ok { + f.Flush() + } + } + })) + defer server.Close() + + provider, err := New(WithAPIKey("test-api-key"), WithBaseURL(server.URL)) + require.NoError(t, err) + + model, err := provider.LanguageModel(context.Background(), "claude-sonnet-4-20250514") + require.NoError(t, err) + + cuTool := NewComputerUseTool(ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: ComputerUse20250124, + }, noopComputerRun) + + var gotActions []ComputerAction + prompt := testPrompt() + fakePNG := []byte("fake-screenshot-png") + + for turn := 0; turn <= len(steps); turn++ { + stream, err := model.Stream(context.Background(), fantasy.Call{ + Prompt: prompt, + Tools: []fantasy.Tool{cuTool}, + }) + require.NoError(t, err, "turn %d", turn) + + var ( + toolCallName string + toolCallID string + toolCallInput string + finishReason fantasy.FinishReason + gotText string + ) + stream(func(part fantasy.StreamPart) bool { + switch part.Type { + case fantasy.StreamPartTypeToolCall: + toolCallName = part.ToolCallName + toolCallID = part.ID + toolCallInput = part.ToolCallInput + case fantasy.StreamPartTypeFinish: + finishReason = part.FinishReason + case fantasy.StreamPartTypeTextDelta: + gotText += part.Delta + } + return true + }) + + if finishReason != fantasy.FinishReasonToolCalls { + require.Contains(t, gotText, "All done") + break + } + + require.Equal(t, "computer", toolCallName, "turn %d", turn) + + parsed, err := ParseComputerUseInput(toolCallInput) + require.NoError(t, err, "turn %d", turn) + gotActions = append(gotActions, parsed.Action) + + prompt = append(prompt, + fantasy.Message{ + Role: fantasy.MessageRoleAssistant, + Content: []fantasy.MessagePart{ + fantasy.ToolCallPart{ + ToolCallID: toolCallID, + ToolName: toolCallName, + Input: toolCallInput, + }, + }, + }, + fantasy.Message{ + // Use MessageRoleTool for tool results — this matches + // what the agent loop produces. + Role: fantasy.MessageRoleTool, + Content: []fantasy.MessagePart{ + NewComputerUseScreenshotResult(toolCallID, fakePNG), + }, + }, + ) + } + + require.Len(t, gotActions, len(steps)) + for i, step := range steps { + require.Equal(t, step.wantAction, gotActions[i], "step %d", i) + } + + require.Len(t, betaHeaders, len(steps)+1) + for i, h := range betaHeaders { + require.Contains(t, h, "computer-use-2025-01-24", "request %d", i) + } +} diff --git a/providers/anthropic/computer_use.go b/providers/anthropic/computer_use.go new file mode 100644 index 0000000000000000000000000000000000000000..f4526e9aeed4f2d07f651caab192aef36bd0a591 --- /dev/null +++ b/providers/anthropic/computer_use.go @@ -0,0 +1,427 @@ +package anthropic + +import ( + "context" + "encoding/base64" + "encoding/json" + "fmt" + + "charm.land/fantasy" + anthropicsdk "github.com/charmbracelet/anthropic-sdk-go" + "github.com/charmbracelet/anthropic-sdk-go/packages/param" +) + +// computerUseToolID is the canonical identifier for +// Anthropic computer use tools. It follows the +// . convention used by ProviderDefinedTool.ID. +const computerUseToolID = "anthropic.computer" + +// computerUseAPIName is the tool name Anthropic's API expects +// on the wire. +const computerUseAPIName = "computer" + +// ComputerUseToolVersion identifies which version of the Anthropic +// computer use tool to use. +type ComputerUseToolVersion string + +const ( + // ComputerUse20251124 selects the November 2025 version of the + // computer use tool. + ComputerUse20251124 ComputerUseToolVersion = "computer_20251124" + // ComputerUse20250124 selects the January 2025 version of the + // computer use tool. + ComputerUse20250124 ComputerUseToolVersion = "computer_20250124" +) + +// ComputerUseToolOptions holds the configuration for creating a +// computer use tool instance. +type ComputerUseToolOptions struct { + // DisplayWidthPx is the width of the display in pixels. + DisplayWidthPx int64 + // DisplayHeightPx is the height of the display in pixels. + DisplayHeightPx int64 + // DisplayNumber is an optional X11 display number. + DisplayNumber *int64 + // EnableZoom enables zoom support. Only used with the + // ComputerUse20251124 version. + EnableZoom *bool + // ToolVersion selects which computer use tool version to use. + ToolVersion ComputerUseToolVersion + // CacheControl sets optional cache control for the tool. + CacheControl *CacheControl +} + +// NewComputerUseTool creates a new provider-defined tool configured +// for Anthropic computer use. The returned tool can be passed +// directly into a fantasy tool set via WithProviderDefinedTools. +func NewComputerUseTool( + opts ComputerUseToolOptions, + run func(ctx context.Context, call fantasy.ToolCall) (fantasy.ToolResponse, error), +) fantasy.ExecutableProviderTool { + args := map[string]any{ + "display_width_px": opts.DisplayWidthPx, + "display_height_px": opts.DisplayHeightPx, + "tool_version": string(opts.ToolVersion), + } + if opts.DisplayNumber != nil { + args["display_number"] = *opts.DisplayNumber + } + if opts.EnableZoom != nil { + args["enable_zoom"] = *opts.EnableZoom + } + if opts.CacheControl != nil { + args["cache_control"] = *opts.CacheControl + } + pdt := fantasy.ProviderDefinedTool{ + ID: computerUseToolID, + Name: computerUseAPIName, + Args: args, + } + return fantasy.NewExecutableProviderTool(pdt, run) +} + +// IsComputerUseTool reports whether tool is an Anthropic computer +// use tool. It checks for a ProviderDefinedTool whose ID matches +// the computer use tool identifier exactly. +func IsComputerUseTool(tool fantasy.Tool) bool { + pdt, ok := asProviderDefinedTool(tool) + if !ok { + return false + } + return pdt.ID == computerUseToolID +} + +// getComputerUseVersion extracts the ComputerUseToolVersion from a +// provider-defined tool's Args map. It returns the version and true +// if present, or the zero value and false otherwise. +func getComputerUseVersion(tool fantasy.ProviderDefinedTool) (ComputerUseToolVersion, bool) { + v, ok := tool.Args["tool_version"] + if !ok { + return "", false + } + s, ok := v.(string) + if !ok { + return "", false + } + return ComputerUseToolVersion(s), true +} + +// computerUseBetaFlag returns the Anthropic beta header value for +// the given computer use tool version. +func computerUseBetaFlag(version ComputerUseToolVersion) (string, error) { + switch version { + case ComputerUse20251124: + // TODO: Replace with SDK constant when available. + return "computer-use-2025-11-24", nil + case ComputerUse20250124: + return anthropicsdk.AnthropicBetaComputerUse2025_01_24, nil + default: + return "", fmt.Errorf( + "unsupported computer use tool version: %q", version, + ) + } +} + +// computerUseToolJSON builds the JSON representation of a computer +// use tool from a ProviderDefinedTool's Args, using the beta SDK +// types for serialization. +func computerUseToolJSON(pdt fantasy.ProviderDefinedTool) (json.RawMessage, error) { + version, ok := getComputerUseVersion(pdt) + if !ok { + return nil, fmt.Errorf("computerUseToolJSON: tool_version arg is missing") + } + + h, hOK := anyToInt64(pdt.Args["display_height_px"]) + w, wOK := anyToInt64(pdt.Args["display_width_px"]) + if !hOK || !wOK { + return nil, fmt.Errorf( + "display_height_px and display_width_px must be numeric"+ + " (height ok=%t, width ok=%t)", hOK, wOK, + ) + } + + switch version { + case ComputerUse20250124: + tool := anthropicsdk.BetaToolUnionParamOfComputerUseTool20250124(h, w) + if v, ok := pdt.Args["display_number"]; ok { + dn, ok := anyToInt64(v) + if !ok { + return nil, fmt.Errorf("computer use tool has invalid display_number") + } + tool.OfComputerUseTool20250124.DisplayNumber = param.NewOpt(dn) + } + if _, ok := pdt.Args["cache_control"]; ok { + tool.OfComputerUseTool20250124.CacheControl = anthropicsdk.NewBetaCacheControlEphemeralParam() + } + return json.Marshal(tool) + case ComputerUse20251124: + tool := anthropicsdk.BetaToolUnionParamOfComputerUseTool20251124(h, w) + if v, ok := pdt.Args["display_number"]; ok { + dn, ok := anyToInt64(v) + if !ok { + return nil, fmt.Errorf("computer use tool has invalid display_number") + } + tool.OfComputerUseTool20251124.DisplayNumber = param.NewOpt(dn) + } + if v, ok := pdt.Args["enable_zoom"]; ok { + if b, ok := v.(bool); ok { + tool.OfComputerUseTool20251124.EnableZoom = param.NewOpt(b) + } + } + if _, ok := pdt.Args["cache_control"]; ok { + tool.OfComputerUseTool20251124.CacheControl = anthropicsdk.NewBetaCacheControlEphemeralParam() + } + return json.Marshal(tool) + default: + return nil, fmt.Errorf( + "unsupported computer use tool version: %q", version, + ) + } +} + +// ComputerAction identifies the action Claude wants to perform. +// +// Unless noted otherwise on a specific action, respond by returning a +// screenshot using NewComputerUseScreenshotResult. +type ComputerAction string + +const ( + // ActionScreenshot captures the current screen. + // + // No additional fields are populated. + ActionScreenshot ComputerAction = "screenshot" + // ActionLeftClick performs a left click. + // + // - Coordinate: [x, y] target. + // - Text: optional modifier key (e.g. "shift", "ctrl"). + ActionLeftClick ComputerAction = "left_click" + // ActionRightClick performs a right click (v20250124+). + // + // - Coordinate: [x, y] target. + // - Text: optional modifier key (e.g. "shift", "ctrl"). + ActionRightClick ComputerAction = "right_click" + // ActionDoubleClick performs a double click (v20250124+). + // + // - Coordinate: [x, y] target. + // - Text: optional modifier key (e.g. "shift", "ctrl"). + ActionDoubleClick ComputerAction = "double_click" + // ActionTripleClick performs a triple click (v20250124+). + // + // - Coordinate: [x, y] target. + // - Text: optional modifier key (e.g. "shift", "ctrl"). + ActionTripleClick ComputerAction = "triple_click" + // ActionMiddleClick performs a middle click (v20250124+). + // + // - Coordinate: [x, y] target. + // - Text: optional modifier key (e.g. "shift", "ctrl"). + ActionMiddleClick ComputerAction = "middle_click" + // ActionMouseMove moves the cursor. + // + // - Coordinate: [x, y] destination. + ActionMouseMove ComputerAction = "mouse_move" + // ActionLeftClickDrag drags from one point to another + // (v20250124+). + // + // - StartCoordinate: [x, y] drag origin. + // - Coordinate: [x, y] drag destination. + ActionLeftClickDrag ComputerAction = "left_click_drag" + // ActionType types text. + // + // - Text: the string to type. + ActionType ComputerAction = "type" + // ActionKey presses a key combination. + // + // - Text: key combo string (e.g. "ctrl+c", "Return"). + ActionKey ComputerAction = "key" + // ActionScroll scrolls the screen (v20250124+). + // + // - Coordinate: [x, y] scroll origin. + // - ScrollDirection: "up", "down", "left", or "right". + // - ScrollAmount: scroll distance. + // - Text: optional modifier key. + ActionScroll ComputerAction = "scroll" + // ActionLeftMouseDown presses and holds the left mouse button + // (v20250124+). + // + // - Coordinate: [x, y] target. + ActionLeftMouseDown ComputerAction = "left_mouse_down" + // ActionLeftMouseUp releases the left mouse button + // (v20250124+). + // + // - Coordinate: [x, y] target. + ActionLeftMouseUp ComputerAction = "left_mouse_up" + // ActionHoldKey holds down a key for a specified duration + // (v20250124+). + // + // - Text: the key to hold. + // - Duration: hold time in seconds. + ActionHoldKey ComputerAction = "hold_key" + // ActionWait pauses between actions (v20250124+). + // + // No additional fields are populated. + ActionWait ComputerAction = "wait" + // ActionZoom views a specific screen region at full + // resolution (v20251124 only). Requires enable_zoom in the + // tool definition. + // + // - Region: [x1, y1, x2, y2] top-left and bottom-right. + // + // Response: return a screenshot of the zoomed region at + // full resolution. + ActionZoom ComputerAction = "zoom" +) + +// ComputerUseInput is the parsed, typed representation of a computer +// use tool call's Input JSON. Not all fields are populated for every +// action — check Action first, then read the relevant fields. +type ComputerUseInput struct { + Action ComputerAction `json:"action"` + // Coordinate is [x, y] for click, move, scroll, and + // drag-end actions. + Coordinate [2]int64 `json:"coordinate,omitempty"` + // StartCoordinate is [x, y] for left_click_drag start point. + StartCoordinate [2]int64 `json:"start_coordinate,omitempty"` + // Text is the string to type (ActionType), key combo + // (ActionKey), modifier key for click/scroll actions, or key + // to hold (ActionHoldKey). + Text string `json:"text,omitempty"` + // ScrollDirection is the scroll direction: "up", "down", + // "left", or "right". + ScrollDirection string `json:"scroll_direction,omitempty"` + // ScrollAmount is the number of scroll clicks. + ScrollAmount int64 `json:"scroll_amount,omitempty"` + // Duration is how long to hold the key in seconds + // (ActionHoldKey). + Duration int64 `json:"duration,omitempty"` + // Region is [x1, y1, x2, y2] defining the zoom area + // (ActionZoom, v20251124 only). + Region [4]int64 `json:"region,omitempty"` +} + +// ParseComputerUseInput parses a ToolCallContent's Input string into +// a typed ComputerUseInput. Returns an error if the JSON is invalid +// or if coordinate arrays have the wrong number of elements. +func ParseComputerUseInput(input string) (ComputerUseInput, error) { + var result ComputerUseInput + if err := json.Unmarshal([]byte(input), &result); err != nil { + return result, err + } + + // Validate array field lengths. json.Unmarshal silently pads + // or truncates arrays that don't match the Go fixed-size type, + // which would produce wrong coordinates. + var raw map[string]json.RawMessage + if err := json.Unmarshal([]byte(input), &raw); err != nil { + return result, err + } + if err := validateArrayLen(raw, "coordinate", 2); err != nil { + return ComputerUseInput{}, err + } + if err := validateArrayLen(raw, "start_coordinate", 2); err != nil { + return ComputerUseInput{}, err + } + if err := validateArrayLen(raw, "region", 4); err != nil { + return ComputerUseInput{}, err + } + + return result, nil +} + +// validateArrayLen checks that the JSON array at key has exactly +// wantLen elements. If the key is absent from raw it returns nil. +func validateArrayLen(raw map[string]json.RawMessage, key string, wantLen int) error { + v, ok := raw[key] + if !ok { + return nil + } + var elems []json.RawMessage + if err := json.Unmarshal(v, &elems); err != nil { + return fmt.Errorf("%s: expected array: %w", key, err) + } + if len(elems) != wantLen { + return fmt.Errorf( + "%s: expected %d elements, got %d", + key, wantLen, len(elems), + ) + } + return nil +} + +// NewComputerUseScreenshotResult constructs a ToolResultPart +// containing a screenshot image. This is the standard response for +// almost every computer use action — Claude expects to see what +// happened after executing the action. +// +// Parameters: +// - toolCallID: the ToolCallID from the ToolCallContent that +// requested this action. +// - screenshotPNG: the raw PNG bytes of the screenshot. The +// caller is responsible for capturing and (optionally) resizing +// the screenshot before passing it here. +// +// The function base64-encodes the image data and sets the media +// type to "image/png". +func NewComputerUseScreenshotResult( + toolCallID string, + screenshotPNG []byte, +) fantasy.ToolResultPart { + return fantasy.ToolResultPart{ + ToolCallID: toolCallID, + Output: fantasy.ToolResultOutputContentMedia{ + Data: base64.StdEncoding.EncodeToString(screenshotPNG), + MediaType: "image/png", + }, + } +} + +// NewComputerUseScreenshotResultWithMediaType is like +// NewComputerUseScreenshotResult but allows specifying a custom +// media type (e.g. "image/jpeg") and pre-encoded base64 data. +func NewComputerUseScreenshotResultWithMediaType( + toolCallID string, + base64Data string, + mediaType string, +) fantasy.ToolResultPart { + return fantasy.ToolResultPart{ + ToolCallID: toolCallID, + Output: fantasy.ToolResultOutputContentMedia{ + Data: base64Data, + MediaType: mediaType, + }, + } +} + +// NewComputerUseErrorResult constructs a ToolResultPart indicating +// that the requested action failed. Claude will see this as an +// error and may retry or adjust its approach. +// +// Use this when screenshot capture fails, coordinates are out of +// bounds, the application is unresponsive, or any other execution +// error occurs. +func NewComputerUseErrorResult( + toolCallID string, + err error, +) fantasy.ToolResultPart { + return fantasy.ToolResultPart{ + ToolCallID: toolCallID, + Output: fantasy.ToolResultOutputContentError{ + Error: err, + }, + } +} + +// NewComputerUseTextResult constructs a ToolResultPart containing a +// plain text response. This is rarely needed for computer use — +// most actions should return a screenshot — but can be useful for +// returning metadata alongside the action or for testing. +func NewComputerUseTextResult( + toolCallID string, + text string, +) fantasy.ToolResultPart { + return fantasy.ToolResultPart{ + ToolCallID: toolCallID, + Output: fantasy.ToolResultOutputContentText{ + Text: text, + }, + } +} diff --git a/providers/anthropic/computer_use_test.go b/providers/anthropic/computer_use_test.go new file mode 100644 index 0000000000000000000000000000000000000000..6278327a195aeec1244bc1fa17ae19e7e89a23cd --- /dev/null +++ b/providers/anthropic/computer_use_test.go @@ -0,0 +1,303 @@ +package anthropic + +import ( + "encoding/base64" + "errors" + "testing" + + "charm.land/fantasy" + "github.com/stretchr/testify/require" +) + +func TestParseComputerUseInput(t *testing.T) { + t.Parallel() + + t.Run("screenshot", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"screenshot"}`) + require.NoError(t, err) + require.Equal(t, ActionScreenshot, input.Action) + require.Equal(t, [2]int64{0, 0}, input.Coordinate) + require.Equal(t, "", input.Text) + }) + + t.Run("left_click with coordinate", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"left_click","coordinate":[100,200]}`) + require.NoError(t, err) + require.Equal(t, ActionLeftClick, input.Action) + require.Equal(t, [2]int64{100, 200}, input.Coordinate) + }) + + t.Run("right_click with coordinate", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"right_click","coordinate":[50,75]}`) + require.NoError(t, err) + require.Equal(t, ActionRightClick, input.Action) + require.Equal(t, [2]int64{50, 75}, input.Coordinate) + }) + + t.Run("double_click with coordinate", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"double_click","coordinate":[300,400]}`) + require.NoError(t, err) + require.Equal(t, ActionDoubleClick, input.Action) + require.Equal(t, [2]int64{300, 400}, input.Coordinate) + }) + + t.Run("middle_click with coordinate", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"middle_click","coordinate":[10,20]}`) + require.NoError(t, err) + require.Equal(t, ActionMiddleClick, input.Action) + require.Equal(t, [2]int64{10, 20}, input.Coordinate) + }) + + t.Run("mouse_move with coordinate", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"mouse_move","coordinate":[500,600]}`) + require.NoError(t, err) + require.Equal(t, ActionMouseMove, input.Action) + require.Equal(t, [2]int64{500, 600}, input.Coordinate) + }) + + t.Run("left_click_drag with start_coordinate and coordinate", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"left_click_drag","start_coordinate":[10,20],"coordinate":[300,400]}`) + require.NoError(t, err) + require.Equal(t, ActionLeftClickDrag, input.Action) + require.Equal(t, [2]int64{10, 20}, input.StartCoordinate) + require.Equal(t, [2]int64{300, 400}, input.Coordinate) + }) + + t.Run("type with text", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"type","text":"hello world"}`) + require.NoError(t, err) + require.Equal(t, ActionType, input.Action) + require.Equal(t, "hello world", input.Text) + }) + + t.Run("key with text", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"key","text":"ctrl+c"}`) + require.NoError(t, err) + require.Equal(t, ActionKey, input.Action) + require.Equal(t, "ctrl+c", input.Text) + }) + + t.Run("scroll with coordinate direction and amount", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"scroll","coordinate":[960,540],"scroll_direction":"down","scroll_amount":3}`) + require.NoError(t, err) + require.Equal(t, ActionScroll, input.Action) + require.Equal(t, [2]int64{960, 540}, input.Coordinate) + require.Equal(t, "down", input.ScrollDirection) + require.Equal(t, int64(3), input.ScrollAmount) + }) + + t.Run("invalid JSON returns error", func(t *testing.T) { + t.Parallel() + _, err := ParseComputerUseInput(`{not valid json}`) + require.Error(t, err) + }) + + t.Run("triple_click with coordinate", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"triple_click","coordinate":[120,240]}`) + require.NoError(t, err) + require.Equal(t, ActionTripleClick, input.Action) + require.Equal(t, [2]int64{120, 240}, input.Coordinate) + }) + + t.Run("left_mouse_down with coordinate", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"left_mouse_down","coordinate":[80,90]}`) + require.NoError(t, err) + require.Equal(t, ActionLeftMouseDown, input.Action) + require.Equal(t, [2]int64{80, 90}, input.Coordinate) + }) + + t.Run("left_mouse_up with coordinate", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"left_mouse_up","coordinate":[80,90]}`) + require.NoError(t, err) + require.Equal(t, ActionLeftMouseUp, input.Action) + require.Equal(t, [2]int64{80, 90}, input.Coordinate) + }) + + t.Run("wait", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"wait"}`) + require.NoError(t, err) + require.Equal(t, ActionWait, input.Action) + require.Equal(t, [2]int64{0, 0}, input.Coordinate) + require.Equal(t, "", input.Text) + }) + + t.Run("zoom with region", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"zoom","region":[100,200,500,600]}`) + require.NoError(t, err) + require.Equal(t, ActionZoom, input.Action) + require.Equal(t, [4]int64{100, 200, 500, 600}, input.Region) + }) + + t.Run("left_click with modifier key", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"left_click","coordinate":[100,200],"text":"shift"}`) + require.NoError(t, err) + require.Equal(t, ActionLeftClick, input.Action) + require.Equal(t, [2]int64{100, 200}, input.Coordinate) + require.Equal(t, "shift", input.Text) + }) + + t.Run("unknown action parses without error", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"future_action","coordinate":[1,2]}`) + require.NoError(t, err) + require.Equal(t, ComputerAction("future_action"), input.Action) + require.Equal(t, [2]int64{1, 2}, input.Coordinate) + }) + + t.Run("hold_key with duration", func(t *testing.T) { + t.Parallel() + input, err := ParseComputerUseInput(`{"action":"hold_key","text":"shift","duration":2}`) + require.NoError(t, err) + require.Equal(t, ActionHoldKey, input.Action) + require.Equal(t, "shift", input.Text) + require.Equal(t, int64(2), input.Duration) + }) +} + +func TestNewComputerUseScreenshotResult(t *testing.T) { + t.Parallel() + + t.Run("base64 encodes PNG bytes", func(t *testing.T) { + t.Parallel() + pngData := []byte{0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A} + result := NewComputerUseScreenshotResult("call-123", pngData) + + require.Equal(t, "call-123", result.ToolCallID) + + media, ok := result.Output.(fantasy.ToolResultOutputContentMedia) + require.True(t, ok, "output should be ToolResultOutputContentMedia") + require.Equal(t, "image/png", media.MediaType) + require.Equal(t, base64.StdEncoding.EncodeToString(pngData), media.Data) + }) + + t.Run("preserves tool call ID", func(t *testing.T) { + t.Parallel() + result := NewComputerUseScreenshotResult("tc_abc", []byte{0x01}) + require.Equal(t, "tc_abc", result.ToolCallID) + }) + + t.Run("empty screenshot bytes", func(t *testing.T) { + t.Parallel() + result := NewComputerUseScreenshotResult("call-empty", []byte{}) + + media, ok := result.Output.(fantasy.ToolResultOutputContentMedia) + require.True(t, ok) + require.Equal(t, "image/png", media.MediaType) + require.Equal(t, "", media.Data) + }) + + t.Run("output content type is media", func(t *testing.T) { + t.Parallel() + result := NewComputerUseScreenshotResult("call-type", []byte{0xFF}) + require.Equal(t, fantasy.ToolResultContentTypeMedia, result.Output.GetType()) + }) +} + +func TestNewComputerUseScreenshotResultWithMediaType(t *testing.T) { + t.Parallel() + + t.Run("custom media type and base64 data", func(t *testing.T) { + t.Parallel() + b64 := base64.StdEncoding.EncodeToString([]byte("jpeg-data")) + result := NewComputerUseScreenshotResultWithMediaType("call-456", b64, "image/jpeg") + + require.Equal(t, "call-456", result.ToolCallID) + + media, ok := result.Output.(fantasy.ToolResultOutputContentMedia) + require.True(t, ok, "output should be ToolResultOutputContentMedia") + require.Equal(t, "image/jpeg", media.MediaType) + require.Equal(t, b64, media.Data) + }) + + t.Run("preserves tool call ID", func(t *testing.T) { + t.Parallel() + result := NewComputerUseScreenshotResultWithMediaType("tc_xyz", "data", "image/webp") + require.Equal(t, "tc_xyz", result.ToolCallID) + }) + + t.Run("output content type is media", func(t *testing.T) { + t.Parallel() + result := NewComputerUseScreenshotResultWithMediaType("call-type", "data", "image/png") + require.Equal(t, fantasy.ToolResultContentTypeMedia, result.Output.GetType()) + }) +} + +func TestNewComputerUseErrorResult(t *testing.T) { + t.Parallel() + + t.Run("error message propagates", func(t *testing.T) { + t.Parallel() + err := errors.New("screenshot capture failed") + result := NewComputerUseErrorResult("call-err", err) + + require.Equal(t, "call-err", result.ToolCallID) + + errOutput, ok := result.Output.(fantasy.ToolResultOutputContentError) + require.True(t, ok, "output should be ToolResultOutputContentError") + require.Equal(t, "screenshot capture failed", errOutput.Error.Error()) + }) + + t.Run("preserves tool call ID", func(t *testing.T) { + t.Parallel() + result := NewComputerUseErrorResult("tc_err", errors.New("fail")) + require.Equal(t, "tc_err", result.ToolCallID) + }) + + t.Run("output content type is error", func(t *testing.T) { + t.Parallel() + result := NewComputerUseErrorResult("call-type", errors.New("oops")) + require.Equal(t, fantasy.ToolResultContentTypeError, result.Output.GetType()) + }) +} + +func TestNewComputerUseTextResult(t *testing.T) { + t.Parallel() + + t.Run("text content is set", func(t *testing.T) { + t.Parallel() + result := NewComputerUseTextResult("call-txt", "action completed successfully") + + require.Equal(t, "call-txt", result.ToolCallID) + + textOutput, ok := result.Output.(fantasy.ToolResultOutputContentText) + require.True(t, ok, "output should be ToolResultOutputContentText") + require.Equal(t, "action completed successfully", textOutput.Text) + }) + + t.Run("preserves tool call ID", func(t *testing.T) { + t.Parallel() + result := NewComputerUseTextResult("tc_text", "hello") + require.Equal(t, "tc_text", result.ToolCallID) + }) + + t.Run("empty text", func(t *testing.T) { + t.Parallel() + result := NewComputerUseTextResult("call-empty", "") + + textOutput, ok := result.Output.(fantasy.ToolResultOutputContentText) + require.True(t, ok) + require.Equal(t, "", textOutput.Text) + }) + + t.Run("output content type is text", func(t *testing.T) { + t.Parallel() + result := NewComputerUseTextResult("call-type", "test") + require.Equal(t, fantasy.ToolResultContentTypeText, result.Output.GetType()) + }) +} diff --git a/providertests/anthropic_test.go b/providertests/anthropic_test.go index 32fb87f70920d462f724f529687e40dad70ae895..940ec477d7c6bcc0ef487914a7ac6c89bdd3302c 100644 --- a/providertests/anthropic_test.go +++ b/providertests/anthropic_test.go @@ -2,6 +2,7 @@ package providertests import ( "context" + "encoding/json" "net/http" "os" "testing" @@ -274,3 +275,187 @@ func TestAnthropicWebSearch(t *testing.T) { require.Contains(t, got2, "Osaka", "turn 2 response should mention Osaka") }) } + +// screenshotBase64 is a tiny valid 1x1 PNG encoded as base64, +// used as a stub screenshot result in computer use tests. +const screenshotBase64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==" + +func TestAnthropicComputerUse(t *testing.T) { + type computerUseModel struct { + name string + model string + toolVersion anthropic.ComputerUseToolVersion + } + computerUseModels := []computerUseModel{ + {"claude-sonnet-4", "claude-sonnet-4-20250514", anthropic.ComputerUse20250124}, + {"claude-opus-4-6", "claude-opus-4-6", anthropic.ComputerUse20251124}, + } + for _, m := range computerUseModels { + t.Run(m.name, func(t *testing.T) { + t.Run("computer use", func(t *testing.T) { + r := vcr.NewRecorder(t) + + model, err := anthropicBuilder(m.model)(t, r) + require.NoError(t, err) + + cuTool := jsonRoundTripTool(t, anthropic.NewComputerUseTool(anthropic.ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: m.toolVersion, + }, noopComputerRun)) + + // First call: expect a screenshot tool call. + resp, err := model.Generate(t.Context(), fantasy.Call{ + Prompt: fantasy.Prompt{ + {Role: fantasy.MessageRoleSystem, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "You are a helpful assistant"}}}, + {Role: fantasy.MessageRoleUser, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "Take a screenshot of the desktop"}}}, + }, + Tools: []fantasy.Tool{cuTool}, + }) + require.NoError(t, err) + require.Equal(t, fantasy.FinishReasonToolCalls, resp.FinishReason) + + toolCalls := resp.Content.ToolCalls() + require.Len(t, toolCalls, 1) + require.Equal(t, "computer", toolCalls[0].ToolName) + require.Contains(t, toolCalls[0].Input, "screenshot") + + // Second call: send the tool result back, expect text. + resp2, err := model.Generate(t.Context(), fantasy.Call{ + Prompt: fantasy.Prompt{ + {Role: fantasy.MessageRoleSystem, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "You are a helpful assistant"}}}, + {Role: fantasy.MessageRoleUser, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "Take a screenshot of the desktop"}}}, + { + Role: fantasy.MessageRoleAssistant, + Content: []fantasy.MessagePart{ + fantasy.ToolCallPart{ + ToolCallID: toolCalls[0].ToolCallID, + ToolName: toolCalls[0].ToolName, + Input: toolCalls[0].Input, + }, + }, + }, + { + Role: fantasy.MessageRoleTool, + Content: []fantasy.MessagePart{ + fantasy.ToolResultPart{ + ToolCallID: toolCalls[0].ToolCallID, + Output: fantasy.ToolResultOutputContentMedia{ + Data: screenshotBase64, + MediaType: "image/png", + }, + }, + }, + }, + }, + Tools: []fantasy.Tool{cuTool}, + }) + require.NoError(t, err) + require.NotEmpty(t, resp2.Content.Text()) + require.Contains(t, resp2.Content.Text(), "desktop") + }) + + t.Run("computer use streaming", func(t *testing.T) { + r := vcr.NewRecorder(t) + + model, err := anthropicBuilder(m.model)(t, r) + require.NoError(t, err) + + cuTool := jsonRoundTripTool(t, anthropic.NewComputerUseTool(anthropic.ComputerUseToolOptions{ + DisplayWidthPx: 1920, + DisplayHeightPx: 1080, + ToolVersion: m.toolVersion, + }, noopComputerRun)) + + // First call: stream, collect tool call. + stream, err := model.Stream(t.Context(), fantasy.Call{ + Prompt: fantasy.Prompt{ + {Role: fantasy.MessageRoleSystem, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "You are a helpful assistant"}}}, + {Role: fantasy.MessageRoleUser, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "Take a screenshot of the desktop"}}}, + }, + Tools: []fantasy.Tool{cuTool}, + }) + require.NoError(t, err) + + var toolCallID, toolCallName, toolCallInput string + var finishReason fantasy.FinishReason + stream(func(part fantasy.StreamPart) bool { + switch part.Type { + case fantasy.StreamPartTypeToolCall: + toolCallID = part.ID + toolCallName = part.ToolCallName + toolCallInput = part.ToolCallInput + case fantasy.StreamPartTypeFinish: + finishReason = part.FinishReason + } + return true + }) + + require.Equal(t, fantasy.FinishReasonToolCalls, finishReason) + require.Equal(t, "computer", toolCallName) + require.Contains(t, toolCallInput, "screenshot") + + // Second call: send tool result, stream text back. + stream2, err := model.Stream(t.Context(), fantasy.Call{ + Prompt: fantasy.Prompt{ + {Role: fantasy.MessageRoleSystem, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "You are a helpful assistant"}}}, + {Role: fantasy.MessageRoleUser, Content: []fantasy.MessagePart{fantasy.TextPart{Text: "Take a screenshot of the desktop"}}}, + { + Role: fantasy.MessageRoleAssistant, + Content: []fantasy.MessagePart{ + fantasy.ToolCallPart{ + ToolCallID: toolCallID, + ToolName: toolCallName, + Input: toolCallInput, + }, + }, + }, + { + Role: fantasy.MessageRoleTool, + Content: []fantasy.MessagePart{ + fantasy.ToolResultPart{ + ToolCallID: toolCallID, + Output: fantasy.ToolResultOutputContentMedia{ + Data: screenshotBase64, + MediaType: "image/png", + }, + }, + }, + }, + }, + Tools: []fantasy.Tool{cuTool}, + }) + require.NoError(t, err) + + var text string + stream2(func(part fantasy.StreamPart) bool { + if part.Type == fantasy.StreamPartTypeTextDelta { + text += part.Delta + } + return true + }) + require.NotEmpty(t, text) + require.Contains(t, text, "desktop") + }) + }) + } +} + +// noopComputerRun is a no-op run function for tests that only need +// to inspect the tool definition, not execute it. +var noopComputerRun = func(_ context.Context, _ fantasy.ToolCall) (fantasy.ToolResponse, error) { + return fantasy.ToolResponse{}, nil +} + +// jsonRoundTripTool simulates a JSON round-trip on a ProviderDefinedTool +// so numeric values become float64 as they would in real usage. +func jsonRoundTripTool(t *testing.T, tool fantasy.ExecutableProviderTool) fantasy.ProviderDefinedTool { + t.Helper() + pdt := tool.Definition() + data, err := json.Marshal(pdt.Args) + require.NoError(t, err) + var args map[string]any + require.NoError(t, json.Unmarshal(data, &args)) + pdt.Args = args + return pdt +} diff --git a/providertests/testdata/TestAnthropicComputerUse/claude-opus-4-6/computer_use.yaml b/providertests/testdata/TestAnthropicComputerUse/claude-opus-4-6/computer_use.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5ef9521a90f8fbfe480a607c39ddce914787d662 --- /dev/null +++ b/providertests/testdata/TestAnthropicComputerUse/claude-opus-4-6/computer_use.yaml @@ -0,0 +1,69 @@ +--- +version: 2 +interactions: +- id: 0 + request: + proto: HTTP/1.1 + proto_major: 1 + proto_minor: 1 + content_length: 314 + host: "" + body: '{"max_tokens":4096,"messages":[{"content":[{"text":"Take a screenshot of the desktop","type":"text"}],"role":"user"}],"model":"claude-opus-4-6","system":[{"text":"You are a helpful assistant","type":"text"}],"tools":[{"display_height_px":1080,"display_width_px":1920,"name":"computer","type":"computer_20251124"}]}' + form: + beta: + - "true" + headers: + Accept: + - application/json + Content-Type: + - application/json + User-Agent: + - Charm-Fantasy/0.16.0 (https://charm.land/fantasy) + url: https://api.anthropic.com/v1/messages?beta=true + method: POST + response: + proto: HTTP/2.0 + proto_major: 2 + proto_minor: 0 + content_length: -1 + uncompressed: true + body: '{"model":"claude-opus-4-6","id":"msg_01KbvEPdNineiBzobWvpf838","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_017ZcKCSBfPgLF2MstqzLwS6","name":"computer","input":{"action":"screenshot"},"caller":{"type":"direct"}}],"stop_reason":"tool_use","stop_sequence":null,"usage":{"input_tokens":1842,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":52,"service_tier":"standard","inference_geo":"global"}}' + headers: + Content-Type: + - application/json + status: 200 OK + code: 200 + duration: 2.904585799s +- id: 1 + request: + proto: HTTP/1.1 + proto_major: 1 + proto_minor: 1 + content_length: 740 + host: "" + body: '{"max_tokens":4096,"messages":[{"content":[{"text":"Take a screenshot of the desktop","type":"text"}],"role":"user"},{"content":[{"id":"toolu_017ZcKCSBfPgLF2MstqzLwS6","input":{"action":"screenshot"},"name":"computer","type":"tool_use"}],"role":"assistant"},{"content":[{"tool_use_id":"toolu_017ZcKCSBfPgLF2MstqzLwS6","content":[{"source":{"data":"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==","media_type":"image/png","type":"base64"},"type":"image"}],"type":"tool_result"}],"role":"user"}],"model":"claude-opus-4-6","system":[{"text":"You are a helpful assistant","type":"text"}],"tools":[{"display_height_px":1080,"display_width_px":1920,"name":"computer","type":"computer_20251124"}]}' + form: + beta: + - "true" + headers: + Accept: + - application/json + Content-Type: + - application/json + User-Agent: + - Charm-Fantasy/0.16.0 (https://charm.land/fantasy) + url: https://api.anthropic.com/v1/messages?beta=true + method: POST + response: + proto: HTTP/2.0 + proto_major: 2 + proto_minor: 0 + content_length: -1 + uncompressed: true + body: '{"model":"claude-opus-4-6","id":"msg_01LNnzU2u6f7Gb7QsSWZFMqV","type":"message","role":"assistant","content":[{"type":"text","text":"The screenshot shows what appears to be a mostly blank/yellow desktop. The screen appears to be largely empty with a light yellow or pale background. There don''t seem to be any visible icons, taskbar, or other desktop elements in this view.\n\nWould you like me to try anything else, such as moving the mouse or clicking to see if there are hidden elements?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":1921,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":78,"service_tier":"standard","inference_geo":"global"}}' + headers: + Content-Type: + - application/json + status: 200 OK + code: 200 + duration: 4.691481122s diff --git a/providertests/testdata/TestAnthropicComputerUse/claude-opus-4-6/computer_use_streaming.yaml b/providertests/testdata/TestAnthropicComputerUse/claude-opus-4-6/computer_use_streaming.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e0d84388504c0fa87c527c86fef69081bdb19ceb --- /dev/null +++ b/providertests/testdata/TestAnthropicComputerUse/claude-opus-4-6/computer_use_streaming.yaml @@ -0,0 +1,132 @@ +--- +version: 2 +interactions: +- id: 0 + request: + proto: HTTP/1.1 + proto_major: 1 + proto_minor: 1 + content_length: 328 + host: "" + body: '{"max_tokens":4096,"messages":[{"content":[{"text":"Take a screenshot of the desktop","type":"text"}],"role":"user"}],"model":"claude-opus-4-6","system":[{"text":"You are a helpful assistant","type":"text"}],"tools":[{"display_height_px":1080,"display_width_px":1920,"name":"computer","type":"computer_20251124"}],"stream":true}' + form: + beta: + - "true" + headers: + Accept: + - application/json + Content-Type: + - application/json + User-Agent: + - Charm-Fantasy/0.16.0 (https://charm.land/fantasy) + url: https://api.anthropic.com/v1/messages?beta=true + method: POST + response: + proto: HTTP/2.0 + proto_major: 2 + proto_minor: 0 + content_length: -1 + uncompressed: true + body: |+ + event: message_start + data: {"type":"message_start","message":{"model":"claude-opus-4-6","id":"msg_01H5UdVXFCVHyBS6MXSCVkKa","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1842,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":52,"service_tier":"standard","inference_geo":"global"}} } + + event: content_block_start + data: {"type":"content_block_start","index":0,"content_block":{"type":"tool_use","id":"toolu_01DkSrnHZMNWNEqf7wznra4W","name":"computer","input":{},"caller":{"type":"direct"}} } + + event: ping + data: {"type": "ping"} + + event: content_block_delta + data: {"type":"content_block_delta","index":0,"delta":{"type":"input_json_delta","partial_json":""} } + + event: content_block_delta + data: {"type":"content_block_delta","index":0,"delta":{"type":"input_json_delta","partial_json":"{\"action\""} } + + event: content_block_delta + data: {"type":"content_block_delta","index":0,"delta":{"type":"input_json_delta","partial_json":": \"screens"} } + + event: content_block_delta + data: {"type":"content_block_delta","index":0,"delta":{"type":"input_json_delta","partial_json":"hot\"}"} } + + event: content_block_stop + data: {"type":"content_block_stop","index":0 } + + event: message_delta + data: {"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"input_tokens":1842,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":52} } + + event: message_stop + data: {"type":"message_stop" } + + headers: + Content-Type: + - text/event-stream; charset=utf-8 + status: 200 OK + code: 200 + duration: 2.80446445s +- id: 1 + request: + proto: HTTP/1.1 + proto_major: 1 + proto_minor: 1 + content_length: 754 + host: "" + body: '{"max_tokens":4096,"messages":[{"content":[{"text":"Take a screenshot of the desktop","type":"text"}],"role":"user"},{"content":[{"id":"toolu_01DkSrnHZMNWNEqf7wznra4W","input":{"action":"screenshot"},"name":"computer","type":"tool_use"}],"role":"assistant"},{"content":[{"tool_use_id":"toolu_01DkSrnHZMNWNEqf7wznra4W","content":[{"source":{"data":"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==","media_type":"image/png","type":"base64"},"type":"image"}],"type":"tool_result"}],"role":"user"}],"model":"claude-opus-4-6","system":[{"text":"You are a helpful assistant","type":"text"}],"tools":[{"display_height_px":1080,"display_width_px":1920,"name":"computer","type":"computer_20251124"}],"stream":true}' + form: + beta: + - "true" + headers: + Accept: + - application/json + Content-Type: + - application/json + User-Agent: + - Charm-Fantasy/0.16.0 (https://charm.land/fantasy) + url: https://api.anthropic.com/v1/messages?beta=true + method: POST + response: + proto: HTTP/2.0 + proto_major: 2 + proto_minor: 0 + content_length: -1 + uncompressed: true + body: |+ + event: message_start + data: {"type":"message_start","message":{"model":"claude-opus-4-6","id":"msg_01BRaGURK3eQELbeoU9DM6Bb","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1921,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":1,"service_tier":"standard","inference_geo":"global"}} } + + event: content_block_start + data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""} } + + event: ping + data: {"type": "ping"} + + event: content_block_delta + data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"The"}} + + event: content_block_delta + data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" screenshot shows the desktop. It appears to be a mostly blank/empty desktop with a"} } + + event: content_block_delta + data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" yellow or light-colored background. There don't appear to be any visible icons, taskb"} } + + event: content_block_delta + data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"ars, or open windows at the moment. \n\nIs there anything specific you'd like me to do on the"} } + + event: content_block_delta + data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" desktop?"} } + + event: content_block_stop + data: {"type":"content_block_stop","index":0 } + + event: message_delta + data: {"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":1921,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":65} } + + event: message_stop + data: {"type":"message_stop" } + + headers: + Content-Type: + - text/event-stream; charset=utf-8 + status: 200 OK + code: 200 + duration: 3.163032433s diff --git a/providertests/testdata/TestAnthropicComputerUse/claude-sonnet-4/computer_use.yaml b/providertests/testdata/TestAnthropicComputerUse/claude-sonnet-4/computer_use.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03a61289296385ebd80aab0a0789c1cd8111c6c4 --- /dev/null +++ b/providertests/testdata/TestAnthropicComputerUse/claude-sonnet-4/computer_use.yaml @@ -0,0 +1,69 @@ +--- +version: 2 +interactions: +- id: 0 + request: + proto: HTTP/1.1 + proto_major: 1 + proto_minor: 1 + content_length: 323 + host: "" + body: '{"max_tokens":4096,"messages":[{"content":[{"text":"Take a screenshot of the desktop","type":"text"}],"role":"user"}],"model":"claude-sonnet-4-20250514","system":[{"text":"You are a helpful assistant","type":"text"}],"tools":[{"display_height_px":1080,"display_width_px":1920,"name":"computer","type":"computer_20250124"}]}' + form: + beta: + - "true" + headers: + Accept: + - application/json + Content-Type: + - application/json + User-Agent: + - Charm-Fantasy/0.16.0 (https://charm.land/fantasy) + url: https://api.anthropic.com/v1/messages?beta=true + method: POST + response: + proto: HTTP/2.0 + proto_major: 2 + proto_minor: 0 + content_length: -1 + uncompressed: true + body: '{"model":"claude-sonnet-4-20250514","id":"msg_01KHX677vLGXfeouAejAbGDD","type":"message","role":"assistant","content":[{"type":"text","text":"I''ll take a screenshot of the desktop for you."},{"type":"tool_use","id":"toolu_01MpnnLwjqpqYWLUQ5F5SixE","name":"computer","input":{"action":"screenshot"},"caller":{"type":"direct"}}],"stop_reason":"tool_use","stop_sequence":null,"usage":{"input_tokens":1652,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":63,"service_tier":"standard","inference_geo":"not_available"}}' + headers: + Content-Type: + - application/json + status: 200 OK + code: 200 + duration: 1.45399283s +- id: 1 + request: + proto: HTTP/1.1 + proto_major: 1 + proto_minor: 1 + content_length: 749 + host: "" + body: '{"max_tokens":4096,"messages":[{"content":[{"text":"Take a screenshot of the desktop","type":"text"}],"role":"user"},{"content":[{"id":"toolu_01MpnnLwjqpqYWLUQ5F5SixE","input":{"action":"screenshot"},"name":"computer","type":"tool_use"}],"role":"assistant"},{"content":[{"tool_use_id":"toolu_01MpnnLwjqpqYWLUQ5F5SixE","content":[{"source":{"data":"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==","media_type":"image/png","type":"base64"},"type":"image"}],"type":"tool_result"}],"role":"user"}],"model":"claude-sonnet-4-20250514","system":[{"text":"You are a helpful assistant","type":"text"}],"tools":[{"display_height_px":1080,"display_width_px":1920,"name":"computer","type":"computer_20250124"}]}' + form: + beta: + - "true" + headers: + Accept: + - application/json + Content-Type: + - application/json + User-Agent: + - Charm-Fantasy/0.16.0 (https://charm.land/fantasy) + url: https://api.anthropic.com/v1/messages?beta=true + method: POST + response: + proto: HTTP/2.0 + proto_major: 2 + proto_minor: 0 + content_length: -1 + uncompressed: true + body: '{"model":"claude-sonnet-4-20250514","id":"msg_01Ute4pN6TgBst3zEx6h8nF4","type":"message","role":"assistant","content":[{"type":"text","text":"I''ve taken a screenshot of the desktop. The image shows a plain black desktop background without any visible icons, taskbar, or other desktop elements. This appears to be a minimal desktop environment. Is there anything specific you''d like me to do with the desktop or any applications you''d like me to open?"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":1731,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":65,"service_tier":"standard","inference_geo":"not_available"}}' + headers: + Content-Type: + - application/json + status: 200 OK + code: 200 + duration: 2.613302152s diff --git a/providertests/testdata/TestAnthropicComputerUse/claude-sonnet-4/computer_use_streaming.yaml b/providertests/testdata/TestAnthropicComputerUse/claude-sonnet-4/computer_use_streaming.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a22b534220e931f525d92de35354e32f37d35d75 --- /dev/null +++ b/providertests/testdata/TestAnthropicComputerUse/claude-sonnet-4/computer_use_streaming.yaml @@ -0,0 +1,144 @@ +--- +version: 2 +interactions: +- id: 0 + request: + proto: HTTP/1.1 + proto_major: 1 + proto_minor: 1 + content_length: 337 + host: "" + body: '{"max_tokens":4096,"messages":[{"content":[{"text":"Take a screenshot of the desktop","type":"text"}],"role":"user"}],"model":"claude-sonnet-4-20250514","system":[{"text":"You are a helpful assistant","type":"text"}],"tools":[{"display_height_px":1080,"display_width_px":1920,"name":"computer","type":"computer_20250124"}],"stream":true}' + form: + beta: + - "true" + headers: + Accept: + - application/json + Content-Type: + - application/json + User-Agent: + - Charm-Fantasy/0.16.0 (https://charm.land/fantasy) + url: https://api.anthropic.com/v1/messages?beta=true + method: POST + response: + proto: HTTP/2.0 + proto_major: 2 + proto_minor: 0 + content_length: -1 + uncompressed: true + body: |+ + event: message_start + data: {"type":"message_start","message":{"model":"claude-sonnet-4-20250514","id":"msg_019NrbUUum1q7ooQRs42Rzrz","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1652,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":8,"service_tier":"standard","inference_geo":"not_available"}} } + + event: content_block_start + data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""} } + + event: ping + data: {"type": "ping"} + + event: content_block_delta + data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"I'll take a screenshot of the desktop"} } + + event: content_block_delta + data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" for you."} } + + event: content_block_stop + data: {"type":"content_block_stop","index":0 } + + event: content_block_start + data: {"type":"content_block_start","index":1,"content_block":{"type":"tool_use","id":"toolu_01Y1QGhTQnPURqMEtY7RnsP1","name":"computer","input":{},"caller":{"type":"direct"}} } + + event: content_block_delta + data: {"type":"content_block_delta","index":1,"delta":{"type":"input_json_delta","partial_json":""} } + + event: content_block_delta + data: {"type":"content_block_delta","index":1,"delta":{"type":"input_json_delta","partial_json":"{\"actio"} } + + event: content_block_delta + data: {"type":"content_block_delta","index":1,"delta":{"type":"input_json_delta","partial_json":"n\": \"scree"} } + + event: content_block_delta + data: {"type":"content_block_delta","index":1,"delta":{"type":"input_json_delta","partial_json":"nshot\"}"} } + + event: content_block_stop + data: {"type":"content_block_stop","index":1 } + + event: message_delta + data: {"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"input_tokens":1652,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":63} } + + event: message_stop + data: {"type":"message_stop" } + + headers: + Content-Type: + - text/event-stream; charset=utf-8 + status: 200 OK + code: 200 + duration: 1.328724866s +- id: 1 + request: + proto: HTTP/1.1 + proto_major: 1 + proto_minor: 1 + content_length: 763 + host: "" + body: '{"max_tokens":4096,"messages":[{"content":[{"text":"Take a screenshot of the desktop","type":"text"}],"role":"user"},{"content":[{"id":"toolu_01Y1QGhTQnPURqMEtY7RnsP1","input":{"action":"screenshot"},"name":"computer","type":"tool_use"}],"role":"assistant"},{"content":[{"tool_use_id":"toolu_01Y1QGhTQnPURqMEtY7RnsP1","content":[{"source":{"data":"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==","media_type":"image/png","type":"base64"},"type":"image"}],"type":"tool_result"}],"role":"user"}],"model":"claude-sonnet-4-20250514","system":[{"text":"You are a helpful assistant","type":"text"}],"tools":[{"display_height_px":1080,"display_width_px":1920,"name":"computer","type":"computer_20250124"}],"stream":true}' + form: + beta: + - "true" + headers: + Accept: + - application/json + Content-Type: + - application/json + User-Agent: + - Charm-Fantasy/0.16.0 (https://charm.land/fantasy) + url: https://api.anthropic.com/v1/messages?beta=true + method: POST + response: + proto: HTTP/2.0 + proto_major: 2 + proto_minor: 0 + content_length: -1 + uncompressed: true + body: |+ + event: message_start + data: {"type":"message_start","message":{"model":"claude-sonnet-4-20250514","id":"msg_01R839ZFF3M6X1k8BFQiH9Go","type":"message","role":"assistant","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1731,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"}} } + + event: content_block_start + data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""} } + + event: ping + data: {"type": "ping"} + + event: content_block_delta + data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"I"} } + + event: content_block_delta + data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"'ve taken a screenshot of the desktop. The desktop appears to be mostly black/dark, which could indicate"} } + + event: content_block_delta + data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" either a dark desktop background or that the screen is currently displaying"} } + + event: content_block_delta + data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" a blank/black background. This is the current state"}} + + event: content_block_delta + data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" of the desktop as captured."} } + + event: content_block_stop + data: {"type":"content_block_stop","index":0 } + + event: message_delta + data: {"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":1731,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":54} } + + event: message_stop + data: {"type":"message_stop" } + + headers: + Content-Type: + - text/event-stream; charset=utf-8 + status: 200 OK + code: 200 + duration: 1.529686838s