1package anthropic
2
3import (
4 "context"
5 "encoding/base64"
6 "encoding/json"
7 "fmt"
8
9 "charm.land/fantasy"
10 anthropicsdk "github.com/charmbracelet/anthropic-sdk-go"
11 "github.com/charmbracelet/anthropic-sdk-go/packages/param"
12)
13
// Identifiers for the Anthropic computer use tool.
const (
	// computerUseToolID is the canonical identifier stored in
	// ProviderDefinedTool.ID for computer use tools. It is written
	// in the <provider>.<tool> form.
	computerUseToolID = "anthropic.computer"

	// computerUseAPIName is the tool name Anthropic's API expects
	// on the wire.
	computerUseAPIName = "computer"
)
22
// ComputerUseToolVersion names a release of the Anthropic computer
// use tool.
type ComputerUseToolVersion string

// Known computer use tool versions, oldest first.
const (
	// ComputerUse20250124 selects the January 2025 version of the
	// computer use tool.
	ComputerUse20250124 ComputerUseToolVersion = "computer_20250124"
	// ComputerUse20251124 selects the November 2025 version of the
	// computer use tool.
	ComputerUse20251124 ComputerUseToolVersion = "computer_20251124"
)
35
36// ComputerUseToolOptions holds the configuration for creating a
37// computer use tool instance.
38type ComputerUseToolOptions struct {
39 // DisplayWidthPx is the width of the display in pixels.
40 DisplayWidthPx int64
41 // DisplayHeightPx is the height of the display in pixels.
42 DisplayHeightPx int64
43 // DisplayNumber is an optional X11 display number.
44 DisplayNumber *int64
45 // EnableZoom enables zoom support. Only used with the
46 // ComputerUse20251124 version.
47 EnableZoom *bool
48 // ToolVersion selects which computer use tool version to use.
49 ToolVersion ComputerUseToolVersion
50 // CacheControl sets optional cache control for the tool.
51 CacheControl *CacheControl
52}
53
54// NewComputerUseTool creates a new provider-defined tool configured
55// for Anthropic computer use. The returned tool can be passed
56// directly into a fantasy tool set via WithProviderDefinedTools.
57func NewComputerUseTool(
58 opts ComputerUseToolOptions,
59 run func(ctx context.Context, call fantasy.ToolCall) (fantasy.ToolResponse, error),
60) fantasy.ExecutableProviderTool {
61 args := map[string]any{
62 "display_width_px": opts.DisplayWidthPx,
63 "display_height_px": opts.DisplayHeightPx,
64 "tool_version": string(opts.ToolVersion),
65 }
66 if opts.DisplayNumber != nil {
67 args["display_number"] = *opts.DisplayNumber
68 }
69 if opts.EnableZoom != nil {
70 args["enable_zoom"] = *opts.EnableZoom
71 }
72 if opts.CacheControl != nil {
73 args["cache_control"] = *opts.CacheControl
74 }
75 pdt := fantasy.ProviderDefinedTool{
76 ID: computerUseToolID,
77 Name: computerUseAPIName,
78 Args: args,
79 }
80 return fantasy.NewExecutableProviderTool(pdt, run)
81}
82
83// IsComputerUseTool reports whether tool is an Anthropic computer
84// use tool. It checks for a ProviderDefinedTool whose ID matches
85// the computer use tool identifier exactly.
86func IsComputerUseTool(tool fantasy.Tool) bool {
87 pdt, ok := asProviderDefinedTool(tool)
88 if !ok {
89 return false
90 }
91 return pdt.ID == computerUseToolID
92}
93
94// getComputerUseVersion extracts the ComputerUseToolVersion from a
95// provider-defined tool's Args map. It returns the version and true
96// if present, or the zero value and false otherwise.
97func getComputerUseVersion(tool fantasy.ProviderDefinedTool) (ComputerUseToolVersion, bool) {
98 v, ok := tool.Args["tool_version"]
99 if !ok {
100 return "", false
101 }
102 s, ok := v.(string)
103 if !ok {
104 return "", false
105 }
106 return ComputerUseToolVersion(s), true
107}
108
109// computerUseBetaFlag returns the Anthropic beta header value for
110// the given computer use tool version.
111func computerUseBetaFlag(version ComputerUseToolVersion) (string, error) {
112 switch version {
113 case ComputerUse20251124:
114 // TODO: Replace with SDK constant when available.
115 return "computer-use-2025-11-24", nil
116 case ComputerUse20250124:
117 return anthropicsdk.AnthropicBetaComputerUse2025_01_24, nil
118 default:
119 return "", fmt.Errorf(
120 "unsupported computer use tool version: %q", version,
121 )
122 }
123}
124
125// computerUseToolJSON builds the JSON representation of a computer
126// use tool from a ProviderDefinedTool's Args, using the beta SDK
127// types for serialization.
128func computerUseToolJSON(pdt fantasy.ProviderDefinedTool) (json.RawMessage, error) {
129 version, ok := getComputerUseVersion(pdt)
130 if !ok {
131 return nil, fmt.Errorf("computerUseToolJSON: tool_version arg is missing")
132 }
133
134 h, hOK := anyToInt64(pdt.Args["display_height_px"])
135 w, wOK := anyToInt64(pdt.Args["display_width_px"])
136 if !hOK || !wOK {
137 return nil, fmt.Errorf(
138 "display_height_px and display_width_px must be numeric"+
139 " (height ok=%t, width ok=%t)", hOK, wOK,
140 )
141 }
142
143 switch version {
144 case ComputerUse20250124:
145 tool := anthropicsdk.BetaToolUnionParamOfComputerUseTool20250124(h, w)
146 if v, ok := pdt.Args["display_number"]; ok {
147 dn, ok := anyToInt64(v)
148 if !ok {
149 return nil, fmt.Errorf("computer use tool has invalid display_number")
150 }
151 tool.OfComputerUseTool20250124.DisplayNumber = param.NewOpt(dn)
152 }
153 if _, ok := pdt.Args["cache_control"]; ok {
154 tool.OfComputerUseTool20250124.CacheControl = anthropicsdk.NewBetaCacheControlEphemeralParam()
155 }
156 return json.Marshal(tool)
157 case ComputerUse20251124:
158 tool := anthropicsdk.BetaToolUnionParamOfComputerUseTool20251124(h, w)
159 if v, ok := pdt.Args["display_number"]; ok {
160 dn, ok := anyToInt64(v)
161 if !ok {
162 return nil, fmt.Errorf("computer use tool has invalid display_number")
163 }
164 tool.OfComputerUseTool20251124.DisplayNumber = param.NewOpt(dn)
165 }
166 if v, ok := pdt.Args["enable_zoom"]; ok {
167 if b, ok := v.(bool); ok {
168 tool.OfComputerUseTool20251124.EnableZoom = param.NewOpt(b)
169 }
170 }
171 if _, ok := pdt.Args["cache_control"]; ok {
172 tool.OfComputerUseTool20251124.CacheControl = anthropicsdk.NewBetaCacheControlEphemeralParam()
173 }
174 return json.Marshal(tool)
175 default:
176 return nil, fmt.Errorf(
177 "unsupported computer use tool version: %q", version,
178 )
179 }
180}
181
// ComputerAction names the action Claude is asking the caller to
// perform on the computer.
//
// Unless an action's documentation says otherwise, respond by
// returning a screenshot built with NewComputerUseScreenshotResult.
type ComputerAction string

// Computer use actions. Each entry lists the ComputerUseInput fields
// that accompany the action.
const (
	// ActionScreenshot captures the current screen.
	//
	// No additional fields are populated.
	ActionScreenshot ComputerAction = "screenshot"

	// ActionLeftClick performs a left click.
	//
	//   - Coordinate: [x, y] target.
	//   - Text: optional modifier key (e.g. "shift", "ctrl").
	ActionLeftClick ComputerAction = "left_click"

	// ActionRightClick performs a right click (v20250124+).
	//
	//   - Coordinate: [x, y] target.
	//   - Text: optional modifier key (e.g. "shift", "ctrl").
	ActionRightClick ComputerAction = "right_click"

	// ActionDoubleClick performs a double click (v20250124+).
	//
	//   - Coordinate: [x, y] target.
	//   - Text: optional modifier key (e.g. "shift", "ctrl").
	ActionDoubleClick ComputerAction = "double_click"

	// ActionTripleClick performs a triple click (v20250124+).
	//
	//   - Coordinate: [x, y] target.
	//   - Text: optional modifier key (e.g. "shift", "ctrl").
	ActionTripleClick ComputerAction = "triple_click"

	// ActionMiddleClick performs a middle click (v20250124+).
	//
	//   - Coordinate: [x, y] target.
	//   - Text: optional modifier key (e.g. "shift", "ctrl").
	ActionMiddleClick ComputerAction = "middle_click"

	// ActionMouseMove moves the cursor.
	//
	//   - Coordinate: [x, y] destination.
	ActionMouseMove ComputerAction = "mouse_move"

	// ActionLeftClickDrag drags from one point to another
	// (v20250124+).
	//
	//   - StartCoordinate: [x, y] drag origin.
	//   - Coordinate: [x, y] drag destination.
	ActionLeftClickDrag ComputerAction = "left_click_drag"

	// ActionType types text.
	//
	//   - Text: the string to type.
	ActionType ComputerAction = "type"

	// ActionKey presses a key combination.
	//
	//   - Text: key combo string (e.g. "ctrl+c", "Return").
	ActionKey ComputerAction = "key"

	// ActionScroll scrolls the screen (v20250124+).
	//
	//   - Coordinate: [x, y] scroll origin.
	//   - ScrollDirection: "up", "down", "left", or "right".
	//   - ScrollAmount: scroll distance.
	//   - Text: optional modifier key.
	ActionScroll ComputerAction = "scroll"

	// ActionLeftMouseDown presses and holds the left mouse button
	// (v20250124+).
	//
	//   - Coordinate: [x, y] target.
	ActionLeftMouseDown ComputerAction = "left_mouse_down"

	// ActionLeftMouseUp releases the left mouse button
	// (v20250124+).
	//
	//   - Coordinate: [x, y] target.
	ActionLeftMouseUp ComputerAction = "left_mouse_up"

	// ActionHoldKey holds down a key for a specified duration
	// (v20250124+).
	//
	//   - Text: the key to hold.
	//   - Duration: hold time in seconds.
	ActionHoldKey ComputerAction = "hold_key"

	// ActionWait pauses between actions (v20250124+).
	//
	// No additional fields are populated.
	// NOTE(review): confirm against the API reference whether a
	// duration accompanies wait.
	ActionWait ComputerAction = "wait"

	// ActionZoom views a specific screen region at full resolution
	// (v20251124 only). Requires enable_zoom in the tool definition.
	//
	//   - Region: [x1, y1, x2, y2] top-left and bottom-right.
	//
	// Response: return a screenshot of the zoomed region at full
	// resolution.
	ActionZoom ComputerAction = "zoom"
)
273
274// ComputerUseInput is the parsed, typed representation of a computer
275// use tool call's Input JSON. Not all fields are populated for every
276// action — check Action first, then read the relevant fields.
277type ComputerUseInput struct {
278 Action ComputerAction `json:"action"`
279 // Coordinate is [x, y] for click, move, scroll, and
280 // drag-end actions.
281 Coordinate [2]int64 `json:"coordinate,omitempty"`
282 // StartCoordinate is [x, y] for left_click_drag start point.
283 StartCoordinate [2]int64 `json:"start_coordinate,omitempty"`
284 // Text is the string to type (ActionType), key combo
285 // (ActionKey), modifier key for click/scroll actions, or key
286 // to hold (ActionHoldKey).
287 Text string `json:"text,omitempty"`
288 // ScrollDirection is the scroll direction: "up", "down",
289 // "left", or "right".
290 ScrollDirection string `json:"scroll_direction,omitempty"`
291 // ScrollAmount is the number of scroll clicks.
292 ScrollAmount int64 `json:"scroll_amount,omitempty"`
293 // Duration is how long to hold the key in seconds
294 // (ActionHoldKey).
295 Duration int64 `json:"duration,omitempty"`
296 // Region is [x1, y1, x2, y2] defining the zoom area
297 // (ActionZoom, v20251124 only).
298 Region [4]int64 `json:"region,omitempty"`
299}
300
301// ParseComputerUseInput parses a ToolCallContent's Input string into
302// a typed ComputerUseInput. Returns an error if the JSON is invalid
303// or if coordinate arrays have the wrong number of elements.
304func ParseComputerUseInput(input string) (ComputerUseInput, error) {
305 var result ComputerUseInput
306 if err := json.Unmarshal([]byte(input), &result); err != nil {
307 return result, err
308 }
309
310 // Validate array field lengths. json.Unmarshal silently pads
311 // or truncates arrays that don't match the Go fixed-size type,
312 // which would produce wrong coordinates.
313 var raw map[string]json.RawMessage
314 if err := json.Unmarshal([]byte(input), &raw); err != nil {
315 return result, err
316 }
317 if err := validateArrayLen(raw, "coordinate", 2); err != nil {
318 return ComputerUseInput{}, err
319 }
320 if err := validateArrayLen(raw, "start_coordinate", 2); err != nil {
321 return ComputerUseInput{}, err
322 }
323 if err := validateArrayLen(raw, "region", 4); err != nil {
324 return ComputerUseInput{}, err
325 }
326
327 return result, nil
328}
329
// validateArrayLen checks that the JSON array stored under key in
// raw has exactly wantLen elements.
//
// It returns nil when key is absent from raw or when its value is
// the JSON literal null: encoding/json leaves a fixed-size array
// field untouched for null, so null is treated as "not provided"
// here as well. It returns an error when the value is not a JSON
// array or when the array has a different number of elements.
func validateArrayLen(raw map[string]json.RawMessage, key string, wantLen int) error {
	v, ok := raw[key]
	if !ok {
		return nil
	}
	// Decoding through a pointer distinguishes null (pointer stays
	// nil) from an empty array (pointer set to an empty slice).
	var elems *[]json.RawMessage
	if err := json.Unmarshal(v, &elems); err != nil {
		return fmt.Errorf("%s: expected array: %w", key, err)
	}
	if elems == nil {
		return nil
	}
	if got := len(*elems); got != wantLen {
		return fmt.Errorf(
			"%s: expected %d elements, got %d",
			key, wantLen, got,
		)
	}
	return nil
}
349
350// NewComputerUseScreenshotResult constructs a ToolResultPart
351// containing a screenshot image. This is the standard response for
352// almost every computer use action — Claude expects to see what
353// happened after executing the action.
354//
355// Parameters:
356// - toolCallID: the ToolCallID from the ToolCallContent that
357// requested this action.
358// - screenshotPNG: the raw PNG bytes of the screenshot. The
359// caller is responsible for capturing and (optionally) resizing
360// the screenshot before passing it here.
361//
362// The function base64-encodes the image data and sets the media
363// type to "image/png".
364func NewComputerUseScreenshotResult(
365 toolCallID string,
366 screenshotPNG []byte,
367) fantasy.ToolResultPart {
368 return fantasy.ToolResultPart{
369 ToolCallID: toolCallID,
370 Output: fantasy.ToolResultOutputContentMedia{
371 Data: base64.StdEncoding.EncodeToString(screenshotPNG),
372 MediaType: "image/png",
373 },
374 }
375}
376
377// NewComputerUseScreenshotResultWithMediaType is like
378// NewComputerUseScreenshotResult but allows specifying a custom
379// media type (e.g. "image/jpeg") and pre-encoded base64 data.
380func NewComputerUseScreenshotResultWithMediaType(
381 toolCallID string,
382 base64Data string,
383 mediaType string,
384) fantasy.ToolResultPart {
385 return fantasy.ToolResultPart{
386 ToolCallID: toolCallID,
387 Output: fantasy.ToolResultOutputContentMedia{
388 Data: base64Data,
389 MediaType: mediaType,
390 },
391 }
392}
393
394// NewComputerUseErrorResult constructs a ToolResultPart indicating
395// that the requested action failed. Claude will see this as an
396// error and may retry or adjust its approach.
397//
398// Use this when screenshot capture fails, coordinates are out of
399// bounds, the application is unresponsive, or any other execution
400// error occurs.
401func NewComputerUseErrorResult(
402 toolCallID string,
403 err error,
404) fantasy.ToolResultPart {
405 return fantasy.ToolResultPart{
406 ToolCallID: toolCallID,
407 Output: fantasy.ToolResultOutputContentError{
408 Error: err,
409 },
410 }
411}
412
413// NewComputerUseTextResult constructs a ToolResultPart containing a
414// plain text response. This is rarely needed for computer use —
415// most actions should return a screenshot — but can be useful for
416// returning metadata alongside the action or for testing.
417func NewComputerUseTextResult(
418 toolCallID string,
419 text string,
420) fantasy.ToolResultPart {
421 return fantasy.ToolResultPart{
422 ToolCallID: toolCallID,
423 Output: fantasy.ToolResultOutputContentText{
424 Text: text,
425 },
426 }
427}