computer_use.go

  1package anthropic
  2
  3import (
  4	"context"
  5	"encoding/base64"
  6	"encoding/json"
  7	"fmt"
  8
  9	"charm.land/fantasy"
 10	anthropicsdk "github.com/charmbracelet/anthropic-sdk-go"
 11	"github.com/charmbracelet/anthropic-sdk-go/packages/param"
 12)
 13
 14// computerUseToolID is the canonical identifier for
 15// Anthropic computer use tools. It follows the
 16// <provider>.<tool> convention used by ProviderDefinedTool.ID.
 17const computerUseToolID = "anthropic.computer"
 18
 19// computerUseAPIName is the tool name Anthropic's API expects
 20// on the wire.
 21const computerUseAPIName = "computer"
 22
 23// ComputerUseToolVersion identifies which version of the Anthropic
 24// computer use tool to use.
 25type ComputerUseToolVersion string
 26
 27const (
 28	// ComputerUse20251124 selects the November 2025 version of the
 29	// computer use tool.
 30	ComputerUse20251124 ComputerUseToolVersion = "computer_20251124"
 31	// ComputerUse20250124 selects the January 2025 version of the
 32	// computer use tool.
 33	ComputerUse20250124 ComputerUseToolVersion = "computer_20250124"
 34)
 35
// ComputerUseToolOptions holds the configuration for creating a
// computer use tool instance with NewComputerUseTool.
//
// Pointer fields are optional: a nil pointer means the matching
// argument is not recorded on the tool at all.
type ComputerUseToolOptions struct {
	// DisplayWidthPx is the width of the display in pixels.
	DisplayWidthPx int64
	// DisplayHeightPx is the height of the display in pixels.
	DisplayHeightPx int64
	// DisplayNumber is an optional X11 display number. When nil,
	// no display_number argument is set.
	DisplayNumber *int64
	// EnableZoom enables zoom support. Only used with the
	// ComputerUse20251124 version. When nil, no enable_zoom
	// argument is set.
	EnableZoom *bool
	// ToolVersion selects which computer use tool version to use.
	// NewComputerUseTool stores it without validation; an empty or
	// unknown version is only rejected later, when the tool is
	// serialized or its beta flag is looked up.
	ToolVersion ComputerUseToolVersion
	// CacheControl sets optional cache control for the tool.
	//
	// NOTE(review): computerUseToolJSON only checks whether a
	// cache_control argument is present and then emits the SDK's
	// ephemeral cache control; the contents of this value do not
	// appear to be forwarded. Confirm that is intended.
	CacheControl *CacheControl
}
 53
 54// NewComputerUseTool creates a new provider-defined tool configured
 55// for Anthropic computer use. The returned tool can be passed
 56// directly into a fantasy tool set via WithProviderDefinedTools.
 57func NewComputerUseTool(
 58	opts ComputerUseToolOptions,
 59	run func(ctx context.Context, call fantasy.ToolCall) (fantasy.ToolResponse, error),
 60) fantasy.ExecutableProviderTool {
 61	args := map[string]any{
 62		"display_width_px":  opts.DisplayWidthPx,
 63		"display_height_px": opts.DisplayHeightPx,
 64		"tool_version":      string(opts.ToolVersion),
 65	}
 66	if opts.DisplayNumber != nil {
 67		args["display_number"] = *opts.DisplayNumber
 68	}
 69	if opts.EnableZoom != nil {
 70		args["enable_zoom"] = *opts.EnableZoom
 71	}
 72	if opts.CacheControl != nil {
 73		args["cache_control"] = *opts.CacheControl
 74	}
 75	pdt := fantasy.ProviderDefinedTool{
 76		ID:   computerUseToolID,
 77		Name: computerUseAPIName,
 78		Args: args,
 79	}
 80	return fantasy.NewExecutableProviderTool(pdt, run)
 81}
 82
 83// IsComputerUseTool reports whether tool is an Anthropic computer
 84// use tool. It checks for a ProviderDefinedTool whose ID matches
 85// the computer use tool identifier exactly.
 86func IsComputerUseTool(tool fantasy.Tool) bool {
 87	pdt, ok := asProviderDefinedTool(tool)
 88	if !ok {
 89		return false
 90	}
 91	return pdt.ID == computerUseToolID
 92}
 93
 94// getComputerUseVersion extracts the ComputerUseToolVersion from a
 95// provider-defined tool's Args map. It returns the version and true
 96// if present, or the zero value and false otherwise.
 97func getComputerUseVersion(tool fantasy.ProviderDefinedTool) (ComputerUseToolVersion, bool) {
 98	v, ok := tool.Args["tool_version"]
 99	if !ok {
100		return "", false
101	}
102	s, ok := v.(string)
103	if !ok {
104		return "", false
105	}
106	return ComputerUseToolVersion(s), true
107}
108
109// computerUseBetaFlag returns the Anthropic beta header value for
110// the given computer use tool version.
111func computerUseBetaFlag(version ComputerUseToolVersion) (string, error) {
112	switch version {
113	case ComputerUse20251124:
114		// TODO: Replace with SDK constant when available.
115		return "computer-use-2025-11-24", nil
116	case ComputerUse20250124:
117		return anthropicsdk.AnthropicBetaComputerUse2025_01_24, nil
118	default:
119		return "", fmt.Errorf(
120			"unsupported computer use tool version: %q", version,
121		)
122	}
123}
124
125// computerUseToolJSON builds the JSON representation of a computer
126// use tool from a ProviderDefinedTool's Args, using the beta SDK
127// types for serialization.
128func computerUseToolJSON(pdt fantasy.ProviderDefinedTool) (json.RawMessage, error) {
129	version, ok := getComputerUseVersion(pdt)
130	if !ok {
131		return nil, fmt.Errorf("computerUseToolJSON: tool_version arg is missing")
132	}
133
134	h, hOK := anyToInt64(pdt.Args["display_height_px"])
135	w, wOK := anyToInt64(pdt.Args["display_width_px"])
136	if !hOK || !wOK {
137		return nil, fmt.Errorf(
138			"display_height_px and display_width_px must be numeric"+
139				" (height ok=%t, width ok=%t)", hOK, wOK,
140		)
141	}
142
143	switch version {
144	case ComputerUse20250124:
145		tool := anthropicsdk.BetaToolUnionParamOfComputerUseTool20250124(h, w)
146		if v, ok := pdt.Args["display_number"]; ok {
147			dn, ok := anyToInt64(v)
148			if !ok {
149				return nil, fmt.Errorf("computer use tool has invalid display_number")
150			}
151			tool.OfComputerUseTool20250124.DisplayNumber = param.NewOpt(dn)
152		}
153		if _, ok := pdt.Args["cache_control"]; ok {
154			tool.OfComputerUseTool20250124.CacheControl = anthropicsdk.NewBetaCacheControlEphemeralParam()
155		}
156		return json.Marshal(tool)
157	case ComputerUse20251124:
158		tool := anthropicsdk.BetaToolUnionParamOfComputerUseTool20251124(h, w)
159		if v, ok := pdt.Args["display_number"]; ok {
160			dn, ok := anyToInt64(v)
161			if !ok {
162				return nil, fmt.Errorf("computer use tool has invalid display_number")
163			}
164			tool.OfComputerUseTool20251124.DisplayNumber = param.NewOpt(dn)
165		}
166		if v, ok := pdt.Args["enable_zoom"]; ok {
167			if b, ok := v.(bool); ok {
168				tool.OfComputerUseTool20251124.EnableZoom = param.NewOpt(b)
169			}
170		}
171		if _, ok := pdt.Args["cache_control"]; ok {
172			tool.OfComputerUseTool20251124.CacheControl = anthropicsdk.NewBetaCacheControlEphemeralParam()
173		}
174		return json.Marshal(tool)
175	default:
176		return nil, fmt.Errorf(
177			"unsupported computer use tool version: %q", version,
178		)
179	}
180}
181
// ComputerAction identifies the action Claude wants to perform.
// Its string value is the action name carried in the "action"
// field of a computer use tool call's input (see
// ComputerUseInput.Action).
//
// Unless noted otherwise on a specific action, respond by returning a
// screenshot using NewComputerUseScreenshotResult.
type ComputerAction string

// Known computer use actions. Each constant's comment lists the
// ComputerUseInput fields that accompany the action.
const (
	// ActionScreenshot captures the current screen.
	//
	// No additional fields are populated.
	ActionScreenshot ComputerAction = "screenshot"
	// ActionLeftClick performs a left click.
	//
	//   - Coordinate: [x, y] target.
	//   - Text: optional modifier key (e.g. "shift", "ctrl").
	ActionLeftClick ComputerAction = "left_click"
	// ActionRightClick performs a right click (v20250124+).
	//
	//   - Coordinate: [x, y] target.
	//   - Text: optional modifier key (e.g. "shift", "ctrl").
	ActionRightClick ComputerAction = "right_click"
	// ActionDoubleClick performs a double click (v20250124+).
	//
	//   - Coordinate: [x, y] target.
	//   - Text: optional modifier key (e.g. "shift", "ctrl").
	ActionDoubleClick ComputerAction = "double_click"
	// ActionTripleClick performs a triple click (v20250124+).
	//
	//   - Coordinate: [x, y] target.
	//   - Text: optional modifier key (e.g. "shift", "ctrl").
	ActionTripleClick ComputerAction = "triple_click"
	// ActionMiddleClick performs a middle click (v20250124+).
	//
	//   - Coordinate: [x, y] target.
	//   - Text: optional modifier key (e.g. "shift", "ctrl").
	ActionMiddleClick ComputerAction = "middle_click"
	// ActionMouseMove moves the cursor.
	//
	//   - Coordinate: [x, y] destination.
	ActionMouseMove ComputerAction = "mouse_move"
	// ActionLeftClickDrag drags from one point to another
	// (v20250124+).
	//
	//   - StartCoordinate: [x, y] drag origin.
	//   - Coordinate: [x, y] drag destination.
	ActionLeftClickDrag ComputerAction = "left_click_drag"
	// ActionType types text.
	//
	//   - Text: the string to type.
	ActionType ComputerAction = "type"
	// ActionKey presses a key combination.
	//
	//   - Text: key combo string (e.g. "ctrl+c", "Return").
	ActionKey ComputerAction = "key"
	// ActionScroll scrolls the screen (v20250124+).
	//
	//   - Coordinate: [x, y] scroll origin.
	//   - ScrollDirection: "up", "down", "left", or "right".
	//   - ScrollAmount: scroll distance.
	//   - Text: optional modifier key.
	ActionScroll ComputerAction = "scroll"
	// ActionLeftMouseDown presses and holds the left mouse button
	// (v20250124+).
	//
	//   - Coordinate: [x, y] target.
	ActionLeftMouseDown ComputerAction = "left_mouse_down"
	// ActionLeftMouseUp releases the left mouse button
	// (v20250124+).
	//
	//   - Coordinate: [x, y] target.
	ActionLeftMouseUp ComputerAction = "left_mouse_up"
	// ActionHoldKey holds down a key for a specified duration
	// (v20250124+).
	//
	//   - Text: the key to hold.
	//   - Duration: hold time in seconds.
	ActionHoldKey ComputerAction = "hold_key"
	// ActionWait pauses between actions (v20250124+).
	//
	// No additional fields are populated.
	//
	// NOTE(review): Anthropic's reference tool implementation
	// appears to accept a duration for wait as well; confirm
	// whether Duration should be listed here.
	ActionWait ComputerAction = "wait"
	// ActionZoom views a specific screen region at full
	// resolution (v20251124 only). Requires enable_zoom in the
	// tool definition.
	//
	//   - Region: [x1, y1, x2, y2] top-left and bottom-right.
	//
	// Response: return a screenshot of the zoomed region at
	// full resolution.
	ActionZoom ComputerAction = "zoom"
)
273
// ComputerUseInput is the parsed, typed representation of a computer
// use tool call's Input JSON. Not all fields are populated for every
// action — check Action first, then read the relevant fields.
//
// Fields that are absent from the JSON keep their Go zero value, so
// an omitted coordinate cannot be told apart from an explicit
// [0, 0]; rely on Action to decide which fields are meaningful.
// Prefer ParseComputerUseInput over calling json.Unmarshal directly:
// it additionally rejects arrays with the wrong number of elements.
//
// Encoding note: omitempty never omits a fixed-size array (its
// length is never zero), so marshaling this struct always emits
// coordinate, start_coordinate, and region.
type ComputerUseInput struct {
	// Action is the requested action; it selects which of the
	// remaining fields are meaningful.
	Action ComputerAction `json:"action"`
	// Coordinate is [x, y] for click, move, scroll, and
	// drag-end actions.
	Coordinate [2]int64 `json:"coordinate,omitempty"`
	// StartCoordinate is [x, y] for left_click_drag start point.
	StartCoordinate [2]int64 `json:"start_coordinate,omitempty"`
	// Text is the string to type (ActionType), key combo
	// (ActionKey), modifier key for click/scroll actions, or key
	// to hold (ActionHoldKey).
	Text string `json:"text,omitempty"`
	// ScrollDirection is the scroll direction: "up", "down",
	// "left", or "right".
	ScrollDirection string `json:"scroll_direction,omitempty"`
	// ScrollAmount is the number of scroll clicks.
	ScrollAmount int64 `json:"scroll_amount,omitempty"`
	// Duration is how long to hold the key in seconds
	// (ActionHoldKey).
	//
	// NOTE(review): as an int64, decoding fails if the model sends
	// a fractional number of seconds; confirm the API only sends
	// whole numbers.
	Duration int64 `json:"duration,omitempty"`
	// Region is [x1, y1, x2, y2] defining the zoom area
	// (ActionZoom, v20251124 only).
	Region [4]int64 `json:"region,omitempty"`
}
300
301// ParseComputerUseInput parses a ToolCallContent's Input string into
302// a typed ComputerUseInput. Returns an error if the JSON is invalid
303// or if coordinate arrays have the wrong number of elements.
304func ParseComputerUseInput(input string) (ComputerUseInput, error) {
305	var result ComputerUseInput
306	if err := json.Unmarshal([]byte(input), &result); err != nil {
307		return result, err
308	}
309
310	// Validate array field lengths. json.Unmarshal silently pads
311	// or truncates arrays that don't match the Go fixed-size type,
312	// which would produce wrong coordinates.
313	var raw map[string]json.RawMessage
314	if err := json.Unmarshal([]byte(input), &raw); err != nil {
315		return result, err
316	}
317	if err := validateArrayLen(raw, "coordinate", 2); err != nil {
318		return ComputerUseInput{}, err
319	}
320	if err := validateArrayLen(raw, "start_coordinate", 2); err != nil {
321		return ComputerUseInput{}, err
322	}
323	if err := validateArrayLen(raw, "region", 4); err != nil {
324		return ComputerUseInput{}, err
325	}
326
327	return result, nil
328}
329
// validateArrayLen verifies that raw[key], when present, is a JSON
// array holding exactly wantLen elements.
//
// It returns nil when key is absent from raw or the length matches.
// It returns an error wrapping the decoder's error when the value
// cannot be decoded as an array, and a descriptive error when the
// element count differs from wantLen.
func validateArrayLen(raw map[string]json.RawMessage, key string, wantLen int) error {
	encoded, present := raw[key]
	if !present {
		return nil
	}

	// Decode only the outer array; the elements stay raw because
	// only their count matters here.
	var items []json.RawMessage
	err := json.Unmarshal(encoded, &items)
	switch {
	case err != nil:
		return fmt.Errorf("%s: expected array: %w", key, err)
	case len(items) != wantLen:
		return fmt.Errorf(
			"%s: expected %d elements, got %d",
			key, wantLen, len(items),
		)
	default:
		return nil
	}
}
349
350// NewComputerUseScreenshotResult constructs a ToolResultPart
351// containing a screenshot image. This is the standard response for
352// almost every computer use action — Claude expects to see what
353// happened after executing the action.
354//
355// Parameters:
356//   - toolCallID: the ToolCallID from the ToolCallContent that
357//     requested this action.
358//   - screenshotPNG: the raw PNG bytes of the screenshot. The
359//     caller is responsible for capturing and (optionally) resizing
360//     the screenshot before passing it here.
361//
362// The function base64-encodes the image data and sets the media
363// type to "image/png".
364func NewComputerUseScreenshotResult(
365	toolCallID string,
366	screenshotPNG []byte,
367) fantasy.ToolResultPart {
368	return fantasy.ToolResultPart{
369		ToolCallID: toolCallID,
370		Output: fantasy.ToolResultOutputContentMedia{
371			Data:      base64.StdEncoding.EncodeToString(screenshotPNG),
372			MediaType: "image/png",
373		},
374	}
375}
376
377// NewComputerUseScreenshotResultWithMediaType is like
378// NewComputerUseScreenshotResult but allows specifying a custom
379// media type (e.g. "image/jpeg") and pre-encoded base64 data.
380func NewComputerUseScreenshotResultWithMediaType(
381	toolCallID string,
382	base64Data string,
383	mediaType string,
384) fantasy.ToolResultPart {
385	return fantasy.ToolResultPart{
386		ToolCallID: toolCallID,
387		Output: fantasy.ToolResultOutputContentMedia{
388			Data:      base64Data,
389			MediaType: mediaType,
390		},
391	}
392}
393
394// NewComputerUseErrorResult constructs a ToolResultPart indicating
395// that the requested action failed. Claude will see this as an
396// error and may retry or adjust its approach.
397//
398// Use this when screenshot capture fails, coordinates are out of
399// bounds, the application is unresponsive, or any other execution
400// error occurs.
401func NewComputerUseErrorResult(
402	toolCallID string,
403	err error,
404) fantasy.ToolResultPart {
405	return fantasy.ToolResultPart{
406		ToolCallID: toolCallID,
407		Output: fantasy.ToolResultOutputContentError{
408			Error: err,
409		},
410	}
411}
412
413// NewComputerUseTextResult constructs a ToolResultPart containing a
414// plain text response. This is rarely needed for computer use —
415// most actions should return a screenshot — but can be useful for
416// returning metadata alongside the action or for testing.
417func NewComputerUseTextResult(
418	toolCallID string,
419	text string,
420) fantasy.ToolResultPart {
421	return fantasy.ToolResultPart{
422		ToolCallID: toolCallID,
423		Output: fantasy.ToolResultOutputContentText{
424			Text: text,
425		},
426	}
427}