browse.go

  1// Package browse provides browser automation tools for the agent
  2package browse
  3
  4import (
  5	"context"
  6	"encoding/base64"
  7	"encoding/json"
  8	"fmt"
  9	"log"
 10	"net/http"
 11	"net/url"
 12	"os"
 13	"path/filepath"
 14	"strings"
 15	"sync"
 16	"time"
 17
 18	"github.com/chromedp/cdproto/browser"
 19	"github.com/chromedp/cdproto/runtime"
 20	"github.com/chromedp/chromedp"
 21	"github.com/google/uuid"
 22	"shelley.exe.dev/llm"
 23	"shelley.exe.dev/llm/imageutil"
 24)
 25
 26// ScreenshotDir is the directory where screenshots are stored
 27const ScreenshotDir = "/tmp/shelley-screenshots"
 28
 29// DownloadDir is the directory where downloads are stored
 30const DownloadDir = "/tmp/shelley-downloads"
 31
 32// ConsoleLogsDir is the directory where large console logs are stored
 33const ConsoleLogsDir = "/tmp/shelley-console-logs"
 34
 35// ConsoleLogSizeThreshold is the size in bytes above which console logs are written to a file
 36const ConsoleLogSizeThreshold = 1024
 37
 38// DefaultIdleTimeout is how long to wait before shutting down an idle browser
 39const DefaultIdleTimeout = 30 * time.Minute
 40
 41// DownloadInfo tracks information about a completed download
 42type DownloadInfo struct {
 43	GUID              string
 44	URL               string
 45	SuggestedFilename string
 46	FinalPath         string
 47	Completed         bool
 48	Error             string
 49}
 50
 51// BrowseTools contains all browser tools and manages a shared browser instance
 52type BrowseTools struct {
 53	ctx              context.Context
 54	allocCtx         context.Context
 55	allocCancel      context.CancelFunc
 56	browserCtx       context.Context
 57	browserCtxCancel context.CancelFunc
 58	mux              sync.Mutex
 59	// Map to track screenshots by ID and their creation time
 60	screenshots      map[string]time.Time
 61	screenshotsMutex sync.Mutex
 62	// Console logs storage
 63	consoleLogs      []*runtime.EventConsoleAPICalled
 64	consoleLogsMutex sync.Mutex
 65	maxConsoleLogs   int
 66	// Idle timeout management
 67	idleTimeout time.Duration
 68	idleTimer   *time.Timer
 69	// Max image dimension for resizing (0 means use default)
 70	maxImageDimension int
 71	// Download tracking
 72	downloads      map[string]*DownloadInfo // keyed by GUID
 73	downloadsMutex sync.Mutex
 74	downloadCond   *sync.Cond
 75}
 76
 77// NewBrowseTools creates a new set of browser automation tools.
 78// idleTimeout is how long to wait before shutting down an idle browser (0 uses default).
 79// maxImageDimension is the max pixel dimension for images (0 means unlimited).
 80func NewBrowseTools(ctx context.Context, idleTimeout time.Duration, maxImageDimension int) *BrowseTools {
 81	if idleTimeout <= 0 {
 82		idleTimeout = DefaultIdleTimeout
 83	}
 84	for _, dir := range []string{ScreenshotDir, DownloadDir, ConsoleLogsDir} {
 85		if err := os.MkdirAll(dir, 0o755); err != nil {
 86			log.Printf("Failed to create directory %s: %v", dir, err)
 87		}
 88	}
 89
 90	bt := &BrowseTools{
 91		ctx:               ctx,
 92		screenshots:       make(map[string]time.Time),
 93		consoleLogs:       make([]*runtime.EventConsoleAPICalled, 0),
 94		maxConsoleLogs:    100,
 95		maxImageDimension: maxImageDimension,
 96		idleTimeout:       idleTimeout,
 97		downloads:         make(map[string]*DownloadInfo),
 98	}
 99	bt.downloadCond = sync.NewCond(&bt.downloadsMutex)
100	return bt
101}
102
103// GetBrowserContext returns the browser context, initializing if needed and resetting the idle timer.
104func (b *BrowseTools) GetBrowserContext() (context.Context, error) {
105	b.mux.Lock()
106	defer b.mux.Unlock()
107
108	// If browser exists, reset idle timer and return
109	if b.browserCtx != nil {
110		b.resetIdleTimerLocked()
111		return b.browserCtx, nil
112	}
113
114	// Initialize a new browser
115	opts := chromedp.DefaultExecAllocatorOptions[:]
116	opts = append(opts, chromedp.NoSandbox)
117	opts = append(opts, chromedp.Flag("--disable-dbus", true))
118	opts = append(opts, chromedp.WSURLReadTimeout(60*time.Second))
119
120	allocCtx, allocCancel := chromedp.NewExecAllocator(b.ctx, opts...)
121	browserCtx, browserCancel := chromedp.NewContext(
122		allocCtx,
123		chromedp.WithLogf(log.Printf),
124		chromedp.WithErrorf(log.Printf),
125		chromedp.WithBrowserOption(chromedp.WithDialTimeout(60*time.Second)),
126	)
127
128	// Set up event listeners for console logs and downloads
129	chromedp.ListenTarget(browserCtx, func(ev any) {
130		switch e := ev.(type) {
131		case *runtime.EventConsoleAPICalled:
132			b.captureConsoleLog(e)
133		case *browser.EventDownloadWillBegin:
134			b.handleDownloadWillBegin(e)
135		case *browser.EventDownloadProgress:
136			b.handleDownloadProgress(e)
137		}
138	})
139
140	// Start the browser
141	if err := chromedp.Run(browserCtx); err != nil {
142		allocCancel()
143		return nil, fmt.Errorf("failed to start browser (please apt get chromium or equivalent): %w", err)
144	}
145
146	// Set default viewport size to 1280x720 (16:9 widescreen)
147	if err := chromedp.Run(browserCtx, chromedp.EmulateViewport(1280, 720)); err != nil {
148		browserCancel()
149		allocCancel()
150		return nil, fmt.Errorf("failed to set default viewport: %w", err)
151	}
152
153	// Configure download behavior to allow downloads and emit events
154	if err := chromedp.Run(browserCtx,
155		browser.SetDownloadBehavior(browser.SetDownloadBehaviorBehaviorAllowAndName).
156			WithDownloadPath(DownloadDir).
157			WithEventsEnabled(true),
158	); err != nil {
159		browserCancel()
160		allocCancel()
161		return nil, fmt.Errorf("failed to configure download behavior: %w", err)
162	}
163
164	b.allocCtx = allocCtx
165	b.allocCancel = allocCancel
166	b.browserCtx = browserCtx
167	b.browserCtxCancel = browserCancel
168
169	b.resetIdleTimerLocked()
170
171	return b.browserCtx, nil
172}
173
174// resetIdleTimerLocked resets or starts the idle timer. Caller must hold b.mux.
175func (b *BrowseTools) resetIdleTimerLocked() {
176	if b.idleTimer != nil {
177		b.idleTimer.Stop()
178	}
179	b.idleTimer = time.AfterFunc(b.idleTimeout, b.idleShutdown)
180}
181
182// idleShutdown is called when the idle timer fires
183func (b *BrowseTools) idleShutdown() {
184	b.mux.Lock()
185	defer b.mux.Unlock()
186
187	if b.browserCtx == nil {
188		return
189	}
190
191	log.Printf("Browser idle for %v, shutting down", b.idleTimeout)
192	b.closeBrowserLocked()
193}
194
195// closeBrowserLocked shuts down the browser. Caller must hold b.mux.
196func (b *BrowseTools) closeBrowserLocked() {
197	if b.idleTimer != nil {
198		b.idleTimer.Stop()
199		b.idleTimer = nil
200	}
201
202	if b.browserCtxCancel != nil {
203		b.browserCtxCancel()
204		b.browserCtxCancel = nil
205	}
206
207	if b.allocCancel != nil {
208		b.allocCancel()
209		b.allocCancel = nil
210	}
211
212	b.browserCtx = nil
213	b.allocCtx = nil
214}
215
216// Close shuts down the browser
217func (b *BrowseTools) Close() {
218	b.mux.Lock()
219	defer b.mux.Unlock()
220	b.closeBrowserLocked()
221}
222
// navigateInput is the JSON payload accepted by the browser_navigate tool.
type navigateInput struct {
	// URL is the address to navigate to.
	URL string `json:"url"`
	// Timeout is an optional Go duration string bounding the navigation.
	Timeout string `json:"timeout,omitempty"`
}
228
// isPort80 reports whether urlStr definitely targets port 80: either the port
// is given explicitly as 80, or no port is given and the scheme is plain http.
// URLs that fail to parse are reported as false.
func isPort80(urlStr string) bool {
	u, err := url.Parse(urlStr)
	if err != nil {
		return false
	}

	switch u.Port() {
	case "80":
		return true
	case "":
		// No explicit port: only http defaults to port 80.
		return u.Scheme == "http"
	default:
		return false
	}
}
238
239// NewNavigateTool creates a tool for navigating to URLs
240func (b *BrowseTools) NewNavigateTool() *llm.Tool {
241	return &llm.Tool{
242		Name:        "browser_navigate",
243		Description: "Navigate the browser to a specific URL and wait for page to load",
244		InputSchema: json.RawMessage(`{
245			"type": "object",
246			"properties": {
247				"url": {
248					"type": "string",
249					"description": "The URL to navigate to"
250				},
251				"timeout": {
252					"type": "string",
253					"description": "Timeout as a Go duration string (default: 15s)"
254				}
255			},
256			"required": ["url"]
257		}`),
258		Run: b.navigateRun,
259	}
260}
261
262func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
263	var input navigateInput
264	if err := json.Unmarshal(m, &input); err != nil {
265		return llm.ErrorfToolOut("invalid input: %w", err)
266	}
267
268	if isPort80(input.URL) {
269		return llm.ErrorToolOut(fmt.Errorf("port 80 is not the port you're looking for--port 80 is the main sketch server"))
270	}
271
272	browserCtx, err := b.GetBrowserContext()
273	if err != nil {
274		return llm.ErrorToolOut(err)
275	}
276
277	// Create a timeout context for this operation
278	timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
279	defer cancel()
280
281	err = chromedp.Run(timeoutCtx,
282		chromedp.Navigate(input.URL),
283		chromedp.WaitReady("body"),
284	)
285	if err != nil {
286		// Navigation to download URLs fails with ERR_ABORTED, but the download may have succeeded.
287		// Wait briefly for download events to be processed, then check if we got any downloads.
288		if strings.Contains(err.Error(), "net::ERR_ABORTED") {
289			time.Sleep(500 * time.Millisecond)
290			downloads := b.GetRecentDownloads()
291			if len(downloads) > 0 {
292				// Download succeeded - report it instead of error
293				var sb strings.Builder
294				sb.WriteString("Navigation triggered download(s):")
295				for _, d := range downloads {
296					if d.Error != "" {
297						sb.WriteString(fmt.Sprintf("\n  - %s (from %s): ERROR: %s", d.SuggestedFilename, d.URL, d.Error))
298					} else {
299						sb.WriteString(fmt.Sprintf("\n  - %s (from %s) saved to: %s", d.SuggestedFilename, d.URL, d.FinalPath))
300					}
301				}
302				return llm.ToolOut{LLMContent: llm.TextContent(sb.String())}
303			}
304		}
305		return llm.ErrorToolOut(err)
306	}
307
308	return b.toolOutWithDownloads("done")
309}
310
// resizeInput is the JSON payload accepted by the browser_resize tool.
type resizeInput struct {
	// Width is the requested viewport width in pixels.
	Width int `json:"width"`
	// Height is the requested viewport height in pixels.
	Height int `json:"height"`
	// Timeout is an optional Go duration string bounding the operation.
	Timeout string `json:"timeout,omitempty"`
}
317
318// NewResizeTool creates a tool for resizing the browser viewport
319func (b *BrowseTools) NewResizeTool() *llm.Tool {
320	return &llm.Tool{
321		Name:        "browser_resize",
322		Description: "Resize the browser viewport to a specific width and height",
323		InputSchema: json.RawMessage(`{
324			"type": "object",
325			"properties": {
326				"width": {
327					"type": "integer",
328					"description": "Viewport width in pixels"
329				},
330				"height": {
331					"type": "integer",
332					"description": "Viewport height in pixels"
333				},
334				"timeout": {
335					"type": "string",
336					"description": "Timeout as a Go duration string (default: 15s)"
337				}
338			},
339			"required": ["width", "height"]
340		}`),
341		Run: b.resizeRun,
342	}
343}
344
345func (b *BrowseTools) resizeRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
346	var input resizeInput
347	if err := json.Unmarshal(m, &input); err != nil {
348		return llm.ErrorfToolOut("invalid input: %w", err)
349	}
350
351	if input.Width <= 0 || input.Height <= 0 {
352		return llm.ErrorToolOut(fmt.Errorf("invalid dimensions: width and height must be positive"))
353	}
354
355	browserCtx, err := b.GetBrowserContext()
356	if err != nil {
357		return llm.ErrorToolOut(err)
358	}
359
360	timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
361	defer cancel()
362
363	err = chromedp.Run(timeoutCtx,
364		chromedp.EmulateViewport(int64(input.Width), int64(input.Height)),
365	)
366	if err != nil {
367		return llm.ErrorToolOut(err)
368	}
369
370	return llm.ToolOut{LLMContent: llm.TextContent("done")}
371}
372
// evalInput is the JSON payload accepted by the browser_eval tool.
type evalInput struct {
	// Expression is the JavaScript to evaluate.
	Expression string `json:"expression"`
	// Timeout is an optional Go duration string bounding the evaluation.
	Timeout string `json:"timeout,omitempty"`
	// Await selects whether promises are awaited; nil means "use the default".
	Await *bool `json:"await,omitempty"`
}
379
380// NewEvalTool creates a tool for evaluating JavaScript
381func (b *BrowseTools) NewEvalTool() *llm.Tool {
382	return &llm.Tool{
383		Name: "browser_eval",
384		Description: `Evaluate JavaScript in the browser context.
385Your go-to tool for interacting with content: clicking buttons, typing, getting content, scrolling, resizing, waiting for content/selector to be ready, etc.`,
386		InputSchema: json.RawMessage(`{
387			"type": "object",
388			"properties": {
389				"expression": {
390					"type": "string",
391					"description": "JavaScript expression to evaluate"
392				},
393				"timeout": {
394					"type": "string",
395					"description": "Timeout as a Go duration string (default: 15s)"
396				},
397				"await": {
398					"type": "boolean",
399					"description": "If true, wait for promises to resolve and return their resolved value (default: true)"
400				}
401			},
402			"required": ["expression"]
403		}`),
404		Run: b.evalRun,
405	}
406}
407
408func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
409	var input evalInput
410	if err := json.Unmarshal(m, &input); err != nil {
411		return llm.ErrorfToolOut("invalid input: %w", err)
412	}
413
414	browserCtx, err := b.GetBrowserContext()
415	if err != nil {
416		return llm.ErrorToolOut(err)
417	}
418
419	// Create a timeout context for this operation
420	timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
421	defer cancel()
422
423	var result any
424	var evalOps []chromedp.EvaluateOption
425
426	await := true
427	if input.Await != nil {
428		await = *input.Await
429	}
430	if await {
431		evalOps = append(evalOps, func(p *runtime.EvaluateParams) *runtime.EvaluateParams {
432			return p.WithAwaitPromise(true)
433		})
434	}
435
436	evalAction := chromedp.Evaluate(input.Expression, &result, evalOps...)
437
438	err = chromedp.Run(timeoutCtx, evalAction)
439	if err != nil {
440		return llm.ErrorToolOut(err)
441	}
442
443	// Return the result as JSON
444	response, err := json.Marshal(result)
445	if err != nil {
446		return llm.ErrorfToolOut("failed to marshal response: %w", err)
447	}
448
449	// If output exceeds threshold, write to file
450	if len(response) > ConsoleLogSizeThreshold {
451		filename := fmt.Sprintf("js_result_%s.json", uuid.New().String()[:8])
452		filePath := filepath.Join(ConsoleLogsDir, filename)
453		if err := os.WriteFile(filePath, response, 0o644); err != nil {
454			return llm.ErrorfToolOut("failed to write JS result to file: %w", err)
455		}
456		return b.toolOutWithDownloads(fmt.Sprintf(
457			"JavaScript result (%d bytes) written to: %s\nUse `cat %s` to view the full content.",
458			len(response), filePath, filePath))
459	}
460
461	return b.toolOutWithDownloads("<javascript_result>" + string(response) + "</javascript_result>")
462}
463
// screenshotInput is the JSON payload accepted by the
// browser_take_screenshot tool.
type screenshotInput struct {
	// Selector optionally restricts the screenshot to one element.
	Selector string `json:"selector,omitempty"`
	// Timeout is an optional Go duration string bounding the capture.
	Timeout string `json:"timeout,omitempty"`
}
469
470// NewScreenshotTool creates a tool for taking screenshots
471func (b *BrowseTools) NewScreenshotTool() *llm.Tool {
472	return &llm.Tool{
473		Name:        "browser_take_screenshot",
474		Description: "Take a screenshot of the page or a specific element",
475		InputSchema: json.RawMessage(`{
476			"type": "object",
477			"properties": {
478				"selector": {
479					"type": "string",
480					"description": "CSS selector for the element to screenshot (optional)"
481				},
482				"timeout": {
483					"type": "string",
484					"description": "Timeout as a Go duration string (default: 15s)"
485				}
486			}
487		}`),
488		Run: b.screenshotRun,
489	}
490}
491
492func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
493	var input screenshotInput
494	if err := json.Unmarshal(m, &input); err != nil {
495		return llm.ErrorfToolOut("invalid input: %w", err)
496	}
497
498	// Try to get a browser context; if unavailable, return an error
499	browserCtx, err := b.GetBrowserContext()
500	if err != nil {
501		return llm.ErrorToolOut(err)
502	}
503
504	// Create a timeout context for this operation
505	timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
506	defer cancel()
507
508	var buf []byte
509	var actions []chromedp.Action
510
511	if input.Selector != "" {
512		// Take screenshot of specific element
513		actions = append(actions,
514			chromedp.WaitReady(input.Selector),
515			chromedp.Screenshot(input.Selector, &buf, chromedp.NodeVisible),
516		)
517	} else {
518		// Take full page screenshot
519		actions = append(actions, chromedp.CaptureScreenshot(&buf))
520	}
521
522	err = chromedp.Run(timeoutCtx, actions...)
523	if err != nil {
524		return llm.ErrorToolOut(err)
525	}
526
527	// Save the screenshot and get its ID for potential future reference
528	id := b.SaveScreenshot(buf)
529	if id == "" {
530		return llm.ErrorToolOut(fmt.Errorf("failed to save screenshot"))
531	}
532
533	// Get the full path to the screenshot
534	screenshotPath := GetScreenshotPath(id)
535
536	// Resize image if needed to fit within model's image dimension limits
537	imageData := buf
538	format := "png"
539	resized := false
540	if b.maxImageDimension > 0 {
541		var err error
542		imageData, format, resized, err = imageutil.ResizeImage(buf, b.maxImageDimension)
543		if err != nil {
544			return llm.ErrorToolOut(fmt.Errorf("failed to resize screenshot: %w", err))
545		}
546	}
547
548	base64Data := base64.StdEncoding.EncodeToString(imageData)
549	mediaType := "image/" + format
550
551	display := map[string]any{
552		"type":     "screenshot",
553		"id":       id,
554		"url":      "/api/read?path=" + url.QueryEscape(screenshotPath),
555		"path":     screenshotPath,
556		"selector": input.Selector,
557	}
558
559	description := fmt.Sprintf("Screenshot taken (saved as %s)", screenshotPath)
560	if resized {
561		description += " [resized]"
562	}
563
564	return llm.ToolOut{LLMContent: []llm.Content{
565		{
566			Type: llm.ContentTypeText,
567			Text: description,
568		},
569		{
570			Type:      llm.ContentTypeText,
571			MediaType: mediaType,
572			Data:      base64Data,
573		},
574	}, Display: display}
575}
576
577// GetTools returns browser tools, optionally filtering out screenshot-related tools
578func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
579	tools := []*llm.Tool{
580		b.NewNavigateTool(),
581		b.NewEvalTool(),
582		b.NewResizeTool(),
583		b.NewRecentConsoleLogsTool(),
584		b.NewClearConsoleLogsTool(),
585	}
586
587	// Add screenshot-related tools if supported
588	if includeScreenshotTools {
589		tools = append(tools, b.NewScreenshotTool())
590		tools = append(tools, b.NewReadImageTool())
591	}
592
593	return tools
594}
595
596// SaveScreenshot saves a screenshot to disk and returns its ID
597func (b *BrowseTools) SaveScreenshot(data []byte) string {
598	// Generate a unique ID
599	id := uuid.New().String()
600
601	// Save the file
602	filePath := filepath.Join(ScreenshotDir, id+".png")
603	if err := os.WriteFile(filePath, data, 0o644); err != nil {
604		log.Printf("Failed to save screenshot: %v", err)
605		return ""
606	}
607
608	// Track this screenshot
609	b.screenshotsMutex.Lock()
610	b.screenshots[id] = time.Now()
611	b.screenshotsMutex.Unlock()
612
613	return id
614}
615
616// GetScreenshotPath returns the full path to a screenshot by ID
617func GetScreenshotPath(id string) string {
618	return filepath.Join(ScreenshotDir, id+".png")
619}
620
// readImageInput is the JSON payload accepted by the read_image tool.
type readImageInput struct {
	// Path is the location of the image file to read.
	Path string `json:"path"`
	// Timeout is an optional Go duration string.
	Timeout string `json:"timeout,omitempty"`
}
626
627// NewReadImageTool creates a tool for reading images and returning them as base64 encoded data
628func (b *BrowseTools) NewReadImageTool() *llm.Tool {
629	return &llm.Tool{
630		Name:        "read_image",
631		Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM",
632		InputSchema: json.RawMessage(`{
633			"type": "object",
634			"properties": {
635				"path": {
636					"type": "string",
637					"description": "Path to the image file to read"
638				},
639				"timeout": {
640					"type": "string",
641					"description": "Timeout as a Go duration string (default: 15s)"
642				}
643			},
644			"required": ["path"]
645		}`),
646		Run: b.readImageRun,
647	}
648}
649
650func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
651	var input readImageInput
652	if err := json.Unmarshal(m, &input); err != nil {
653		return llm.ErrorfToolOut("invalid input: %w", err)
654	}
655
656	// Check if the path exists
657	if _, err := os.Stat(input.Path); os.IsNotExist(err) {
658		return llm.ErrorfToolOut("image file not found: %s", input.Path)
659	}
660
661	// Read the file
662	imageData, err := os.ReadFile(input.Path)
663	if err != nil {
664		return llm.ErrorfToolOut("failed to read image file: %w", err)
665	}
666
667	// Convert HEIC to PNG if needed (Go's image library doesn't support HEIC)
668	converted := false
669	if imageutil.IsHEIC(imageData) {
670		imageData, err = imageutil.ConvertHEICToPNG(imageData)
671		if err != nil {
672			return llm.ErrorfToolOut("failed to convert HEIC image: %w", err)
673		}
674		converted = true
675	}
676
677	detectedType := http.DetectContentType(imageData)
678	if !strings.HasPrefix(detectedType, "image/") {
679		return llm.ErrorfToolOut("file is not an image: %s", detectedType)
680	}
681
682	// Resize image if needed to fit within model's image dimension limits
683	resized := false
684	format := strings.TrimPrefix(detectedType, "image/")
685	if b.maxImageDimension > 0 {
686		var err error
687		imageData, format, resized, err = imageutil.ResizeImage(imageData, b.maxImageDimension)
688		if err != nil {
689			return llm.ErrorToolOut(fmt.Errorf("failed to resize image: %w", err))
690		}
691	}
692
693	base64Data := base64.StdEncoding.EncodeToString(imageData)
694	mediaType := "image/" + format
695
696	description := fmt.Sprintf("Image from %s (type: %s)", input.Path, mediaType)
697	if converted {
698		description += " [converted from HEIC]"
699	}
700	if resized {
701		description += " [resized]"
702	}
703
704	return llm.ToolOut{LLMContent: []llm.Content{
705		{
706			Type: llm.ContentTypeText,
707			Text: description,
708		},
709		{
710			Type:      llm.ContentTypeText,
711			MediaType: mediaType,
712			Data:      base64Data,
713		},
714	}}
715}
716
// parseTimeout converts a Go duration string such as "30s" or "2m" into a
// time.Duration.
//
// It returns the 15-second default when the string is empty, cannot be parsed,
// or denotes a zero or negative duration. A non-positive timeout would make
// any context derived from it expire immediately.
func parseTimeout(timeout string) time.Duration {
	const defaultTimeout = 15 * time.Second

	dur, err := time.ParseDuration(timeout)
	if err != nil || dur <= 0 {
		return defaultTimeout
	}
	return dur
}
726
727// captureConsoleLog captures a console log event and stores it
728func (b *BrowseTools) captureConsoleLog(e *runtime.EventConsoleAPICalled) {
729	// Add to logs with mutex protection
730	b.consoleLogsMutex.Lock()
731	defer b.consoleLogsMutex.Unlock()
732
733	// Add the log and maintain max size
734	b.consoleLogs = append(b.consoleLogs, e)
735	if len(b.consoleLogs) > b.maxConsoleLogs {
736		b.consoleLogs = b.consoleLogs[len(b.consoleLogs)-b.maxConsoleLogs:]
737	}
738}
739
740// handleDownloadWillBegin handles the browser download start event
741func (b *BrowseTools) handleDownloadWillBegin(e *browser.EventDownloadWillBegin) {
742	b.downloadsMutex.Lock()
743	defer b.downloadsMutex.Unlock()
744
745	b.downloads[e.GUID] = &DownloadInfo{
746		GUID:              e.GUID,
747		URL:               e.URL,
748		SuggestedFilename: e.SuggestedFilename,
749	}
750}
751
752// handleDownloadProgress handles the browser download progress event
753func (b *BrowseTools) handleDownloadProgress(e *browser.EventDownloadProgress) {
754	b.downloadsMutex.Lock()
755	defer b.downloadsMutex.Unlock()
756
757	info, ok := b.downloads[e.GUID]
758	if !ok {
759		// Download started before we started tracking, create entry
760		info = &DownloadInfo{GUID: e.GUID}
761		b.downloads[e.GUID] = info
762	}
763
764	switch e.State {
765	case browser.DownloadProgressStateCompleted:
766		info.Completed = true
767		// The file is downloaded with GUID as filename, rename to suggested filename with random suffix
768		guidPath := filepath.Join(DownloadDir, e.GUID)
769		finalName := b.generateDownloadFilename(info.SuggestedFilename)
770		finalPath := filepath.Join(DownloadDir, finalName)
771		// Retry rename a few times as file might still be being written
772		var renamed bool
773		for i := 0; i < 10; i++ {
774			if err := os.Rename(guidPath, finalPath); err == nil {
775				info.FinalPath = finalPath
776				renamed = true
777				break
778			}
779			time.Sleep(50 * time.Millisecond)
780		}
781		if !renamed {
782			// File might have different path or couldn't be renamed
783			if e.FilePath != "" {
784				info.FinalPath = e.FilePath
785			} else {
786				info.FinalPath = guidPath
787			}
788		}
789		b.downloadCond.Broadcast()
790	case browser.DownloadProgressStateCanceled:
791		info.Completed = true
792		info.Error = "download canceled"
793		b.downloadCond.Broadcast()
794	}
795}
796
797// generateDownloadFilename creates a filename with randomness
798func (b *BrowseTools) generateDownloadFilename(suggested string) string {
799	if suggested == "" {
800		suggested = "download"
801	}
802	// Extract extension if present
803	ext := filepath.Ext(suggested)
804	base := strings.TrimSuffix(suggested, ext)
805	// Add random suffix
806	randomSuffix := uuid.New().String()[:8]
807	return fmt.Sprintf("%s_%s%s", base, randomSuffix, ext)
808}
809
810// GetRecentDownloads returns download info for recently completed downloads and clears the list
811func (b *BrowseTools) GetRecentDownloads() []*DownloadInfo {
812	b.downloadsMutex.Lock()
813	defer b.downloadsMutex.Unlock()
814
815	var completed []*DownloadInfo
816	for guid, info := range b.downloads {
817		if info.Completed {
818			completed = append(completed, info)
819			delete(b.downloads, guid)
820		}
821	}
822	return completed
823}
824
825// toolOutWithDownloads creates a tool output that includes any completed downloads
826func (b *BrowseTools) toolOutWithDownloads(message string) llm.ToolOut {
827	downloads := b.GetRecentDownloads()
828	if len(downloads) == 0 {
829		return llm.ToolOut{LLMContent: llm.TextContent(message)}
830	}
831
832	var sb strings.Builder
833	sb.WriteString(message)
834	sb.WriteString("\n\nDownloads completed:")
835	for _, d := range downloads {
836		if d.Error != "" {
837			sb.WriteString(fmt.Sprintf("\n  - %s (from %s): ERROR: %s", d.SuggestedFilename, d.URL, d.Error))
838		} else {
839			sb.WriteString(fmt.Sprintf("\n  - %s (from %s) saved to: %s", d.SuggestedFilename, d.URL, d.FinalPath))
840		}
841	}
842	return llm.ToolOut{LLMContent: llm.TextContent(sb.String())}
843}
844
// recentConsoleLogsInput is the JSON payload accepted by the
// browser_recent_console_logs tool.
type recentConsoleLogsInput struct {
	// Limit caps the number of entries returned; values <= 0 select the
	// default.
	Limit int `json:"limit,omitempty"`
}
849
850// NewRecentConsoleLogsTool creates a tool for retrieving recent console logs
851func (b *BrowseTools) NewRecentConsoleLogsTool() *llm.Tool {
852	return &llm.Tool{
853		Name:        "browser_recent_console_logs",
854		Description: "Get recent browser console logs",
855		InputSchema: json.RawMessage(`{
856			"type": "object",
857			"properties": {
858				"limit": {
859					"type": "integer",
860					"description": "Maximum number of log entries to return (default: 100)"
861				}
862			}
863		}`),
864		Run: b.recentConsoleLogsRun,
865	}
866}
867
868func (b *BrowseTools) recentConsoleLogsRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
869	var input recentConsoleLogsInput
870	if err := json.Unmarshal(m, &input); err != nil {
871		return llm.ErrorfToolOut("invalid input: %w", err)
872	}
873
874	// Ensure browser is initialized
875	_, err := b.GetBrowserContext()
876	if err != nil {
877		return llm.ErrorToolOut(err)
878	}
879
880	// Apply limit (default to 100 if not specified)
881	limit := 100
882	if input.Limit > 0 {
883		limit = input.Limit
884	}
885
886	// Get console logs with mutex protection
887	b.consoleLogsMutex.Lock()
888	logs := make([]*runtime.EventConsoleAPICalled, 0, len(b.consoleLogs))
889	start := 0
890	if len(b.consoleLogs) > limit {
891		start = len(b.consoleLogs) - limit
892	}
893	logs = append(logs, b.consoleLogs[start:]...)
894	b.consoleLogsMutex.Unlock()
895
896	// Format the logs as JSON
897	logData, err := json.MarshalIndent(logs, "", "  ")
898	if err != nil {
899		return llm.ErrorfToolOut("failed to serialize logs: %w", err)
900	}
901
902	// If output exceeds threshold, write to file
903	if len(logData) > ConsoleLogSizeThreshold {
904		filename := fmt.Sprintf("console_logs_%s.json", uuid.New().String()[:8])
905		filePath := filepath.Join(ConsoleLogsDir, filename)
906		if err := os.WriteFile(filePath, logData, 0o644); err != nil {
907			return llm.ErrorfToolOut("failed to write console logs to file: %w", err)
908		}
909		return llm.ToolOut{LLMContent: llm.TextContent(fmt.Sprintf(
910			"Retrieved %d console log entries (%d bytes).\nOutput written to: %s\nUse `cat %s` to view the full content.",
911			len(logs), len(logData), filePath, filePath))}
912	}
913
914	// Format the logs
915	var sb strings.Builder
916	sb.WriteString(fmt.Sprintf("Retrieved %d console log entries:\n\n", len(logs)))
917
918	if len(logs) == 0 {
919		sb.WriteString("No console logs captured.")
920	} else {
921		// Add the JSON data for full details
922		sb.WriteString(string(logData))
923	}
924
925	return llm.ToolOut{LLMContent: llm.TextContent(sb.String())}
926}
927
// clearConsoleLogsInput is the (empty) JSON payload accepted by the
// browser_clear_console_logs tool.
type clearConsoleLogsInput struct{}
930
931// NewClearConsoleLogsTool creates a tool for clearing console logs
932func (b *BrowseTools) NewClearConsoleLogsTool() *llm.Tool {
933	return &llm.Tool{
934		Name:        "browser_clear_console_logs",
935		Description: "Clear all captured browser console logs",
936		InputSchema: llm.EmptySchema(),
937		Run:         b.clearConsoleLogsRun,
938	}
939}
940
941func (b *BrowseTools) clearConsoleLogsRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
942	var input clearConsoleLogsInput
943	if err := json.Unmarshal(m, &input); err != nil {
944		return llm.ErrorfToolOut("invalid input: %w", err)
945	}
946
947	// Ensure browser is initialized
948	_, err := b.GetBrowserContext()
949	if err != nil {
950		return llm.ErrorToolOut(err)
951	}
952
953	// Clear console logs with mutex protection
954	b.consoleLogsMutex.Lock()
955	logCount := len(b.consoleLogs)
956	b.consoleLogs = make([]*runtime.EventConsoleAPICalled, 0)
957	b.consoleLogsMutex.Unlock()
958
959	return llm.ToolOut{LLMContent: llm.TextContent(fmt.Sprintf("Cleared %d console log entries.", logCount))}
960}