1// Package browse provides browser automation tools for the agent
2package browse
3
4import (
5 "context"
6 "encoding/base64"
7 "encoding/json"
8 "fmt"
9 "log"
10 "net/http"
11 "net/url"
12 "os"
13 "path/filepath"
14 "strings"
15 "sync"
16 "time"
17
18 "github.com/chromedp/cdproto/browser"
19 "github.com/chromedp/cdproto/runtime"
20 "github.com/chromedp/chromedp"
21 "github.com/google/uuid"
22 "shelley.exe.dev/llm"
23 "shelley.exe.dev/llm/imageutil"
24)
25
// Filesystem locations and limits shared by the browser tools.
const (
	// ScreenshotDir is the directory where screenshots are stored.
	ScreenshotDir = "/tmp/shelley-screenshots"

	// DownloadDir is the directory where downloads are stored.
	DownloadDir = "/tmp/shelley-downloads"

	// ConsoleLogsDir is the directory where large console logs (and large
	// JavaScript evaluation results) are stored.
	ConsoleLogsDir = "/tmp/shelley-console-logs"

	// ConsoleLogSizeThreshold is the size in bytes above which tool output
	// is written to a file under ConsoleLogsDir instead of returned inline.
	ConsoleLogSizeThreshold = 1024

	// DefaultIdleTimeout is how long to wait before shutting down an idle browser.
	DefaultIdleTimeout = 30 * time.Minute
)
40
// DownloadInfo describes a single browser download tracked by BrowseTools.
type DownloadInfo struct {
	// GUID is the identifier the browser reported for the download.
	GUID string
	// URL is the address reported when the download began.
	URL string
	// SuggestedFilename is the file name proposed by the browser.
	SuggestedFilename string
	// FinalPath is the on-disk location recorded once the download completed.
	FinalPath string
	// Completed is set when the download reached a terminal state
	// (either completed or canceled).
	Completed bool
	// Error is a human-readable failure description; empty on success.
	Error string
}
50
51// BrowseTools contains all browser tools and manages a shared browser instance
52type BrowseTools struct {
53 ctx context.Context
54 allocCtx context.Context
55 allocCancel context.CancelFunc
56 browserCtx context.Context
57 browserCtxCancel context.CancelFunc
58 mux sync.Mutex
59 // Map to track screenshots by ID and their creation time
60 screenshots map[string]time.Time
61 screenshotsMutex sync.Mutex
62 // Console logs storage
63 consoleLogs []*runtime.EventConsoleAPICalled
64 consoleLogsMutex sync.Mutex
65 maxConsoleLogs int
66 // Idle timeout management
67 idleTimeout time.Duration
68 idleTimer *time.Timer
69 // Max image dimension for resizing (0 means use default)
70 maxImageDimension int
71 // Download tracking
72 downloads map[string]*DownloadInfo // keyed by GUID
73 downloadsMutex sync.Mutex
74 downloadCond *sync.Cond
75}
76
77// NewBrowseTools creates a new set of browser automation tools.
78// idleTimeout is how long to wait before shutting down an idle browser (0 uses default).
79// maxImageDimension is the max pixel dimension for images (0 means unlimited).
80func NewBrowseTools(ctx context.Context, idleTimeout time.Duration, maxImageDimension int) *BrowseTools {
81 if idleTimeout <= 0 {
82 idleTimeout = DefaultIdleTimeout
83 }
84 for _, dir := range []string{ScreenshotDir, DownloadDir, ConsoleLogsDir} {
85 if err := os.MkdirAll(dir, 0o755); err != nil {
86 log.Printf("Failed to create directory %s: %v", dir, err)
87 }
88 }
89
90 bt := &BrowseTools{
91 ctx: ctx,
92 screenshots: make(map[string]time.Time),
93 consoleLogs: make([]*runtime.EventConsoleAPICalled, 0),
94 maxConsoleLogs: 100,
95 maxImageDimension: maxImageDimension,
96 idleTimeout: idleTimeout,
97 downloads: make(map[string]*DownloadInfo),
98 }
99 bt.downloadCond = sync.NewCond(&bt.downloadsMutex)
100 return bt
101}
102
103// GetBrowserContext returns the browser context, initializing if needed and resetting the idle timer.
104func (b *BrowseTools) GetBrowserContext() (context.Context, error) {
105 b.mux.Lock()
106 defer b.mux.Unlock()
107
108 // If browser exists, reset idle timer and return
109 if b.browserCtx != nil {
110 b.resetIdleTimerLocked()
111 return b.browserCtx, nil
112 }
113
114 // Initialize a new browser
115 opts := chromedp.DefaultExecAllocatorOptions[:]
116 opts = append(opts, chromedp.NoSandbox)
117 opts = append(opts, chromedp.Flag("--disable-dbus", true))
118 opts = append(opts, chromedp.WSURLReadTimeout(60*time.Second))
119
120 allocCtx, allocCancel := chromedp.NewExecAllocator(b.ctx, opts...)
121 browserCtx, browserCancel := chromedp.NewContext(
122 allocCtx,
123 chromedp.WithLogf(log.Printf),
124 chromedp.WithErrorf(log.Printf),
125 chromedp.WithBrowserOption(chromedp.WithDialTimeout(60*time.Second)),
126 )
127
128 // Set up event listeners for console logs and downloads
129 chromedp.ListenTarget(browserCtx, func(ev any) {
130 switch e := ev.(type) {
131 case *runtime.EventConsoleAPICalled:
132 b.captureConsoleLog(e)
133 case *browser.EventDownloadWillBegin:
134 b.handleDownloadWillBegin(e)
135 case *browser.EventDownloadProgress:
136 b.handleDownloadProgress(e)
137 }
138 })
139
140 // Start the browser
141 if err := chromedp.Run(browserCtx); err != nil {
142 allocCancel()
143 return nil, fmt.Errorf("failed to start browser (please apt get chromium or equivalent): %w", err)
144 }
145
146 // Set default viewport size to 1280x720 (16:9 widescreen)
147 if err := chromedp.Run(browserCtx, chromedp.EmulateViewport(1280, 720)); err != nil {
148 browserCancel()
149 allocCancel()
150 return nil, fmt.Errorf("failed to set default viewport: %w", err)
151 }
152
153 // Configure download behavior to allow downloads and emit events
154 if err := chromedp.Run(browserCtx,
155 browser.SetDownloadBehavior(browser.SetDownloadBehaviorBehaviorAllowAndName).
156 WithDownloadPath(DownloadDir).
157 WithEventsEnabled(true),
158 ); err != nil {
159 browserCancel()
160 allocCancel()
161 return nil, fmt.Errorf("failed to configure download behavior: %w", err)
162 }
163
164 b.allocCtx = allocCtx
165 b.allocCancel = allocCancel
166 b.browserCtx = browserCtx
167 b.browserCtxCancel = browserCancel
168
169 b.resetIdleTimerLocked()
170
171 return b.browserCtx, nil
172}
173
174// resetIdleTimerLocked resets or starts the idle timer. Caller must hold b.mux.
175func (b *BrowseTools) resetIdleTimerLocked() {
176 if b.idleTimer != nil {
177 b.idleTimer.Stop()
178 }
179 b.idleTimer = time.AfterFunc(b.idleTimeout, b.idleShutdown)
180}
181
182// idleShutdown is called when the idle timer fires
183func (b *BrowseTools) idleShutdown() {
184 b.mux.Lock()
185 defer b.mux.Unlock()
186
187 if b.browserCtx == nil {
188 return
189 }
190
191 log.Printf("Browser idle for %v, shutting down", b.idleTimeout)
192 b.closeBrowserLocked()
193}
194
195// closeBrowserLocked shuts down the browser. Caller must hold b.mux.
196func (b *BrowseTools) closeBrowserLocked() {
197 if b.idleTimer != nil {
198 b.idleTimer.Stop()
199 b.idleTimer = nil
200 }
201
202 if b.browserCtxCancel != nil {
203 b.browserCtxCancel()
204 b.browserCtxCancel = nil
205 }
206
207 if b.allocCancel != nil {
208 b.allocCancel()
209 b.allocCancel = nil
210 }
211
212 b.browserCtx = nil
213 b.allocCtx = nil
214}
215
216// Close shuts down the browser
217func (b *BrowseTools) Close() {
218 b.mux.Lock()
219 defer b.mux.Unlock()
220 b.closeBrowserLocked()
221}
222
// navigateInput is the JSON payload accepted by the browser_navigate tool.
type navigateInput struct {
	// URL is the address to navigate to.
	URL string `json:"url"`
	// Timeout is an optional Go duration string bounding the navigation.
	Timeout string `json:"timeout,omitempty"`
}
228
// isPort80 reports whether urlStr definitely targets port 80: either the port
// is spelled out as "80", or no port is given and the scheme is "http".
// A URL that cannot be parsed is reported as false.
func isPort80(urlStr string) bool {
	u, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	switch u.Port() {
	case "80":
		return true
	case "":
		// No explicit port: only plain http defaults to 80.
		return u.Scheme == "http"
	default:
		return false
	}
}
238
239// NewNavigateTool creates a tool for navigating to URLs
240func (b *BrowseTools) NewNavigateTool() *llm.Tool {
241 return &llm.Tool{
242 Name: "browser_navigate",
243 Description: "Navigate the browser to a specific URL and wait for page to load",
244 InputSchema: json.RawMessage(`{
245 "type": "object",
246 "properties": {
247 "url": {
248 "type": "string",
249 "description": "The URL to navigate to"
250 },
251 "timeout": {
252 "type": "string",
253 "description": "Timeout as a Go duration string (default: 15s)"
254 }
255 },
256 "required": ["url"]
257 }`),
258 Run: b.navigateRun,
259 }
260}
261
262func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
263 var input navigateInput
264 if err := json.Unmarshal(m, &input); err != nil {
265 return llm.ErrorfToolOut("invalid input: %w", err)
266 }
267
268 if isPort80(input.URL) {
269 return llm.ErrorToolOut(fmt.Errorf("port 80 is not the port you're looking for--port 80 is the main sketch server"))
270 }
271
272 browserCtx, err := b.GetBrowserContext()
273 if err != nil {
274 return llm.ErrorToolOut(err)
275 }
276
277 // Create a timeout context for this operation
278 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
279 defer cancel()
280
281 err = chromedp.Run(timeoutCtx,
282 chromedp.Navigate(input.URL),
283 chromedp.WaitReady("body"),
284 )
285 if err != nil {
286 // Navigation to download URLs fails with ERR_ABORTED, but the download may have succeeded.
287 // Wait briefly for download events to be processed, then check if we got any downloads.
288 if strings.Contains(err.Error(), "net::ERR_ABORTED") {
289 time.Sleep(500 * time.Millisecond)
290 downloads := b.GetRecentDownloads()
291 if len(downloads) > 0 {
292 // Download succeeded - report it instead of error
293 var sb strings.Builder
294 sb.WriteString("Navigation triggered download(s):")
295 for _, d := range downloads {
296 if d.Error != "" {
297 sb.WriteString(fmt.Sprintf("\n - %s (from %s): ERROR: %s", d.SuggestedFilename, d.URL, d.Error))
298 } else {
299 sb.WriteString(fmt.Sprintf("\n - %s (from %s) saved to: %s", d.SuggestedFilename, d.URL, d.FinalPath))
300 }
301 }
302 return llm.ToolOut{LLMContent: llm.TextContent(sb.String())}
303 }
304 }
305 return llm.ErrorToolOut(err)
306 }
307
308 return b.toolOutWithDownloads("done")
309}
310
// resizeInput is the JSON payload accepted by the browser_resize tool.
type resizeInput struct {
	// Width is the requested viewport width in pixels.
	Width int `json:"width"`
	// Height is the requested viewport height in pixels.
	Height int `json:"height"`
	// Timeout is an optional Go duration string bounding the operation.
	Timeout string `json:"timeout,omitempty"`
}
317
318// NewResizeTool creates a tool for resizing the browser viewport
319func (b *BrowseTools) NewResizeTool() *llm.Tool {
320 return &llm.Tool{
321 Name: "browser_resize",
322 Description: "Resize the browser viewport to a specific width and height",
323 InputSchema: json.RawMessage(`{
324 "type": "object",
325 "properties": {
326 "width": {
327 "type": "integer",
328 "description": "Viewport width in pixels"
329 },
330 "height": {
331 "type": "integer",
332 "description": "Viewport height in pixels"
333 },
334 "timeout": {
335 "type": "string",
336 "description": "Timeout as a Go duration string (default: 15s)"
337 }
338 },
339 "required": ["width", "height"]
340 }`),
341 Run: b.resizeRun,
342 }
343}
344
345func (b *BrowseTools) resizeRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
346 var input resizeInput
347 if err := json.Unmarshal(m, &input); err != nil {
348 return llm.ErrorfToolOut("invalid input: %w", err)
349 }
350
351 if input.Width <= 0 || input.Height <= 0 {
352 return llm.ErrorToolOut(fmt.Errorf("invalid dimensions: width and height must be positive"))
353 }
354
355 browserCtx, err := b.GetBrowserContext()
356 if err != nil {
357 return llm.ErrorToolOut(err)
358 }
359
360 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
361 defer cancel()
362
363 err = chromedp.Run(timeoutCtx,
364 chromedp.EmulateViewport(int64(input.Width), int64(input.Height)),
365 )
366 if err != nil {
367 return llm.ErrorToolOut(err)
368 }
369
370 return llm.ToolOut{LLMContent: llm.TextContent("done")}
371}
372
// evalInput is the JSON payload accepted by the browser_eval tool.
type evalInput struct {
	// Expression is the JavaScript source to evaluate.
	Expression string `json:"expression"`
	// Timeout is an optional Go duration string bounding the evaluation.
	Timeout string `json:"timeout,omitempty"`
	// Await selects whether promises are awaited; nil (field omitted) is
	// treated as true by evalRun.
	Await *bool `json:"await,omitempty"`
}
379
380// NewEvalTool creates a tool for evaluating JavaScript
381func (b *BrowseTools) NewEvalTool() *llm.Tool {
382 return &llm.Tool{
383 Name: "browser_eval",
384 Description: `Evaluate JavaScript in the browser context.
385Your go-to tool for interacting with content: clicking buttons, typing, getting content, scrolling, resizing, waiting for content/selector to be ready, etc.`,
386 InputSchema: json.RawMessage(`{
387 "type": "object",
388 "properties": {
389 "expression": {
390 "type": "string",
391 "description": "JavaScript expression to evaluate"
392 },
393 "timeout": {
394 "type": "string",
395 "description": "Timeout as a Go duration string (default: 15s)"
396 },
397 "await": {
398 "type": "boolean",
399 "description": "If true, wait for promises to resolve and return their resolved value (default: true)"
400 }
401 },
402 "required": ["expression"]
403 }`),
404 Run: b.evalRun,
405 }
406}
407
408func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
409 var input evalInput
410 if err := json.Unmarshal(m, &input); err != nil {
411 return llm.ErrorfToolOut("invalid input: %w", err)
412 }
413
414 browserCtx, err := b.GetBrowserContext()
415 if err != nil {
416 return llm.ErrorToolOut(err)
417 }
418
419 // Create a timeout context for this operation
420 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
421 defer cancel()
422
423 var result any
424 var evalOps []chromedp.EvaluateOption
425
426 await := true
427 if input.Await != nil {
428 await = *input.Await
429 }
430 if await {
431 evalOps = append(evalOps, func(p *runtime.EvaluateParams) *runtime.EvaluateParams {
432 return p.WithAwaitPromise(true)
433 })
434 }
435
436 evalAction := chromedp.Evaluate(input.Expression, &result, evalOps...)
437
438 err = chromedp.Run(timeoutCtx, evalAction)
439 if err != nil {
440 return llm.ErrorToolOut(err)
441 }
442
443 // Return the result as JSON
444 response, err := json.Marshal(result)
445 if err != nil {
446 return llm.ErrorfToolOut("failed to marshal response: %w", err)
447 }
448
449 // If output exceeds threshold, write to file
450 if len(response) > ConsoleLogSizeThreshold {
451 filename := fmt.Sprintf("js_result_%s.json", uuid.New().String()[:8])
452 filePath := filepath.Join(ConsoleLogsDir, filename)
453 if err := os.WriteFile(filePath, response, 0o644); err != nil {
454 return llm.ErrorfToolOut("failed to write JS result to file: %w", err)
455 }
456 return b.toolOutWithDownloads(fmt.Sprintf(
457 "JavaScript result (%d bytes) written to: %s\nUse `cat %s` to view the full content.",
458 len(response), filePath, filePath))
459 }
460
461 return b.toolOutWithDownloads("<javascript_result>" + string(response) + "</javascript_result>")
462}
463
// screenshotInput is the JSON payload accepted by the
// browser_take_screenshot tool.
type screenshotInput struct {
	// Selector is an optional CSS selector; when set, only the matching
	// element is captured.
	Selector string `json:"selector,omitempty"`
	// Timeout is an optional Go duration string bounding the capture.
	Timeout string `json:"timeout,omitempty"`
}
469
470// NewScreenshotTool creates a tool for taking screenshots
471func (b *BrowseTools) NewScreenshotTool() *llm.Tool {
472 return &llm.Tool{
473 Name: "browser_take_screenshot",
474 Description: "Take a screenshot of the page or a specific element",
475 InputSchema: json.RawMessage(`{
476 "type": "object",
477 "properties": {
478 "selector": {
479 "type": "string",
480 "description": "CSS selector for the element to screenshot (optional)"
481 },
482 "timeout": {
483 "type": "string",
484 "description": "Timeout as a Go duration string (default: 15s)"
485 }
486 }
487 }`),
488 Run: b.screenshotRun,
489 }
490}
491
492func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
493 var input screenshotInput
494 if err := json.Unmarshal(m, &input); err != nil {
495 return llm.ErrorfToolOut("invalid input: %w", err)
496 }
497
498 // Try to get a browser context; if unavailable, return an error
499 browserCtx, err := b.GetBrowserContext()
500 if err != nil {
501 return llm.ErrorToolOut(err)
502 }
503
504 // Create a timeout context for this operation
505 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
506 defer cancel()
507
508 var buf []byte
509 var actions []chromedp.Action
510
511 if input.Selector != "" {
512 // Take screenshot of specific element
513 actions = append(actions,
514 chromedp.WaitReady(input.Selector),
515 chromedp.Screenshot(input.Selector, &buf, chromedp.NodeVisible),
516 )
517 } else {
518 // Take full page screenshot
519 actions = append(actions, chromedp.CaptureScreenshot(&buf))
520 }
521
522 err = chromedp.Run(timeoutCtx, actions...)
523 if err != nil {
524 return llm.ErrorToolOut(err)
525 }
526
527 // Save the screenshot and get its ID for potential future reference
528 id := b.SaveScreenshot(buf)
529 if id == "" {
530 return llm.ErrorToolOut(fmt.Errorf("failed to save screenshot"))
531 }
532
533 // Get the full path to the screenshot
534 screenshotPath := GetScreenshotPath(id)
535
536 // Resize image if needed to fit within model's image dimension limits
537 imageData := buf
538 format := "png"
539 resized := false
540 if b.maxImageDimension > 0 {
541 var err error
542 imageData, format, resized, err = imageutil.ResizeImage(buf, b.maxImageDimension)
543 if err != nil {
544 return llm.ErrorToolOut(fmt.Errorf("failed to resize screenshot: %w", err))
545 }
546 }
547
548 base64Data := base64.StdEncoding.EncodeToString(imageData)
549 mediaType := "image/" + format
550
551 display := map[string]any{
552 "type": "screenshot",
553 "id": id,
554 "url": "/api/read?path=" + url.QueryEscape(screenshotPath),
555 "path": screenshotPath,
556 "selector": input.Selector,
557 }
558
559 description := fmt.Sprintf("Screenshot taken (saved as %s)", screenshotPath)
560 if resized {
561 description += " [resized]"
562 }
563
564 return llm.ToolOut{LLMContent: []llm.Content{
565 {
566 Type: llm.ContentTypeText,
567 Text: description,
568 },
569 {
570 Type: llm.ContentTypeText,
571 MediaType: mediaType,
572 Data: base64Data,
573 },
574 }, Display: display}
575}
576
577// GetTools returns browser tools, optionally filtering out screenshot-related tools
578func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
579 tools := []*llm.Tool{
580 b.NewNavigateTool(),
581 b.NewEvalTool(),
582 b.NewResizeTool(),
583 b.NewRecentConsoleLogsTool(),
584 b.NewClearConsoleLogsTool(),
585 }
586
587 // Add screenshot-related tools if supported
588 if includeScreenshotTools {
589 tools = append(tools, b.NewScreenshotTool())
590 tools = append(tools, b.NewReadImageTool())
591 }
592
593 return tools
594}
595
596// SaveScreenshot saves a screenshot to disk and returns its ID
597func (b *BrowseTools) SaveScreenshot(data []byte) string {
598 // Generate a unique ID
599 id := uuid.New().String()
600
601 // Save the file
602 filePath := filepath.Join(ScreenshotDir, id+".png")
603 if err := os.WriteFile(filePath, data, 0o644); err != nil {
604 log.Printf("Failed to save screenshot: %v", err)
605 return ""
606 }
607
608 // Track this screenshot
609 b.screenshotsMutex.Lock()
610 b.screenshots[id] = time.Now()
611 b.screenshotsMutex.Unlock()
612
613 return id
614}
615
616// GetScreenshotPath returns the full path to a screenshot by ID
617func GetScreenshotPath(id string) string {
618 return filepath.Join(ScreenshotDir, id+".png")
619}
620
// readImageInput is the JSON payload accepted by the read_image tool.
type readImageInput struct {
	// Path is the location of the image file to read.
	Path string `json:"path"`
	// Timeout is accepted for symmetry with the other tools; readImageRun
	// does not consult it.
	Timeout string `json:"timeout,omitempty"`
}
626
627// NewReadImageTool creates a tool for reading images and returning them as base64 encoded data
628func (b *BrowseTools) NewReadImageTool() *llm.Tool {
629 return &llm.Tool{
630 Name: "read_image",
631 Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM",
632 InputSchema: json.RawMessage(`{
633 "type": "object",
634 "properties": {
635 "path": {
636 "type": "string",
637 "description": "Path to the image file to read"
638 },
639 "timeout": {
640 "type": "string",
641 "description": "Timeout as a Go duration string (default: 15s)"
642 }
643 },
644 "required": ["path"]
645 }`),
646 Run: b.readImageRun,
647 }
648}
649
650func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
651 var input readImageInput
652 if err := json.Unmarshal(m, &input); err != nil {
653 return llm.ErrorfToolOut("invalid input: %w", err)
654 }
655
656 // Check if the path exists
657 if _, err := os.Stat(input.Path); os.IsNotExist(err) {
658 return llm.ErrorfToolOut("image file not found: %s", input.Path)
659 }
660
661 // Read the file
662 imageData, err := os.ReadFile(input.Path)
663 if err != nil {
664 return llm.ErrorfToolOut("failed to read image file: %w", err)
665 }
666
667 // Convert HEIC to PNG if needed (Go's image library doesn't support HEIC)
668 converted := false
669 if imageutil.IsHEIC(imageData) {
670 imageData, err = imageutil.ConvertHEICToPNG(imageData)
671 if err != nil {
672 return llm.ErrorfToolOut("failed to convert HEIC image: %w", err)
673 }
674 converted = true
675 }
676
677 detectedType := http.DetectContentType(imageData)
678 if !strings.HasPrefix(detectedType, "image/") {
679 return llm.ErrorfToolOut("file is not an image: %s", detectedType)
680 }
681
682 // Resize image if needed to fit within model's image dimension limits
683 resized := false
684 format := strings.TrimPrefix(detectedType, "image/")
685 if b.maxImageDimension > 0 {
686 var err error
687 imageData, format, resized, err = imageutil.ResizeImage(imageData, b.maxImageDimension)
688 if err != nil {
689 return llm.ErrorToolOut(fmt.Errorf("failed to resize image: %w", err))
690 }
691 }
692
693 base64Data := base64.StdEncoding.EncodeToString(imageData)
694 mediaType := "image/" + format
695
696 description := fmt.Sprintf("Image from %s (type: %s)", input.Path, mediaType)
697 if converted {
698 description += " [converted from HEIC]"
699 }
700 if resized {
701 description += " [resized]"
702 }
703
704 return llm.ToolOut{LLMContent: []llm.Content{
705 {
706 Type: llm.ContentTypeText,
707 Text: description,
708 },
709 {
710 Type: llm.ContentTypeText,
711 MediaType: mediaType,
712 Data: base64Data,
713 },
714 }}
715}
716
// parseTimeout converts a tool-supplied timeout string into a time.Duration.
//
// timeout is expected in Go duration syntax (for example "500ms" or "30s").
// An empty or unparsable value, as well as a zero or negative duration,
// yields the 15 second default advertised in the tool schemas. Non-positive
// durations are rejected because context.WithTimeout would otherwise return
// an already-expired context and the operation would fail immediately.
func parseTimeout(timeout string) time.Duration {
	const fallback = 15 * time.Second

	dur, err := time.ParseDuration(timeout)
	if err != nil || dur <= 0 {
		return fallback
	}
	return dur
}
726
727// captureConsoleLog captures a console log event and stores it
728func (b *BrowseTools) captureConsoleLog(e *runtime.EventConsoleAPICalled) {
729 // Add to logs with mutex protection
730 b.consoleLogsMutex.Lock()
731 defer b.consoleLogsMutex.Unlock()
732
733 // Add the log and maintain max size
734 b.consoleLogs = append(b.consoleLogs, e)
735 if len(b.consoleLogs) > b.maxConsoleLogs {
736 b.consoleLogs = b.consoleLogs[len(b.consoleLogs)-b.maxConsoleLogs:]
737 }
738}
739
740// handleDownloadWillBegin handles the browser download start event
741func (b *BrowseTools) handleDownloadWillBegin(e *browser.EventDownloadWillBegin) {
742 b.downloadsMutex.Lock()
743 defer b.downloadsMutex.Unlock()
744
745 b.downloads[e.GUID] = &DownloadInfo{
746 GUID: e.GUID,
747 URL: e.URL,
748 SuggestedFilename: e.SuggestedFilename,
749 }
750}
751
752// handleDownloadProgress handles the browser download progress event
753func (b *BrowseTools) handleDownloadProgress(e *browser.EventDownloadProgress) {
754 b.downloadsMutex.Lock()
755 defer b.downloadsMutex.Unlock()
756
757 info, ok := b.downloads[e.GUID]
758 if !ok {
759 // Download started before we started tracking, create entry
760 info = &DownloadInfo{GUID: e.GUID}
761 b.downloads[e.GUID] = info
762 }
763
764 switch e.State {
765 case browser.DownloadProgressStateCompleted:
766 info.Completed = true
767 // The file is downloaded with GUID as filename, rename to suggested filename with random suffix
768 guidPath := filepath.Join(DownloadDir, e.GUID)
769 finalName := b.generateDownloadFilename(info.SuggestedFilename)
770 finalPath := filepath.Join(DownloadDir, finalName)
771 // Retry rename a few times as file might still be being written
772 var renamed bool
773 for i := 0; i < 10; i++ {
774 if err := os.Rename(guidPath, finalPath); err == nil {
775 info.FinalPath = finalPath
776 renamed = true
777 break
778 }
779 time.Sleep(50 * time.Millisecond)
780 }
781 if !renamed {
782 // File might have different path or couldn't be renamed
783 if e.FilePath != "" {
784 info.FinalPath = e.FilePath
785 } else {
786 info.FinalPath = guidPath
787 }
788 }
789 b.downloadCond.Broadcast()
790 case browser.DownloadProgressStateCanceled:
791 info.Completed = true
792 info.Error = "download canceled"
793 b.downloadCond.Broadcast()
794 }
795}
796
797// generateDownloadFilename creates a filename with randomness
798func (b *BrowseTools) generateDownloadFilename(suggested string) string {
799 if suggested == "" {
800 suggested = "download"
801 }
802 // Extract extension if present
803 ext := filepath.Ext(suggested)
804 base := strings.TrimSuffix(suggested, ext)
805 // Add random suffix
806 randomSuffix := uuid.New().String()[:8]
807 return fmt.Sprintf("%s_%s%s", base, randomSuffix, ext)
808}
809
810// GetRecentDownloads returns download info for recently completed downloads and clears the list
811func (b *BrowseTools) GetRecentDownloads() []*DownloadInfo {
812 b.downloadsMutex.Lock()
813 defer b.downloadsMutex.Unlock()
814
815 var completed []*DownloadInfo
816 for guid, info := range b.downloads {
817 if info.Completed {
818 completed = append(completed, info)
819 delete(b.downloads, guid)
820 }
821 }
822 return completed
823}
824
825// toolOutWithDownloads creates a tool output that includes any completed downloads
826func (b *BrowseTools) toolOutWithDownloads(message string) llm.ToolOut {
827 downloads := b.GetRecentDownloads()
828 if len(downloads) == 0 {
829 return llm.ToolOut{LLMContent: llm.TextContent(message)}
830 }
831
832 var sb strings.Builder
833 sb.WriteString(message)
834 sb.WriteString("\n\nDownloads completed:")
835 for _, d := range downloads {
836 if d.Error != "" {
837 sb.WriteString(fmt.Sprintf("\n - %s (from %s): ERROR: %s", d.SuggestedFilename, d.URL, d.Error))
838 } else {
839 sb.WriteString(fmt.Sprintf("\n - %s (from %s) saved to: %s", d.SuggestedFilename, d.URL, d.FinalPath))
840 }
841 }
842 return llm.ToolOut{LLMContent: llm.TextContent(sb.String())}
843}
844
// recentConsoleLogsInput is the JSON payload accepted by the
// browser_recent_console_logs tool.
type recentConsoleLogsInput struct {
	// Limit caps the number of returned entries; values that are zero or
	// negative select the default of 100.
	Limit int `json:"limit,omitempty"`
}
849
850// NewRecentConsoleLogsTool creates a tool for retrieving recent console logs
851func (b *BrowseTools) NewRecentConsoleLogsTool() *llm.Tool {
852 return &llm.Tool{
853 Name: "browser_recent_console_logs",
854 Description: "Get recent browser console logs",
855 InputSchema: json.RawMessage(`{
856 "type": "object",
857 "properties": {
858 "limit": {
859 "type": "integer",
860 "description": "Maximum number of log entries to return (default: 100)"
861 }
862 }
863 }`),
864 Run: b.recentConsoleLogsRun,
865 }
866}
867
868func (b *BrowseTools) recentConsoleLogsRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
869 var input recentConsoleLogsInput
870 if err := json.Unmarshal(m, &input); err != nil {
871 return llm.ErrorfToolOut("invalid input: %w", err)
872 }
873
874 // Ensure browser is initialized
875 _, err := b.GetBrowserContext()
876 if err != nil {
877 return llm.ErrorToolOut(err)
878 }
879
880 // Apply limit (default to 100 if not specified)
881 limit := 100
882 if input.Limit > 0 {
883 limit = input.Limit
884 }
885
886 // Get console logs with mutex protection
887 b.consoleLogsMutex.Lock()
888 logs := make([]*runtime.EventConsoleAPICalled, 0, len(b.consoleLogs))
889 start := 0
890 if len(b.consoleLogs) > limit {
891 start = len(b.consoleLogs) - limit
892 }
893 logs = append(logs, b.consoleLogs[start:]...)
894 b.consoleLogsMutex.Unlock()
895
896 // Format the logs as JSON
897 logData, err := json.MarshalIndent(logs, "", " ")
898 if err != nil {
899 return llm.ErrorfToolOut("failed to serialize logs: %w", err)
900 }
901
902 // If output exceeds threshold, write to file
903 if len(logData) > ConsoleLogSizeThreshold {
904 filename := fmt.Sprintf("console_logs_%s.json", uuid.New().String()[:8])
905 filePath := filepath.Join(ConsoleLogsDir, filename)
906 if err := os.WriteFile(filePath, logData, 0o644); err != nil {
907 return llm.ErrorfToolOut("failed to write console logs to file: %w", err)
908 }
909 return llm.ToolOut{LLMContent: llm.TextContent(fmt.Sprintf(
910 "Retrieved %d console log entries (%d bytes).\nOutput written to: %s\nUse `cat %s` to view the full content.",
911 len(logs), len(logData), filePath, filePath))}
912 }
913
914 // Format the logs
915 var sb strings.Builder
916 sb.WriteString(fmt.Sprintf("Retrieved %d console log entries:\n\n", len(logs)))
917
918 if len(logs) == 0 {
919 sb.WriteString("No console logs captured.")
920 } else {
921 // Add the JSON data for full details
922 sb.WriteString(string(logData))
923 }
924
925 return llm.ToolOut{LLMContent: llm.TextContent(sb.String())}
926}
927
// clearConsoleLogsInput is the (empty) JSON payload accepted by the
// browser_clear_console_logs tool.
type clearConsoleLogsInput struct{}
930
931// NewClearConsoleLogsTool creates a tool for clearing console logs
932func (b *BrowseTools) NewClearConsoleLogsTool() *llm.Tool {
933 return &llm.Tool{
934 Name: "browser_clear_console_logs",
935 Description: "Clear all captured browser console logs",
936 InputSchema: llm.EmptySchema(),
937 Run: b.clearConsoleLogsRun,
938 }
939}
940
941func (b *BrowseTools) clearConsoleLogsRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
942 var input clearConsoleLogsInput
943 if err := json.Unmarshal(m, &input); err != nil {
944 return llm.ErrorfToolOut("invalid input: %w", err)
945 }
946
947 // Ensure browser is initialized
948 _, err := b.GetBrowserContext()
949 if err != nil {
950 return llm.ErrorToolOut(err)
951 }
952
953 // Clear console logs with mutex protection
954 b.consoleLogsMutex.Lock()
955 logCount := len(b.consoleLogs)
956 b.consoleLogs = make([]*runtime.EventConsoleAPICalled, 0)
957 b.consoleLogsMutex.Unlock()
958
959 return llm.ToolOut{LLMContent: llm.TextContent(fmt.Sprintf("Cleared %d console log entries.", logCount))}
960}