From ab6d97113665941296269a48c4ea47f886829e41 Mon Sep 17 00:00:00 2001 From: Kujtim Hoxha Date: Thu, 15 Jan 2026 21:13:01 +0100 Subject: [PATCH] fix: try to make the search tool more reliable (#1779) --- internal/agent/tools/search.go | 172 +++++++++++++++++------------ internal/agent/tools/web_search.go | 3 + 2 files changed, 104 insertions(+), 71 deletions(-) diff --git a/internal/agent/tools/search.go b/internal/agent/tools/search.go index 64c3219f169b1c8ce8284b86203e84bfb19d0e59..9df7be8764ab952a23f25d624f72748696a86aac 100644 --- a/internal/agent/tools/search.go +++ b/internal/agent/tools/search.go @@ -4,10 +4,13 @@ import ( "context" "fmt" "io" + "math/rand/v2" "net/http" "net/url" "slices" "strings" + "sync" + "time" "golang.org/x/net/html" ) @@ -20,28 +23,41 @@ type SearchResult struct { Position int } -// searchDuckDuckGo performs a web search using DuckDuckGo's HTML endpoint. +var userAgents = []string{ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:133.0) Gecko/20100101 Firefox/133.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.6 Safari/605.1.15", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0", +} + +var acceptLanguages = []string{ + "en-US,en;q=0.9", + "en-US,en;q=0.9,es;q=0.8", + "en-GB,en;q=0.9,en-US;q=0.8", + "en-US,en;q=0.5", + "en-CA,en;q=0.9,en-US;q=0.8", +} + func searchDuckDuckGo(ctx context.Context, client *http.Client, query string, maxResults int) ([]SearchResult, error) { if maxResults <= 0 { maxResults = 10 } - formData := url.Values{} - formData.Set("q", query) - formData.Set("b", "") - formData.Set("kl", "") + searchURL := "https://lite.duckduckgo.com/lite/?q=" + url.QueryEscape(query) - req, err := http.NewRequestWithContext(ctx, "POST", "https://html.duckduckgo.com/html", strings.NewReader(formData.Encode())) + req, err := http.NewRequestWithContext(ctx, "GET", searchURL, nil) if err != nil { return nil, fmt.Errorf("failed to create request: %w", err) } - req.Header.Set("Content-Type", "application/x-www-form-urlencoded") - req.Header.Set("User-Agent", BrowserUserAgent) - req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") - req.Header.Set("Accept-Language", "en-US,en;q=0.5") - req.Header.Set("Accept-Encoding", "gzip, deflate") - req.Header.Set("Referer", "https://duckduckgo.com/") + setRandomizedHeaders(req) resp, err := client.Do(req) if err != nil { @@ -49,10 +65,8 @@ func searchDuckDuckGo(ctx context.Context, client *http.Client, query string, ma } defer resp.Body.Close() - // Accept both 200 (OK) and 202 (Accepted). - // DuckDuckGo may still return 202 for rate limiting or bot detection. if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusAccepted { - return nil, fmt.Errorf("search failed with status code: %d (DuckDuckGo may be rate limiting requests)", resp.StatusCode) + return nil, fmt.Errorf("search failed with status code: %d", resp.StatusCode) } body, err := io.ReadAll(resp.Body) @@ -60,85 +74,90 @@ func searchDuckDuckGo(ctx context.Context, client *http.Client, query string, ma return nil, fmt.Errorf("failed to read response: %w", err) } - return parseSearchResults(string(body), maxResults) + return parseLiteSearchResults(string(body), maxResults) } -// parseSearchResults extracts search results from DuckDuckGo HTML response. -func parseSearchResults(htmlContent string, maxResults int) ([]SearchResult, error) { +func setRandomizedHeaders(req *http.Request) { + req.Header.Set("User-Agent", userAgents[rand.IntN(len(userAgents))]) + req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + req.Header.Set("Accept-Language", acceptLanguages[rand.IntN(len(acceptLanguages))]) + req.Header.Set("Accept-Encoding", "identity") + req.Header.Set("Connection", "keep-alive") + req.Header.Set("Upgrade-Insecure-Requests", "1") + req.Header.Set("Sec-Fetch-Dest", "document") + req.Header.Set("Sec-Fetch-Mode", "navigate") + req.Header.Set("Sec-Fetch-Site", "none") + req.Header.Set("Sec-Fetch-User", "?1") + req.Header.Set("Cache-Control", "max-age=0") + if rand.IntN(2) == 0 { + req.Header.Set("DNT", "1") + } +} + +func parseLiteSearchResults(htmlContent string, maxResults int) ([]SearchResult, error) { doc, err := html.Parse(strings.NewReader(htmlContent)) if err != nil { return nil, fmt.Errorf("failed to parse HTML: %w", err) } var results []SearchResult - var traverse func(*html.Node) + var currentResult *SearchResult + var traverse func(*html.Node) traverse = func(n *html.Node) { - if n.Type == html.ElementNode && n.Data == "div" && hasClass(n, "result") { - result := extractResult(n) - if result != nil && result.Link != "" && !strings.Contains(result.Link, "y.js") { - result.Position = len(results) + 1 - results = append(results, *result) - if len(results) >= maxResults { - return + if n.Type == html.ElementNode { + if n.Data == "a" && hasClass(n, "result-link") { + if currentResult != nil && currentResult.Link != "" { + currentResult.Position = len(results) + 1 + results = append(results, *currentResult) + if len(results) >= maxResults { + return + } + } + currentResult = &SearchResult{Title: getTextContent(n)} + for _, attr := range n.Attr { + if attr.Key == "href" { + currentResult.Link = cleanDuckDuckGoURL(attr.Val) + break + } } } + if n.Data == "td" && hasClass(n, "result-snippet") && currentResult != nil { + currentResult.Snippet = getTextContent(n) + } } - for c := n.FirstChild; c != nil && len(results) < maxResults; c = c.NextSibling { + for c := n.FirstChild; c != nil; c = c.NextSibling { + if len(results) >= maxResults { + return + } traverse(c) } } traverse(doc) + + if currentResult != nil && currentResult.Link != "" && len(results) < maxResults { + currentResult.Position = len(results) + 1 + results = append(results, *currentResult) + } + return results, nil } -// hasClass checks if an HTML node has a specific class. func hasClass(n *html.Node, class string) bool { for _, attr := range n.Attr { if attr.Key == "class" { - return slices.Contains(strings.Fields(attr.Val), class) - } - } - return false -} - -// extractResult extracts a search result from a result div node. -func extractResult(n *html.Node) *SearchResult { - result := &SearchResult{} - - var traverse func(*html.Node) - traverse = func(node *html.Node) { - if node.Type == html.ElementNode { - // Look for title link. - if node.Data == "a" && hasClass(node, "result__a") { - result.Title = getTextContent(node) - for _, attr := range node.Attr { - if attr.Key == "href" { - result.Link = cleanDuckDuckGoURL(attr.Val) - break - } - } - } - // Look for snippet. - if node.Data == "a" && hasClass(node, "result__snippet") { - result.Snippet = getTextContent(node) + if slices.Contains(strings.Fields(attr.Val), class) { + return true } } - for c := node.FirstChild; c != nil; c = c.NextSibling { - traverse(c) - } } - - traverse(n) - return result + return false } -// getTextContent extracts all text content from a node and its children. func getTextContent(n *html.Node) string { var text strings.Builder var traverse func(*html.Node) - traverse = func(node *html.Node) { if node.Type == html.TextNode { text.WriteString(node.Data) @@ -147,22 +166,18 @@ func getTextContent(n *html.Node) string { traverse(c) } } - traverse(n) return strings.TrimSpace(text.String()) } -// cleanDuckDuckGoURL extracts the actual URL from DuckDuckGo's redirect URL. func cleanDuckDuckGoURL(rawURL string) string { if strings.HasPrefix(rawURL, "//duckduckgo.com/l/?uddg=") { - // Extract the actual URL from the redirect. if idx := strings.Index(rawURL, "uddg="); idx != -1 { encoded := rawURL[idx+5:] if ampIdx := strings.Index(encoded, "&"); ampIdx != -1 { encoded = encoded[:ampIdx] } - decoded, err := url.QueryUnescape(encoded) - if err == nil { + if decoded, err := url.QueryUnescape(encoded); err == nil { return decoded } } @@ -170,20 +185,35 @@ func cleanDuckDuckGoURL(rawURL string) string { return rawURL } -// formatSearchResults formats search results for LLM consumption. func formatSearchResults(results []SearchResult) string { if len(results) == 0 { - return "No results were found for your search query. This could be due to DuckDuckGo's bot detection or the query returned no matches. Please try rephrasing your search or try again in a few minutes." + return "No results found. Try rephrasing your search." } var sb strings.Builder sb.WriteString(fmt.Sprintf("Found %d search results:\n\n", len(results))) - for _, result := range results { sb.WriteString(fmt.Sprintf("%d. %s\n", result.Position, result.Title)) sb.WriteString(fmt.Sprintf(" URL: %s\n", result.Link)) sb.WriteString(fmt.Sprintf(" Summary: %s\n\n", result.Snippet)) } - return sb.String() } + +var ( + lastSearchMu sync.Mutex + lastSearchTime time.Time +) + +// maybeDelaySearch adds a random delay if the last search was recent. +func maybeDelaySearch() { + lastSearchMu.Lock() + defer lastSearchMu.Unlock() + + minGap := time.Duration(500+rand.IntN(1500)) * time.Millisecond + elapsed := time.Since(lastSearchTime) + if elapsed < minGap { + time.Sleep(minGap - elapsed) + } + lastSearchTime = time.Now() +} diff --git a/internal/agent/tools/web_search.go b/internal/agent/tools/web_search.go index b604c9051b4f5b0039431c01bea0b150a318740e..5ce9280c013cdd100f6d7734c969723b21e7e3bf 100644 --- a/internal/agent/tools/web_search.go +++ b/internal/agent/tools/web_search.go @@ -3,6 +3,7 @@ package tools import ( "context" _ "embed" + "log/slog" "net/http" "time" @@ -41,7 +42,9 @@ func NewWebSearchTool(client *http.Client) fantasy.AgentTool { maxResults = 20 } + maybeDelaySearch() results, err := searchDuckDuckGo(ctx, client, params.Query, maxResults) + slog.Debug("Web search completed", "query", params.Query, "results", len(results), "err", err) if err != nil { return fantasy.NewTextErrorResponse("Failed to search: " + err.Error()), nil }