fix: try to make the search tool more reliable (#1779)

Created by Kujtim Hoxha

Change summary

internal/agent/tools/search.go     | 172 ++++++++++++++++++-------------
internal/agent/tools/web_search.go |   3 
2 files changed, 104 insertions(+), 71 deletions(-)

Detailed changes

internal/agent/tools/search.go 🔗

@@ -4,10 +4,13 @@ import (
 	"context"
 	"fmt"
 	"io"
+	"math/rand/v2"
 	"net/http"
 	"net/url"
 	"slices"
 	"strings"
+	"sync"
+	"time"
 
 	"golang.org/x/net/html"
 )
@@ -20,28 +23,41 @@ type SearchResult struct {
 	Position int
 }
 
-// searchDuckDuckGo performs a web search using DuckDuckGo's HTML endpoint.
+var userAgents = []string{
+	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
+	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
+	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
+	"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
+	"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
+	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:133.0) Gecko/20100101 Firefox/133.0",
+	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15",
+	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.6 Safari/605.1.15",
+	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
+}
+
+var acceptLanguages = []string{
+	"en-US,en;q=0.9",
+	"en-US,en;q=0.9,es;q=0.8",
+	"en-GB,en;q=0.9,en-US;q=0.8",
+	"en-US,en;q=0.5",
+	"en-CA,en;q=0.9,en-US;q=0.8",
+}
+
 func searchDuckDuckGo(ctx context.Context, client *http.Client, query string, maxResults int) ([]SearchResult, error) {
 	if maxResults <= 0 {
 		maxResults = 10
 	}
 
-	formData := url.Values{}
-	formData.Set("q", query)
-	formData.Set("b", "")
-	formData.Set("kl", "")
+	searchURL := "https://lite.duckduckgo.com/lite/?q=" + url.QueryEscape(query)
 
-	req, err := http.NewRequestWithContext(ctx, "POST", "https://html.duckduckgo.com/html", strings.NewReader(formData.Encode()))
+	req, err := http.NewRequestWithContext(ctx, "GET", searchURL, nil)
 	if err != nil {
 		return nil, fmt.Errorf("failed to create request: %w", err)
 	}
 
-	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
-	req.Header.Set("User-Agent", BrowserUserAgent)
-	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
-	req.Header.Set("Accept-Language", "en-US,en;q=0.5")
-	req.Header.Set("Accept-Encoding", "gzip, deflate")
-	req.Header.Set("Referer", "https://duckduckgo.com/")
+	setRandomizedHeaders(req)
 
 	resp, err := client.Do(req)
 	if err != nil {
@@ -49,10 +65,8 @@ func searchDuckDuckGo(ctx context.Context, client *http.Client, query string, maxResults int) ([]SearchResult, error) {
 	}
 	defer resp.Body.Close()
 
-	// Accept both 200 (OK) and 202 (Accepted).
-	// DuckDuckGo may still return 202 for rate limiting or bot detection.
 	if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusAccepted {
-		return nil, fmt.Errorf("search failed with status code: %d (DuckDuckGo may be rate limiting requests)", resp.StatusCode)
+		return nil, fmt.Errorf("search failed with status code: %d", resp.StatusCode)
 	}
 
 	body, err := io.ReadAll(resp.Body)
@@ -60,85 +74,90 @@ func searchDuckDuckGo(ctx context.Context, client *http.Client, query string, ma
 		return nil, fmt.Errorf("failed to read response: %w", err)
 	}
 
-	return parseSearchResults(string(body), maxResults)
+	return parseLiteSearchResults(string(body), maxResults)
 }
 
-// parseSearchResults extracts search results from DuckDuckGo HTML response.
-func parseSearchResults(htmlContent string, maxResults int) ([]SearchResult, error) {
+func setRandomizedHeaders(req *http.Request) {
+	req.Header.Set("User-Agent", userAgents[rand.IntN(len(userAgents))])
+	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
+	req.Header.Set("Accept-Language", acceptLanguages[rand.IntN(len(acceptLanguages))])
+	req.Header.Set("Accept-Encoding", "identity")
+	req.Header.Set("Connection", "keep-alive")
+	req.Header.Set("Upgrade-Insecure-Requests", "1")
+	req.Header.Set("Sec-Fetch-Dest", "document")
+	req.Header.Set("Sec-Fetch-Mode", "navigate")
+	req.Header.Set("Sec-Fetch-Site", "none")
+	req.Header.Set("Sec-Fetch-User", "?1")
+	req.Header.Set("Cache-Control", "max-age=0")
+	if rand.IntN(2) == 0 {
+		req.Header.Set("DNT", "1")
+	}
+}
+
+func parseLiteSearchResults(htmlContent string, maxResults int) ([]SearchResult, error) {
 	doc, err := html.Parse(strings.NewReader(htmlContent))
 	if err != nil {
 		return nil, fmt.Errorf("failed to parse HTML: %w", err)
 	}
 
 	var results []SearchResult
-	var traverse func(*html.Node)
+	var currentResult *SearchResult
 
+	var traverse func(*html.Node)
 	traverse = func(n *html.Node) {
-		if n.Type == html.ElementNode && n.Data == "div" && hasClass(n, "result") {
-			result := extractResult(n)
-			if result != nil && result.Link != "" && !strings.Contains(result.Link, "y.js") {
-				result.Position = len(results) + 1
-				results = append(results, *result)
-				if len(results) >= maxResults {
-					return
+		if n.Type == html.ElementNode {
+			if n.Data == "a" && hasClass(n, "result-link") {
+				if currentResult != nil && currentResult.Link != "" {
+					currentResult.Position = len(results) + 1
+					results = append(results, *currentResult)
+					if len(results) >= maxResults {
+						return
+					}
+				}
+				currentResult = &SearchResult{Title: getTextContent(n)}
+				for _, attr := range n.Attr {
+					if attr.Key == "href" {
+						currentResult.Link = cleanDuckDuckGoURL(attr.Val)
+						break
+					}
 				}
 			}
+			if n.Data == "td" && hasClass(n, "result-snippet") && currentResult != nil {
+				currentResult.Snippet = getTextContent(n)
+			}
 		}
-		for c := n.FirstChild; c != nil && len(results) < maxResults; c = c.NextSibling {
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			if len(results) >= maxResults {
+				return
+			}
 			traverse(c)
 		}
 	}
 
 	traverse(doc)
+
+	if currentResult != nil && currentResult.Link != "" && len(results) < maxResults {
+		currentResult.Position = len(results) + 1
+		results = append(results, *currentResult)
+	}
+
 	return results, nil
 }
 
-// hasClass checks if an HTML node has a specific class.
 func hasClass(n *html.Node, class string) bool {
 	for _, attr := range n.Attr {
 		if attr.Key == "class" {
-			return slices.Contains(strings.Fields(attr.Val), class)
-		}
-	}
-	return false
-}
-
-// extractResult extracts a search result from a result div node.
-func extractResult(n *html.Node) *SearchResult {
-	result := &SearchResult{}
-
-	var traverse func(*html.Node)
-	traverse = func(node *html.Node) {
-		if node.Type == html.ElementNode {
-			// Look for title link.
-			if node.Data == "a" && hasClass(node, "result__a") {
-				result.Title = getTextContent(node)
-				for _, attr := range node.Attr {
-					if attr.Key == "href" {
-						result.Link = cleanDuckDuckGoURL(attr.Val)
-						break
-					}
-				}
-			}
-			// Look for snippet.
-			if node.Data == "a" && hasClass(node, "result__snippet") {
-				result.Snippet = getTextContent(node)
+			if slices.Contains(strings.Fields(attr.Val), class) {
+				return true
 			}
 		}
-		for c := node.FirstChild; c != nil; c = c.NextSibling {
-			traverse(c)
-		}
 	}
-
-	traverse(n)
-	return result
+	return false
 }
 
-// getTextContent extracts all text content from a node and its children.
 func getTextContent(n *html.Node) string {
 	var text strings.Builder
 	var traverse func(*html.Node)
-
 	traverse = func(node *html.Node) {
 		if node.Type == html.TextNode {
 			text.WriteString(node.Data)
@@ -147,22 +166,18 @@ func getTextContent(n *html.Node) string {
 			traverse(c)
 		}
 	}
-
 	traverse(n)
 	return strings.TrimSpace(text.String())
 }
 
-// cleanDuckDuckGoURL extracts the actual URL from DuckDuckGo's redirect URL.
 func cleanDuckDuckGoURL(rawURL string) string {
 	if strings.HasPrefix(rawURL, "//duckduckgo.com/l/?uddg=") {
-		// Extract the actual URL from the redirect.
 		if idx := strings.Index(rawURL, "uddg="); idx != -1 {
 			encoded := rawURL[idx+5:]
 			if ampIdx := strings.Index(encoded, "&"); ampIdx != -1 {
 				encoded = encoded[:ampIdx]
 			}
-			decoded, err := url.QueryUnescape(encoded)
-			if err == nil {
+			if decoded, err := url.QueryUnescape(encoded); err == nil {
 				return decoded
 			}
 		}
@@ -170,20 +185,35 @@ func cleanDuckDuckGoURL(rawURL string) string {
 	return rawURL
 }
 
-// formatSearchResults formats search results for LLM consumption.
 func formatSearchResults(results []SearchResult) string {
 	if len(results) == 0 {
-		return "No results were found for your search query. This could be due to DuckDuckGo's bot detection or the query returned no matches. Please try rephrasing your search or try again in a few minutes."
+		return "No results found. Try rephrasing your search."
 	}
 
 	var sb strings.Builder
 	sb.WriteString(fmt.Sprintf("Found %d search results:\n\n", len(results)))
-
 	for _, result := range results {
 		sb.WriteString(fmt.Sprintf("%d. %s\n", result.Position, result.Title))
 		sb.WriteString(fmt.Sprintf("   URL: %s\n", result.Link))
 		sb.WriteString(fmt.Sprintf("   Summary: %s\n\n", result.Snippet))
 	}
-
 	return sb.String()
 }
+
+var (
+	lastSearchMu   sync.Mutex
+	lastSearchTime time.Time
+)
+
+// maybeDelaySearch adds a random delay if the last search was recent.
+func maybeDelaySearch() {
+	lastSearchMu.Lock()
+	defer lastSearchMu.Unlock()
+
+	minGap := time.Duration(500+rand.IntN(1500)) * time.Millisecond
+	elapsed := time.Since(lastSearchTime)
+	if elapsed < minGap {
+		time.Sleep(minGap - elapsed)
+	}
+	lastSearchTime = time.Now()
+}

internal/agent/tools/web_search.go 🔗

@@ -3,6 +3,7 @@ package tools
 import (
 	"context"
 	_ "embed"
+	"log/slog"
 	"net/http"
 	"time"
 
@@ -41,7 +42,9 @@ func NewWebSearchTool(client *http.Client) fantasy.AgentTool {
 				maxResults = 20
 			}
 
+			maybeDelaySearch()
 			results, err := searchDuckDuckGo(ctx, client, params.Query, maxResults)
+			slog.Debug("Web search completed", "query", params.Query, "results", len(results), "err", err)
 			if err != nil {
 				return fantasy.NewTextErrorResponse("Failed to search: " + err.Error()), nil
 			}