search.go

  1package tools
  2
  3import (
  4	"context"
  5	"fmt"
  6	"io"
  7	"net/http"
  8	"net/url"
  9	"slices"
 10	"strings"
 11
 12	"golang.org/x/net/html"
 13)
 14
 15// SearchResult represents a single search result from DuckDuckGo.
 16type SearchResult struct {
 17	Title    string
 18	Link     string
 19	Snippet  string
 20	Position int
 21}
 22
 23// searchDuckDuckGo performs a web search using DuckDuckGo's HTML endpoint.
 24func searchDuckDuckGo(ctx context.Context, client *http.Client, query string, maxResults int) ([]SearchResult, error) {
 25	if maxResults <= 0 {
 26		maxResults = 10
 27	}
 28
 29	formData := url.Values{}
 30	formData.Set("q", query)
 31	formData.Set("b", "")
 32	formData.Set("kl", "")
 33
 34	req, err := http.NewRequestWithContext(ctx, "POST", "https://html.duckduckgo.com/html", strings.NewReader(formData.Encode()))
 35	if err != nil {
 36		return nil, fmt.Errorf("failed to create request: %w", err)
 37	}
 38
 39	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
 40	req.Header.Set("User-Agent", BrowserUserAgent)
 41	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
 42	req.Header.Set("Accept-Language", "en-US,en;q=0.5")
 43	req.Header.Set("Accept-Encoding", "gzip, deflate")
 44	req.Header.Set("Referer", "https://duckduckgo.com/")
 45
 46	resp, err := client.Do(req)
 47	if err != nil {
 48		return nil, fmt.Errorf("failed to execute search: %w", err)
 49	}
 50	defer resp.Body.Close()
 51
 52	// Accept both 200 (OK) and 202 (Accepted).
 53	// DuckDuckGo may still return 202 for rate limiting or bot detection.
 54	if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusAccepted {
 55		return nil, fmt.Errorf("search failed with status code: %d (DuckDuckGo may be rate limiting requests)", resp.StatusCode)
 56	}
 57
 58	body, err := io.ReadAll(resp.Body)
 59	if err != nil {
 60		return nil, fmt.Errorf("failed to read response: %w", err)
 61	}
 62
 63	return parseSearchResults(string(body), maxResults)
 64}
 65
 66// parseSearchResults extracts search results from DuckDuckGo HTML response.
 67func parseSearchResults(htmlContent string, maxResults int) ([]SearchResult, error) {
 68	doc, err := html.Parse(strings.NewReader(htmlContent))
 69	if err != nil {
 70		return nil, fmt.Errorf("failed to parse HTML: %w", err)
 71	}
 72
 73	var results []SearchResult
 74	var traverse func(*html.Node)
 75
 76	traverse = func(n *html.Node) {
 77		if n.Type == html.ElementNode && n.Data == "div" && hasClass(n, "result") {
 78			result := extractResult(n)
 79			if result != nil && result.Link != "" && !strings.Contains(result.Link, "y.js") {
 80				result.Position = len(results) + 1
 81				results = append(results, *result)
 82				if len(results) >= maxResults {
 83					return
 84				}
 85			}
 86		}
 87		for c := n.FirstChild; c != nil && len(results) < maxResults; c = c.NextSibling {
 88			traverse(c)
 89		}
 90	}
 91
 92	traverse(doc)
 93	return results, nil
 94}
 95
 96// hasClass checks if an HTML node has a specific class.
 97func hasClass(n *html.Node, class string) bool {
 98	for _, attr := range n.Attr {
 99		if attr.Key == "class" {
100			return slices.Contains(strings.Fields(attr.Val), class)
101		}
102	}
103	return false
104}
105
106// extractResult extracts a search result from a result div node.
107func extractResult(n *html.Node) *SearchResult {
108	result := &SearchResult{}
109
110	var traverse func(*html.Node)
111	traverse = func(node *html.Node) {
112		if node.Type == html.ElementNode {
113			// Look for title link.
114			if node.Data == "a" && hasClass(node, "result__a") {
115				result.Title = getTextContent(node)
116				for _, attr := range node.Attr {
117					if attr.Key == "href" {
118						result.Link = cleanDuckDuckGoURL(attr.Val)
119						break
120					}
121				}
122			}
123			// Look for snippet.
124			if node.Data == "a" && hasClass(node, "result__snippet") {
125				result.Snippet = getTextContent(node)
126			}
127		}
128		for c := node.FirstChild; c != nil; c = c.NextSibling {
129			traverse(c)
130		}
131	}
132
133	traverse(n)
134	return result
135}
136
137// getTextContent extracts all text content from a node and its children.
138func getTextContent(n *html.Node) string {
139	var text strings.Builder
140	var traverse func(*html.Node)
141
142	traverse = func(node *html.Node) {
143		if node.Type == html.TextNode {
144			text.WriteString(node.Data)
145		}
146		for c := node.FirstChild; c != nil; c = c.NextSibling {
147			traverse(c)
148		}
149	}
150
151	traverse(n)
152	return strings.TrimSpace(text.String())
153}
154
// cleanDuckDuckGoURL unwraps a DuckDuckGo redirect link.
//
// When rawURL starts with "//duckduckgo.com/l/?uddg=", the value of the uddg
// parameter (everything up to the first "&") is query-unescaped and returned.
// Any other input, or a value that fails to unescape, is returned unchanged.
func cleanDuckDuckGoURL(rawURL string) string {
	const redirectPrefix = "//duckduckgo.com/l/?uddg="

	target, isRedirect := strings.CutPrefix(rawURL, redirectPrefix)
	if !isRedirect {
		return rawURL
	}

	// Whatever follows the first "&" belongs to other query parameters.
	target, _, _ = strings.Cut(target, "&")

	decoded, err := url.QueryUnescape(target)
	if err != nil {
		// Undecodable payload: hand back the link exactly as received.
		return rawURL
	}
	return decoded
}
172
173// formatSearchResults formats search results for LLM consumption.
174func formatSearchResults(results []SearchResult) string {
175	if len(results) == 0 {
176		return "No results were found for your search query. This could be due to DuckDuckGo's bot detection or the query returned no matches. Please try rephrasing your search or try again in a few minutes."
177	}
178
179	var sb strings.Builder
180	sb.WriteString(fmt.Sprintf("Found %d search results:\n\n", len(results)))
181
182	for _, result := range results {
183		sb.WriteString(fmt.Sprintf("%d. %s\n", result.Position, result.Title))
184		sb.WriteString(fmt.Sprintf("   URL: %s\n", result.Link))
185		sb.WriteString(fmt.Sprintf("   Summary: %s\n\n", result.Snippet))
186	}
187
188	return sb.String()
189}