search.go

  1package tools
  2
  3import (
  4	"context"
  5	"fmt"
  6	"io"
  7	"net/http"
  8	"net/url"
  9	"slices"
 10	"strings"
 11
 12	"golang.org/x/net/html"
 13)
 14
 15// SearchResult represents a single search result from DuckDuckGo.
 16type SearchResult struct {
 17	Title    string
 18	Link     string
 19	Snippet  string
 20	Position int
 21}
 22
 23// searchDuckDuckGo performs a web search using DuckDuckGo's HTML endpoint.
 24func searchDuckDuckGo(ctx context.Context, client *http.Client, query string, maxResults int) ([]SearchResult, error) {
 25	if maxResults <= 0 {
 26		maxResults = 10
 27	}
 28
 29	formData := url.Values{}
 30	formData.Set("q", query)
 31	formData.Set("b", "")
 32	formData.Set("kl", "")
 33
 34	req, err := http.NewRequestWithContext(ctx, "POST", "https://html.duckduckgo.com/html", strings.NewReader(formData.Encode()))
 35	if err != nil {
 36		return nil, fmt.Errorf("failed to create request: %w", err)
 37	}
 38
 39	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
 40	req.Header.Set("User-Agent", BrowserUserAgent)
 41
 42	resp, err := client.Do(req)
 43	if err != nil {
 44		return nil, fmt.Errorf("failed to execute search: %w", err)
 45	}
 46	defer resp.Body.Close()
 47
 48	if resp.StatusCode != http.StatusOK {
 49		return nil, fmt.Errorf("search failed with status code: %d", resp.StatusCode)
 50	}
 51
 52	body, err := io.ReadAll(resp.Body)
 53	if err != nil {
 54		return nil, fmt.Errorf("failed to read response: %w", err)
 55	}
 56
 57	return parseSearchResults(string(body), maxResults)
 58}
 59
 60// parseSearchResults extracts search results from DuckDuckGo HTML response.
 61func parseSearchResults(htmlContent string, maxResults int) ([]SearchResult, error) {
 62	doc, err := html.Parse(strings.NewReader(htmlContent))
 63	if err != nil {
 64		return nil, fmt.Errorf("failed to parse HTML: %w", err)
 65	}
 66
 67	var results []SearchResult
 68	var traverse func(*html.Node)
 69
 70	traverse = func(n *html.Node) {
 71		if n.Type == html.ElementNode && n.Data == "div" && hasClass(n, "result") {
 72			result := extractResult(n)
 73			if result != nil && result.Link != "" && !strings.Contains(result.Link, "y.js") {
 74				result.Position = len(results) + 1
 75				results = append(results, *result)
 76				if len(results) >= maxResults {
 77					return
 78				}
 79			}
 80		}
 81		for c := n.FirstChild; c != nil && len(results) < maxResults; c = c.NextSibling {
 82			traverse(c)
 83		}
 84	}
 85
 86	traverse(doc)
 87	return results, nil
 88}
 89
 90// hasClass checks if an HTML node has a specific class.
 91func hasClass(n *html.Node, class string) bool {
 92	for _, attr := range n.Attr {
 93		if attr.Key == "class" {
 94			return slices.Contains(strings.Fields(attr.Val), class)
 95		}
 96	}
 97	return false
 98}
 99
100// extractResult extracts a search result from a result div node.
101func extractResult(n *html.Node) *SearchResult {
102	result := &SearchResult{}
103
104	var traverse func(*html.Node)
105	traverse = func(node *html.Node) {
106		if node.Type == html.ElementNode {
107			// Look for title link.
108			if node.Data == "a" && hasClass(node, "result__a") {
109				result.Title = getTextContent(node)
110				for _, attr := range node.Attr {
111					if attr.Key == "href" {
112						result.Link = cleanDuckDuckGoURL(attr.Val)
113						break
114					}
115				}
116			}
117			// Look for snippet.
118			if node.Data == "a" && hasClass(node, "result__snippet") {
119				result.Snippet = getTextContent(node)
120			}
121		}
122		for c := node.FirstChild; c != nil; c = c.NextSibling {
123			traverse(c)
124		}
125	}
126
127	traverse(n)
128	return result
129}
130
131// getTextContent extracts all text content from a node and its children.
132func getTextContent(n *html.Node) string {
133	var text strings.Builder
134	var traverse func(*html.Node)
135
136	traverse = func(node *html.Node) {
137		if node.Type == html.TextNode {
138			text.WriteString(node.Data)
139		}
140		for c := node.FirstChild; c != nil; c = c.NextSibling {
141			traverse(c)
142		}
143	}
144
145	traverse(n)
146	return strings.TrimSpace(text.String())
147}
148
// cleanDuckDuckGoURL extracts the actual URL from DuckDuckGo's redirect URL.
//
// A redirect has the form "//duckduckgo.com/l/?uddg=<escaped target>&...",
// optionally preceded by an "http:" or "https:" scheme. The uddg parameter may
// appear anywhere in the query string. Its value is query-unescaped and
// returned.
//
// rawURL is returned unchanged when it is not such a redirect, when it has no
// uddg parameter, or when the uddg value cannot be unescaped. An empty uddg
// value yields an empty string.
func cleanDuckDuckGoURL(rawURL string) string {
	// Drop an optional scheme so protocol-relative and absolute redirect
	// links are handled alike.
	rest := rawURL
	for _, scheme := range []string{"https:", "http:"} {
		if after, ok := strings.CutPrefix(rest, scheme); ok {
			rest = after
			break
		}
	}

	query, ok := strings.CutPrefix(rest, "//duckduckgo.com/l/?")
	if !ok {
		return rawURL
	}

	// Scan the raw query one "&"-separated parameter at a time and use the
	// first uddg entry.
	for query != "" {
		var param string
		param, query, _ = strings.Cut(query, "&")

		encoded, isTarget := strings.CutPrefix(param, "uddg=")
		if !isTarget {
			continue
		}
		decoded, err := url.QueryUnescape(encoded)
		if err != nil {
			return rawURL
		}
		return decoded
	}

	return rawURL
}
166
167// formatSearchResults formats search results for LLM consumption.
168func formatSearchResults(results []SearchResult) string {
169	if len(results) == 0 {
170		return "No results were found for your search query. This could be due to DuckDuckGo's bot detection or the query returned no matches. Please try rephrasing your search or try again in a few minutes."
171	}
172
173	var sb strings.Builder
174	sb.WriteString(fmt.Sprintf("Found %d search results:\n\n", len(results)))
175
176	for _, result := range results {
177		sb.WriteString(fmt.Sprintf("%d. %s\n", result.Position, result.Title))
178		sb.WriteString(fmt.Sprintf("   URL: %s\n", result.Link))
179		sb.WriteString(fmt.Sprintf("   Summary: %s\n\n", result.Snippet))
180	}
181
182	return sb.String()
183}