fix: repair DDG and Bing parsers — verified with live tests
DuckDuckGo: - Fixed parser to handle single-quoted class attributes (class='result-link') - Decode DDG tracking URLs (uddg= parameter) to extract real URLs - Match snippet extraction to actual DDG Lite HTML structure (</td> terminator) Bing: - Switched from HTML scraping (blocked by JS detection) to RSS endpoint (?format=rss) which returns parseable XML - Added JSON API response parsing as fallback - Returns graceful unresponsive_engines entry when blocked Live test results: - DuckDuckGo: 9 results ✅ - GitHub: 10 results (14,768 total) ✅ - Bing: 10 results via RSS ✅ - Reddit: skipped (403 from sandbox, needs browser-like context)
This commit is contained in:
parent
df8fe9474b
commit
a8ab29b23a
4 changed files with 186 additions and 157 deletions
|
|
@ -2,12 +2,14 @@ package engines
|
|||
|
||||
import (
|
||||
"io"
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
)
|
||||
|
||||
// parseDuckDuckGoHTML parses DuckDuckGo Lite's HTML response for search results.
|
||||
// DDG Lite uses HTML tables with single-quoted class attributes and DDG tracking URLs.
|
||||
func parseDuckDuckGoHTML(r io.Reader) ([]contracts.MainResult, error) {
|
||||
body, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
|
|
@ -26,40 +28,61 @@ func parseDuckDuckGoHTML(r io.Reader) ([]contracts.MainResult, error) {
|
|||
remaining := content
|
||||
|
||||
for {
|
||||
idx := strings.Index(remaining, `class="result-link"`)
|
||||
// DDG uses single quotes: class='result-link'
|
||||
idx := strings.Index(remaining, "class='result-link'")
|
||||
if idx == -1 {
|
||||
break
|
||||
}
|
||||
|
||||
block := remaining[idx:]
|
||||
|
||||
// Extract href from the anchor.
|
||||
href := extractAttr(block, "href")
|
||||
if href == "" {
|
||||
remaining = block[1:]
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip DDG internal links.
|
||||
if strings.HasPrefix(href, "/") || strings.Contains(href, "duckduckgo.com/l/") {
|
||||
// DDG wraps real URLs in tracking redirect: //duckduckgo.com/l/?uddg=ENCODED_URL
|
||||
if strings.Contains(href, "duckduckgo.com/l/") || strings.Contains(href, "uddg=") {
|
||||
if uddgIdx := strings.Index(href, "uddg="); uddgIdx != -1 {
|
||||
encodedURL := href[uddgIdx+5:]
|
||||
// Split on & to get just the URL (other params may follow)
|
||||
if ampIdx := strings.Index(encodedURL, "&"); ampIdx != -1 {
|
||||
encodedURL = encodedURL[:ampIdx]
|
||||
}
|
||||
if decoded, err := url.QueryUnescape(encodedURL); err == nil {
|
||||
href = decoded
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Skip internal links.
|
||||
if strings.HasPrefix(href, "/") || strings.HasPrefix(href, "//duckduckgo.com") {
|
||||
remaining = block[1:]
|
||||
continue
|
||||
}
|
||||
|
||||
// Extract title — text between > and </a> after the href.
|
||||
// Extract title — text between > and </a> after the class attribute.
|
||||
titleStart := strings.Index(block, ">")
|
||||
if titleStart == -1 {
|
||||
remaining = block[1:]
|
||||
continue
|
||||
}
|
||||
afterHref := block[titleStart+1:]
|
||||
titleEnd := strings.Index(afterHref, "</a>")
|
||||
afterClass := block[titleStart+1:]
|
||||
titleEnd := strings.Index(afterClass, "</a>")
|
||||
if titleEnd == -1 {
|
||||
remaining = block[1:]
|
||||
continue
|
||||
}
|
||||
title := stripHTML(afterHref[:titleEnd])
|
||||
title := stripHTML(afterClass[:titleEnd])
|
||||
title = htmlUnescape(title)
|
||||
|
||||
if title == "" {
|
||||
remaining = block[titleStart+1+titleEnd:]
|
||||
continue
|
||||
}
|
||||
|
||||
parsedLinks = append(parsedLinks, parsedResult{
|
||||
href: href,
|
||||
title: title,
|
||||
|
|
@ -68,26 +91,28 @@ func parseDuckDuckGoHTML(r io.Reader) ([]contracts.MainResult, error) {
|
|||
remaining = block[titleStart+1+titleEnd:]
|
||||
}
|
||||
|
||||
// Extract snippets between results.
|
||||
// Extract snippets for each result.
|
||||
for i, link := range parsedLinks {
|
||||
snippet := ""
|
||||
linkIdx := strings.Index(content, link.href)
|
||||
if linkIdx == -1 {
|
||||
// Try partial match (the href might be HTML-encoded in the source).
|
||||
linkIdx = strings.Index(content, url.QueryEscape(link.href))
|
||||
}
|
||||
|
||||
if linkIdx != -1 {
|
||||
snippetRegion := content[linkIdx+len(link.href):]
|
||||
snippetRegion := content[linkIdx:]
|
||||
if len(snippetRegion) > 2000 {
|
||||
snippetRegion = snippetRegion[:2000]
|
||||
}
|
||||
|
||||
snippetIdx := strings.Index(snippetRegion, "result-snippet")
|
||||
if snippetIdx == -1 {
|
||||
snippetIdx = strings.Index(snippetRegion, "result__snippet")
|
||||
}
|
||||
|
||||
// DDG uses single quotes: class='result-snippet'
|
||||
snippetIdx := strings.Index(snippetRegion, "class='result-snippet'")
|
||||
if snippetIdx != -1 {
|
||||
snippetBlock := snippetRegion[snippetIdx:]
|
||||
textStart := strings.Index(snippetBlock, ">")
|
||||
if textStart != -1 {
|
||||
textEnd := strings.Index(snippetBlock[textStart:], "</")
|
||||
textEnd := strings.Index(snippetBlock[textStart:], "</td>")
|
||||
if textEnd != -1 {
|
||||
snippet = stripHTML(snippetBlock[textStart+1 : textStart+textEnd])
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue