kafka/internal/engines/duckduckgo_parse.go
Franz Kafka a8ab29b23a fix: fix DDG and Bing parsers — verified with live tests
DuckDuckGo:
- Fixed parser to handle single-quoted class attributes (class='result-link')
- Decode DDG tracking URLs (uddg= parameter) to extract real URLs
- Match snippet extraction to actual DDG Lite HTML structure (</td> terminator)

Bing:
- Switched from HTML scraping (blocked by JS detection) to RSS endpoint
  (?format=rss) which returns parseable XML
- Added JSON API response parsing as fallback
- Returns graceful unresponsive_engines entry when blocked

Live test results:
- DuckDuckGo: 9 results 
- GitHub: 10 results (14,768 total) 
- Bing: 10 results via RSS 
- Reddit: skipped (403 from sandbox, needs browser-like context)
2026-03-21 16:57:02 +00:00

137 lines
3.4 KiB
Go

package engines
import (
"io"
"net/url"
"strings"
"github.com/ashie/gosearch/internal/contracts"
)
// parseDuckDuckGoHTML parses DuckDuckGo Lite's HTML response for search results.
// DDG Lite uses HTML tables with single-quoted class attributes and DDG tracking URLs.
func parseDuckDuckGoHTML(r io.Reader) ([]contracts.MainResult, error) {
body, err := io.ReadAll(r)
if err != nil {
return nil, err
}
content := string(body)
results := make([]contracts.MainResult, 0)
type parsedResult struct {
href string
title string
}
var parsedLinks []parsedResult
remaining := content
for {
// DDG uses single quotes: class='result-link'
idx := strings.Index(remaining, "class='result-link'")
if idx == -1 {
break
}
block := remaining[idx:]
// Extract href from the anchor.
href := extractAttr(block, "href")
if href == "" {
remaining = block[1:]
continue
}
// DDG wraps real URLs in tracking redirect: //duckduckgo.com/l/?uddg=ENCODED_URL
if strings.Contains(href, "duckduckgo.com/l/") || strings.Contains(href, "uddg=") {
if uddgIdx := strings.Index(href, "uddg="); uddgIdx != -1 {
encodedURL := href[uddgIdx+5:]
// Split on & to get just the URL (other params may follow)
if ampIdx := strings.Index(encodedURL, "&"); ampIdx != -1 {
encodedURL = encodedURL[:ampIdx]
}
if decoded, err := url.QueryUnescape(encodedURL); err == nil {
href = decoded
}
}
}
// Skip internal links.
if strings.HasPrefix(href, "/") || strings.HasPrefix(href, "//duckduckgo.com") {
remaining = block[1:]
continue
}
// Extract title — text between > and </a> after the class attribute.
titleStart := strings.Index(block, ">")
if titleStart == -1 {
remaining = block[1:]
continue
}
afterClass := block[titleStart+1:]
titleEnd := strings.Index(afterClass, "</a>")
if titleEnd == -1 {
remaining = block[1:]
continue
}
title := stripHTML(afterClass[:titleEnd])
title = htmlUnescape(title)
if title == "" {
remaining = block[titleStart+1+titleEnd:]
continue
}
parsedLinks = append(parsedLinks, parsedResult{
href: href,
title: title,
})
remaining = block[titleStart+1+titleEnd:]
}
// Extract snippets for each result.
for i, link := range parsedLinks {
snippet := ""
linkIdx := strings.Index(content, link.href)
if linkIdx == -1 {
// Try partial match (the href might be HTML-encoded in the source).
linkIdx = strings.Index(content, url.QueryEscape(link.href))
}
if linkIdx != -1 {
snippetRegion := content[linkIdx:]
if len(snippetRegion) > 2000 {
snippetRegion = snippetRegion[:2000]
}
// DDG uses single quotes: class='result-snippet'
snippetIdx := strings.Index(snippetRegion, "class='result-snippet'")
if snippetIdx != -1 {
snippetBlock := snippetRegion[snippetIdx:]
textStart := strings.Index(snippetBlock, ">")
if textStart != -1 {
textEnd := strings.Index(snippetBlock[textStart:], "</td>")
if textEnd != -1 {
snippet = stripHTML(snippetBlock[textStart+1 : textStart+textEnd])
}
}
}
}
urlPtr := link.href
results = append(results, contracts.MainResult{
Template: "default.html",
Title: link.title,
Content: snippet,
URL: &urlPtr,
Engine: "duckduckgo",
Score: float64(len(parsedLinks) - i),
Category: "general",
Engines: []string{"duckduckgo"},
})
}
return results, nil
}