fix: repair DDG and Bing parsers — verified with live tests
DuckDuckGo: - Fixed parser to handle single-quoted class attributes (class='result-link') - Decode DDG tracking URLs (uddg= parameter) to extract real URLs - Match snippet extraction to actual DDG Lite HTML structure (</td> terminator) Bing: - Switched from HTML scraping (blocked by JS detection) to RSS endpoint (?format=rss) which returns parseable XML - Added JSON API response parsing as fallback - Returns graceful unresponsive_engines entry when blocked Live test results: - DuckDuckGo: 9 results ✅ - GitHub: 10 results (14,768 total) ✅ - Bing: 10 results via RSS ✅ - Reddit: skipped (403 from sandbox, needs browser-like context)
This commit is contained in:
parent
df8fe9474b
commit
a8ab29b23a
4 changed files with 186 additions and 157 deletions
|
|
@ -2,12 +2,14 @@ package engines
|
|||
|
||||
import (
|
||||
"io"
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
)
|
||||
|
||||
// parseDuckDuckGoHTML parses DuckDuckGo Lite's HTML response for search results.
|
||||
// DDG Lite uses HTML tables with single-quoted class attributes and DDG tracking URLs.
|
||||
func parseDuckDuckGoHTML(r io.Reader) ([]contracts.MainResult, error) {
|
||||
body, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
|
|
@ -26,40 +28,61 @@ func parseDuckDuckGoHTML(r io.Reader) ([]contracts.MainResult, error) {
|
|||
remaining := content
|
||||
|
||||
for {
|
||||
idx := strings.Index(remaining, `class="result-link"`)
|
||||
// DDG uses single quotes: class='result-link'
|
||||
idx := strings.Index(remaining, "class='result-link'")
|
||||
if idx == -1 {
|
||||
break
|
||||
}
|
||||
|
||||
block := remaining[idx:]
|
||||
|
||||
// Extract href from the anchor.
|
||||
href := extractAttr(block, "href")
|
||||
if href == "" {
|
||||
remaining = block[1:]
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip DDG internal links.
|
||||
if strings.HasPrefix(href, "/") || strings.Contains(href, "duckduckgo.com/l/") {
|
||||
// DDG wraps real URLs in tracking redirect: //duckduckgo.com/l/?uddg=ENCODED_URL
|
||||
if strings.Contains(href, "duckduckgo.com/l/") || strings.Contains(href, "uddg=") {
|
||||
if uddgIdx := strings.Index(href, "uddg="); uddgIdx != -1 {
|
||||
encodedURL := href[uddgIdx+5:]
|
||||
// Split on & to get just the URL (other params may follow)
|
||||
if ampIdx := strings.Index(encodedURL, "&"); ampIdx != -1 {
|
||||
encodedURL = encodedURL[:ampIdx]
|
||||
}
|
||||
if decoded, err := url.QueryUnescape(encodedURL); err == nil {
|
||||
href = decoded
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Skip internal links.
|
||||
if strings.HasPrefix(href, "/") || strings.HasPrefix(href, "//duckduckgo.com") {
|
||||
remaining = block[1:]
|
||||
continue
|
||||
}
|
||||
|
||||
// Extract title — text between > and </a> after the href.
|
||||
// Extract title — text between > and </a> after the class attribute.
|
||||
titleStart := strings.Index(block, ">")
|
||||
if titleStart == -1 {
|
||||
remaining = block[1:]
|
||||
continue
|
||||
}
|
||||
afterHref := block[titleStart+1:]
|
||||
titleEnd := strings.Index(afterHref, "</a>")
|
||||
afterClass := block[titleStart+1:]
|
||||
titleEnd := strings.Index(afterClass, "</a>")
|
||||
if titleEnd == -1 {
|
||||
remaining = block[1:]
|
||||
continue
|
||||
}
|
||||
title := stripHTML(afterHref[:titleEnd])
|
||||
title := stripHTML(afterClass[:titleEnd])
|
||||
title = htmlUnescape(title)
|
||||
|
||||
if title == "" {
|
||||
remaining = block[titleStart+1+titleEnd:]
|
||||
continue
|
||||
}
|
||||
|
||||
parsedLinks = append(parsedLinks, parsedResult{
|
||||
href: href,
|
||||
title: title,
|
||||
|
|
@ -68,26 +91,28 @@ func parseDuckDuckGoHTML(r io.Reader) ([]contracts.MainResult, error) {
|
|||
remaining = block[titleStart+1+titleEnd:]
|
||||
}
|
||||
|
||||
// Extract snippets between results.
|
||||
// Extract snippets for each result.
|
||||
for i, link := range parsedLinks {
|
||||
snippet := ""
|
||||
linkIdx := strings.Index(content, link.href)
|
||||
if linkIdx == -1 {
|
||||
// Try partial match (the href might be HTML-encoded in the source).
|
||||
linkIdx = strings.Index(content, url.QueryEscape(link.href))
|
||||
}
|
||||
|
||||
if linkIdx != -1 {
|
||||
snippetRegion := content[linkIdx+len(link.href):]
|
||||
snippetRegion := content[linkIdx:]
|
||||
if len(snippetRegion) > 2000 {
|
||||
snippetRegion = snippetRegion[:2000]
|
||||
}
|
||||
|
||||
snippetIdx := strings.Index(snippetRegion, "result-snippet")
|
||||
if snippetIdx == -1 {
|
||||
snippetIdx = strings.Index(snippetRegion, "result__snippet")
|
||||
}
|
||||
|
||||
// DDG uses single quotes: class='result-snippet'
|
||||
snippetIdx := strings.Index(snippetRegion, "class='result-snippet'")
|
||||
if snippetIdx != -1 {
|
||||
snippetBlock := snippetRegion[snippetIdx:]
|
||||
textStart := strings.Index(snippetBlock, ">")
|
||||
if textStart != -1 {
|
||||
textEnd := strings.Index(snippetBlock[textStart:], "</")
|
||||
textEnd := strings.Index(snippetBlock[textStart:], "</td>")
|
||||
if textEnd != -1 {
|
||||
snippet = stripHTML(snippetBlock[textStart+1 : textStart+textEnd])
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue