- DuckDuckGo: scrapes Lite HTML endpoint for results
  - Language-aware region mapping (de→de-de, ja→jp-jp, etc.)
  - HTML parser extracts result links and snippets from DDG Lite markup
  - Shared html_helpers.go with extractAttr, stripHTML, htmlUnescape
- GitHub: uses public Search API (repos, sorted by stars)
  - No auth required (10 req/min unauthenticated)
  - Shows stars, language, topics, last updated date
  - Paginated via GitHub's page parameter
- Reddit: uses public JSON search API
  - Respects safesearch (skips over_18 posts)
  - Shows subreddit, score, comment count
  - Links self-posts to the thread URL
- Bing: scrapes web search HTML (b_algo containers)
  - Extracts titles, URLs, and snippets from Bing's result markup
  - Handles Bing's tracking URL encoding
- Updated factory, config defaults, and config.example.toml
- Full test suite: unit tests for all engines, HTML parsing tests, region mapping tests, live request tests (skipped in short mode)

9 engines total: wikipedia, arxiv, crossref, braveapi, qwant, duckduckgo, github, reddit, bing
112 lines
2.5 KiB
Go
package engines
|
|
|
|
import (
|
|
"io"
|
|
"strings"
|
|
|
|
"github.com/ashie/gosearch/internal/contracts"
|
|
)
|
|
|
|
// parseDuckDuckGoHTML parses DuckDuckGo Lite's HTML response for search results.
|
|
func parseDuckDuckGoHTML(r io.Reader) ([]contracts.MainResult, error) {
|
|
body, err := io.ReadAll(r)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
content := string(body)
|
|
results := make([]contracts.MainResult, 0)
|
|
|
|
type parsedResult struct {
|
|
href string
|
|
title string
|
|
}
|
|
|
|
var parsedLinks []parsedResult
|
|
remaining := content
|
|
|
|
for {
|
|
idx := strings.Index(remaining, `class="result-link"`)
|
|
if idx == -1 {
|
|
break
|
|
}
|
|
|
|
block := remaining[idx:]
|
|
|
|
href := extractAttr(block, "href")
|
|
if href == "" {
|
|
remaining = block[1:]
|
|
continue
|
|
}
|
|
|
|
// Skip DDG internal links.
|
|
if strings.HasPrefix(href, "/") || strings.Contains(href, "duckduckgo.com/l/") {
|
|
remaining = block[1:]
|
|
continue
|
|
}
|
|
|
|
// Extract title — text between > and </a> after the href.
|
|
titleStart := strings.Index(block, ">")
|
|
if titleStart == -1 {
|
|
remaining = block[1:]
|
|
continue
|
|
}
|
|
afterHref := block[titleStart+1:]
|
|
titleEnd := strings.Index(afterHref, "</a>")
|
|
if titleEnd == -1 {
|
|
remaining = block[1:]
|
|
continue
|
|
}
|
|
title := stripHTML(afterHref[:titleEnd])
|
|
title = htmlUnescape(title)
|
|
|
|
parsedLinks = append(parsedLinks, parsedResult{
|
|
href: href,
|
|
title: title,
|
|
})
|
|
|
|
remaining = block[titleStart+1+titleEnd:]
|
|
}
|
|
|
|
// Extract snippets between results.
|
|
for i, link := range parsedLinks {
|
|
snippet := ""
|
|
linkIdx := strings.Index(content, link.href)
|
|
if linkIdx != -1 {
|
|
snippetRegion := content[linkIdx+len(link.href):]
|
|
if len(snippetRegion) > 2000 {
|
|
snippetRegion = snippetRegion[:2000]
|
|
}
|
|
|
|
snippetIdx := strings.Index(snippetRegion, "result-snippet")
|
|
if snippetIdx == -1 {
|
|
snippetIdx = strings.Index(snippetRegion, "result__snippet")
|
|
}
|
|
|
|
if snippetIdx != -1 {
|
|
snippetBlock := snippetRegion[snippetIdx:]
|
|
textStart := strings.Index(snippetBlock, ">")
|
|
if textStart != -1 {
|
|
textEnd := strings.Index(snippetBlock[textStart:], "</")
|
|
if textEnd != -1 {
|
|
snippet = stripHTML(snippetBlock[textStart+1 : textStart+textEnd])
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
urlPtr := link.href
|
|
results = append(results, contracts.MainResult{
|
|
Template: "default.html",
|
|
Title: link.title,
|
|
Content: snippet,
|
|
URL: &urlPtr,
|
|
Engine: "duckduckgo",
|
|
Score: float64(len(parsedLinks) - i),
|
|
Category: "general",
|
|
Engines: []string{"duckduckgo"},
|
|
})
|
|
}
|
|
|
|
return results, nil
|
|
}
|