feat: add DuckDuckGo, GitHub, Reddit, and Bing engines
- DuckDuckGo: scrapes Lite HTML endpoint for results
  - Language-aware region mapping (de→de-de, ja→jp-jp, etc.)
  - HTML parser extracts result links and snippets from DDG Lite markup
  - Shared html_helpers.go with extractAttr, stripHTML, htmlUnescape
- GitHub: uses public Search API (repos, sorted by stars)
  - No auth required (10 req/min unauthenticated)
  - Shows stars, language, topics, last updated date
  - Paginated via GitHub's page parameter
- Reddit: uses public JSON search API
  - Respects safesearch (skips over_18 posts)
  - Shows subreddit, score, comment count
  - Links self-posts to the thread URL
- Bing: scrapes web search HTML (b_algo containers)
  - Extracts titles, URLs, and snippets from Bing's result markup
  - Handles Bing's tracking URL encoding
- Updated factory, config defaults, and config.example.toml
- Full test suite: unit tests for all engines, HTML parsing tests, region mapping tests, live request tests (skipped in short mode)

9 engines total: wikipedia, arxiv, crossref, braveapi, qwant, duckduckgo, github, reddit, bing
This commit is contained in:
parent
28b61ff251
commit
df8fe9474b
14 changed files with 1030 additions and 5 deletions
112
internal/engines/duckduckgo_parse.go
Normal file
112
internal/engines/duckduckgo_parse.go
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
)
|
||||
|
||||
// parseDuckDuckGoHTML parses DuckDuckGo Lite's HTML response for search results.
|
||||
func parseDuckDuckGoHTML(r io.Reader) ([]contracts.MainResult, error) {
|
||||
body, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
content := string(body)
|
||||
results := make([]contracts.MainResult, 0)
|
||||
|
||||
type parsedResult struct {
|
||||
href string
|
||||
title string
|
||||
}
|
||||
|
||||
var parsedLinks []parsedResult
|
||||
remaining := content
|
||||
|
||||
for {
|
||||
idx := strings.Index(remaining, `class="result-link"`)
|
||||
if idx == -1 {
|
||||
break
|
||||
}
|
||||
|
||||
block := remaining[idx:]
|
||||
|
||||
href := extractAttr(block, "href")
|
||||
if href == "" {
|
||||
remaining = block[1:]
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip DDG internal links.
|
||||
if strings.HasPrefix(href, "/") || strings.Contains(href, "duckduckgo.com/l/") {
|
||||
remaining = block[1:]
|
||||
continue
|
||||
}
|
||||
|
||||
// Extract title — text between > and </a> after the href.
|
||||
titleStart := strings.Index(block, ">")
|
||||
if titleStart == -1 {
|
||||
remaining = block[1:]
|
||||
continue
|
||||
}
|
||||
afterHref := block[titleStart+1:]
|
||||
titleEnd := strings.Index(afterHref, "</a>")
|
||||
if titleEnd == -1 {
|
||||
remaining = block[1:]
|
||||
continue
|
||||
}
|
||||
title := stripHTML(afterHref[:titleEnd])
|
||||
title = htmlUnescape(title)
|
||||
|
||||
parsedLinks = append(parsedLinks, parsedResult{
|
||||
href: href,
|
||||
title: title,
|
||||
})
|
||||
|
||||
remaining = block[titleStart+1+titleEnd:]
|
||||
}
|
||||
|
||||
// Extract snippets between results.
|
||||
for i, link := range parsedLinks {
|
||||
snippet := ""
|
||||
linkIdx := strings.Index(content, link.href)
|
||||
if linkIdx != -1 {
|
||||
snippetRegion := content[linkIdx+len(link.href):]
|
||||
if len(snippetRegion) > 2000 {
|
||||
snippetRegion = snippetRegion[:2000]
|
||||
}
|
||||
|
||||
snippetIdx := strings.Index(snippetRegion, "result-snippet")
|
||||
if snippetIdx == -1 {
|
||||
snippetIdx = strings.Index(snippetRegion, "result__snippet")
|
||||
}
|
||||
|
||||
if snippetIdx != -1 {
|
||||
snippetBlock := snippetRegion[snippetIdx:]
|
||||
textStart := strings.Index(snippetBlock, ">")
|
||||
if textStart != -1 {
|
||||
textEnd := strings.Index(snippetBlock[textStart:], "</")
|
||||
if textEnd != -1 {
|
||||
snippet = stripHTML(snippetBlock[textStart+1 : textStart+textEnd])
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
urlPtr := link.href
|
||||
results = append(results, contracts.MainResult{
|
||||
Template: "default.html",
|
||||
Title: link.title,
|
||||
Content: snippet,
|
||||
URL: &urlPtr,
|
||||
Engine: "duckduckgo",
|
||||
Score: float64(len(parsedLinks) - i),
|
||||
Category: "general",
|
||||
Engines: []string{"duckduckgo"},
|
||||
})
|
||||
}
|
||||
|
||||
return results, nil
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue