feat: add DuckDuckGo, GitHub, Reddit, and Bing engines

- DuckDuckGo: scrapes Lite HTML endpoint for results
  - Language-aware region mapping (de→de-de, ja→jp-jp, etc.)
  - HTML parser extracts result links and snippets from DDG Lite markup
  - Shared html_helpers.go with extractAttr, stripHTML, htmlUnescape

- GitHub: uses public Search API (repos, sorted by stars)
  - No auth required (10 req/min unauthenticated)
  - Shows stars, language, topics, last updated date
  - Paginated via GitHub's page parameter

- Reddit: uses public JSON search API
  - Respects safesearch (skips over_18 posts)
  - Shows subreddit, score, comment count
  - Links self-posts to the thread URL

- Bing: scrapes web search HTML (b_algo containers)
  - Extracts titles, URLs, and snippets from Bing's result markup
  - Handles Bing's tracking URL encoding

- Updated factory, config defaults, and config.example.toml
- Full test suite: unit tests for all engines, HTML parsing tests,
  region mapping tests, live request tests (skipped in short mode)

9 engines total: wikipedia, arxiv, crossref, braveapi, qwant,
duckduckgo, github, reddit, bing
This commit is contained in:
Franz Kafka 2026-03-21 16:52:11 +00:00
parent 28b61ff251
commit df8fe9474b
14 changed files with 1030 additions and 5 deletions

View file

@ -0,0 +1,58 @@
package engines
import (
"strings"
)
// extractAttr returns the value of the first attr="value" or attr='value'
// occurrence in the HTML fragment s, or "" when neither form is present.
//
// The double-quoted form is searched first; the single-quoted form is only
// tried when s contains no double-quoted occurrence. The value ends at the
// next quote of the same kind that opened it, so a value may contain the
// other quote character. If the closing quote is missing, the remainder of s
// is returned.
//
// Matching is a plain substring search: attr also matches when it is the
// suffix of a longer attribute name (e.g. "href" inside "data-href").
// Entities inside the value are not decoded.
func extractAttr(s, attr string) string {
	for _, quote := range []string{`"`, "'"} {
		prefix := attr + "=" + quote
		idx := strings.Index(s, prefix)
		if idx == -1 {
			continue
		}
		value := s[idx+len(prefix):]
		// Terminate only on the quote that opened the value; stopping at
		// whichever quote character comes first would either truncate a
		// value containing the other quote or run on into a later attribute.
		if end := strings.Index(value, quote); end != -1 {
			return value[:end]
		}
		return value
	}
	return ""
}
// stripHTML removes HTML tags from s and returns the remaining text with
// leading and trailing whitespace trimmed.
//
// Everything from a '<' up to and including the next '>' is discarded. A '>'
// that appears outside of a tag is ordinary text (as in "a > b") and is kept.
// Entities are not decoded, and text following an unterminated '<' is dropped.
func stripHTML(s string) string {
	var text strings.Builder
	text.Grow(len(s))
	inTag := false
	for _, r := range s {
		switch {
		case r == '<':
			inTag = true
		case r == '>' && inTag:
			// Only a '>' that closes an open tag is markup.
			inTag = false
		case !inTag:
			text.WriteRune(r)
		}
	}
	return strings.TrimSpace(text.String())
}
// htmlEntityReplacer decodes the small set of entities handled by
// htmlUnescape in a single left-to-right pass over the input.
var htmlEntityReplacer = strings.NewReplacer(
	"&amp;", "&",
	"&lt;", "<",
	"&gt;", ">",
	"&quot;", "\"",
	"&#39;", "'",
	"&nbsp;", " ",
)

// htmlUnescape decodes the basic HTML entities &amp;, &lt;, &gt;, &quot;,
// &#39; and &nbsp; in s. Any other entity is left untouched.
//
// &nbsp; becomes a regular space rather than U+00A0. Each entity is decoded
// exactly once: the output of one replacement is never rescanned, so
// "&amp;lt;" yields "&lt;" and not "<".
func htmlUnescape(s string) string {
	return htmlEntityReplacer.Replace(s)
}