feat: add DuckDuckGo, GitHub, Reddit, and Bing engines
- DuckDuckGo: scrapes Lite HTML endpoint for results
  - Language-aware region mapping (de→de-de, ja→jp-jp, etc.)
  - HTML parser extracts result links and snippets from DDG Lite markup
  - Shared html_helpers.go with extractAttr, stripHTML, htmlUnescape
- GitHub: uses public Search API (repos, sorted by stars)
  - No auth required (10 req/min unauthenticated)
  - Shows stars, language, topics, last updated date
  - Paginated via GitHub's page parameter
- Reddit: uses public JSON search API
  - Respects safesearch (skips over_18 posts)
  - Shows subreddit, score, comment count
  - Links self-posts to the thread URL
- Bing: scrapes web search HTML (b_algo containers)
  - Extracts titles, URLs, and snippets from Bing's result markup
  - Handles Bing's tracking URL encoding
- Updated factory, config defaults, and config.example.toml
- Full test suite: unit tests for all engines, HTML parsing tests, region mapping tests, live request tests (skipped in short mode)

9 engines total: wikipedia, arxiv, crossref, braveapi, qwant, duckduckgo, github, reddit, bing
This commit is contained in:
parent
28b61ff251
commit
df8fe9474b
14 changed files with 1030 additions and 5 deletions
58
internal/engines/html_helpers.go
Normal file
58
internal/engines/html_helpers.go
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
// extractAttr returns the value of the first attr="value" or attr='value'
// occurrence in the HTML fragment s, or "" when the attribute is absent.
//
// The double-quoted form is searched first, then the single-quoted form.
// The value ends at the next quote of the same kind that opened it, so a
// single-quoted value is not affected by double quotes appearing later in s
// (and vice versa). An unterminated value runs to the end of s.
//
// A match is only accepted when it is not the tail of a longer attribute
// name: looking up "href" skips data-href="...".
func extractAttr(s, attr string) string {
	for _, quote := range []string{`"`, `'`} {
		prefix := attr + "=" + quote
		for from := 0; from <= len(s); {
			rel := strings.Index(s[from:], prefix)
			if rel == -1 {
				break
			}
			idx := from + rel
			// Reject hits inside a longer name such as data-href.
			if idx > 0 && isAttrNameByte(s[idx-1]) {
				from = idx + 1
				continue
			}
			value := s[idx+len(prefix):]
			if end := strings.Index(value, quote); end != -1 {
				value = value[:end]
			}
			return value
		}
	}
	return ""
}

// isAttrNameByte reports whether b can be part of an HTML attribute name as
// commonly written (ASCII letters, digits, '-', '_', ':' and '.').
func isAttrNameByte(b byte) bool {
	switch {
	case b >= 'a' && b <= 'z', b >= 'A' && b <= 'Z', b >= '0' && b <= '9':
		return true
	case b == '-', b == '_', b == ':', b == '.':
		return true
	}
	return false
}
|
||||
|
||||
// stripHTML removes everything between '<' and the following '>' from s and
// returns the remaining text with leading and trailing whitespace trimmed.
//
// A '>' that appears outside of any tag is ordinary text and is kept. A '<'
// always opens a tag, so an unterminated '<' discards the rest of the input.
// Entities are not decoded here.
func stripHTML(s string) string {
	var text strings.Builder
	text.Grow(len(s))
	inTag := false
	for _, r := range s {
		switch {
		case r == '<':
			inTag = true
		case r == '>' && inTag:
			inTag = false
		case !inTag:
			text.WriteRune(r)
		}
	}
	return strings.TrimSpace(text.String())
}
|
||||
|
||||
// htmlEntityReplacer decodes the small set of entities handled by
// htmlUnescape in a single left-to-right pass.
var htmlEntityReplacer = strings.NewReplacer(
	"&amp;", "&",
	"&lt;", "<",
	"&gt;", ">",
	"&quot;", "\"",
	"&#39;", "'",
	"&apos;", "'",
	"&nbsp;", " ",
)

// htmlUnescape decodes the basic HTML entities &amp;, &lt;, &gt;, &quot;,
// &#39;, &apos; and &nbsp; in s. &nbsp; becomes a regular space. Any other
// entity is left untouched.
//
// Decoding is done in one pass, so already-decoded output is never decoded
// again: "&amp;lt;" yields "&lt;", not "<".
func htmlUnescape(s string) string {
	return htmlEntityReplacer.Replace(s)
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue