kafka/internal/engines/duckduckgo_parse.go
Franz Kafka df8fe9474b feat: add DuckDuckGo, GitHub, Reddit, and Bing engines
- DuckDuckGo: scrapes Lite HTML endpoint for results
  - Language-aware region mapping (de→de-de, ja→jp-jp, etc.)
  - HTML parser extracts result links and snippets from DDG Lite markup
  - Shared html_helpers.go with extractAttr, stripHTML, htmlUnescape

- GitHub: uses public Search API (repos, sorted by stars)
  - No auth required (10 req/min unauthenticated)
  - Shows stars, language, topics, last updated date
  - Paginated via GitHub's page parameter

- Reddit: uses public JSON search API
  - Respects safesearch (skips over_18 posts)
  - Shows subreddit, score, comment count
  - Links self-posts to the thread URL

- Bing: scrapes web search HTML (b_algo containers)
  - Extracts titles, URLs, and snippets from Bing's result markup
  - Handles Bing's tracking URL encoding

- Updated factory, config defaults, and config.example.toml
- Full test suite: unit tests for all engines, HTML parsing tests,
  region mapping tests, live request tests (skipped in short mode)

9 engines total: wikipedia, arxiv, crossref, braveapi, qwant,
duckduckgo, github, reddit, bing
2026-03-21 16:52:11 +00:00

112 lines
2.5 KiB
Go

package engines
import (
"io"
"strings"
"github.com/ashie/gosearch/internal/contracts"
)
// parseDuckDuckGoHTML parses DuckDuckGo Lite's HTML response for search results.
//
// It scans the document for anchors marked class="result-link", extracts each
// result's href and link text, skips DDG-internal links, then pairs every link
// with the snippet text ("result-snippet" / "result__snippet") that follows it
// in the page. Results are scored in document order (earlier results higher).
//
// This is a lightweight string scanner, not a full HTML parser; malformed
// blocks are skipped rather than causing an error. The only error returned
// comes from reading r.
func parseDuckDuckGoHTML(r io.Reader) ([]contracts.MainResult, error) {
	body, err := io.ReadAll(r)
	if err != nil {
		return nil, err
	}
	content := string(body)

	const linkMarker = `class="result-link"`

	type parsedResult struct {
		href  string
		title string
	}
	var parsedLinks []parsedResult

	remaining := content
	for {
		idx := strings.Index(remaining, linkMarker)
		if idx == -1 {
			break
		}
		block := remaining[idx:]
		// Default advance: move past the matched marker so a skipped or
		// malformed block is never rescanned. (Previously skip paths
		// advanced one byte at a time, re-searching the same region.)
		remaining = block[len(linkMarker):]

		href := extractAttr(block, "href")
		if href == "" {
			continue
		}
		// Skip DDG internal links (relative paths and redirect wrappers).
		if strings.HasPrefix(href, "/") || strings.Contains(href, "duckduckgo.com/l/") {
			continue
		}
		// Title is the text between the anchor tag's ">" and "</a>".
		titleStart := strings.Index(block, ">")
		if titleStart == -1 {
			continue
		}
		afterHref := block[titleStart+1:]
		titleEnd := strings.Index(afterHref, "</a>")
		if titleEnd == -1 {
			continue
		}
		title := htmlUnescape(stripHTML(afterHref[:titleEnd]))
		parsedLinks = append(parsedLinks, parsedResult{href: href, title: title})
		// Successful parse: resume scanning after the closing </a>.
		remaining = afterHref[titleEnd:]
	}

	// Pair each link with the snippet that follows it in the page.
	results := make([]contracts.MainResult, 0, len(parsedLinks))
	for i, link := range parsedLinks {
		snippet := ""
		// NOTE(review): for duplicate hrefs this finds the first occurrence
		// in the page, so later duplicates reuse the first snippet.
		if linkIdx := strings.Index(content, link.href); linkIdx != -1 {
			// Only look at a bounded window after the link so we never
			// attribute a far-away snippet to this result.
			region := content[linkIdx+len(link.href):]
			if len(region) > 2000 {
				region = region[:2000]
			}
			snippetIdx := strings.Index(region, "result-snippet")
			if snippetIdx == -1 {
				snippetIdx = strings.Index(region, "result__snippet")
			}
			if snippetIdx != -1 {
				snippetBlock := region[snippetIdx:]
				if textStart := strings.Index(snippetBlock, ">"); textStart != -1 {
					if textEnd := strings.Index(snippetBlock[textStart:], "</"); textEnd != -1 {
						// Unescape entities for consistency with titles
						// (previously snippets were left HTML-escaped).
						snippet = htmlUnescape(stripHTML(snippetBlock[textStart+1 : textStart+textEnd]))
					}
				}
			}
		}
		urlCopy := link.href
		results = append(results, contracts.MainResult{
			Template: "default.html",
			Title:    link.title,
			Content:  snippet,
			URL:      &urlCopy,
			Engine:   "duckduckgo",
			Score:    float64(len(parsedLinks) - i),
			Category: "general",
			Engines:  []string{"duckduckgo"},
		})
	}
	return results, nil
}