feat: add DuckDuckGo, GitHub, Reddit, and Bing engines
- DuckDuckGo: scrapes the Lite HTML endpoint for results
  - Language-aware region mapping (de→de-de, ja→jp-jp, etc.)
  - HTML parser extracts result links and snippets from DDG Lite markup
  - Shared html_helpers.go with extractAttr, stripHTML, htmlUnescape
- GitHub: uses the public Search API (repos, sorted by stars)
  - No auth required (10 req/min unauthenticated)
  - Shows stars, language, topics, last updated date
  - Paginated via GitHub's page parameter
- Reddit: uses the public JSON search API
  - Respects safesearch (skips over_18 posts)
  - Shows subreddit, score, comment count
  - Links self-posts to the thread URL
- Bing: scrapes the web search HTML (b_algo containers)
  - Extracts titles, URLs, and snippets from Bing's result markup
  - Handles Bing's tracking URL encoding
- Updated factory, config defaults, and config.example.toml
- Full test suite: unit tests for all engines, HTML parsing tests, region mapping tests, live request tests (skipped in short mode)

9 engines total: wikipedia, arxiv, crossref, braveapi, qwant, duckduckgo, github, reddit, bing
This commit is contained in:
parent
28b61ff251
commit
df8fe9474b
14 changed files with 1030 additions and 5 deletions
182
internal/engines/bing.go
Normal file
182
internal/engines/bing.go
Normal file
|
|
@ -0,0 +1,182 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
)
|
||||
|
||||
// BingEngine is a search engine backed by Bing's public web search page.
// It requests the HTML results page and extracts the results from the markup.
type BingEngine struct {
	// client performs the outgoing HTTP requests. Search reports an error
	// when it is nil.
	client *http.Client
}
|
||||
|
||||
// Name reports the engine's identifier, the constant string "bing".
func (e *BingEngine) Name() string {
	const engineName = "bing"
	return engineName
}
|
||||
|
||||
func (e *BingEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
|
||||
if strings.TrimSpace(req.Query) == "" {
|
||||
return contracts.SearchResponse{Query: req.Query}, nil
|
||||
}
|
||||
if e == nil || e.client == nil {
|
||||
return contracts.SearchResponse{}, errors.New("bing engine not initialized")
|
||||
}
|
||||
|
||||
endpoint := fmt.Sprintf(
|
||||
"https://www.bing.com/search?q=%s&count=10&offset=%d",
|
||||
url.QueryEscape(req.Query),
|
||||
(req.Pageno-1)*10,
|
||||
)
|
||||
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
||||
|
||||
resp, err := e.client.Do(httpReq)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
|
||||
return contracts.SearchResponse{}, fmt.Errorf("bing upstream error: status=%d body=%q", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
results, err := parseBingHTML(resp.Body)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
NumberOfResults: len(results),
|
||||
Results: results,
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: []string{},
|
||||
UnresponsiveEngines: [][2]string{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// parseBingHTML extracts search results from Bing's HTML response.
|
||||
// Bing results are in <li class="b_algo"> elements containing <h2><a href="...">Title</a></h2>
|
||||
// and <p> or <div class="b_caption"> for snippets.
|
||||
func parseBingHTML(r io.Reader) ([]contracts.MainResult, error) {
|
||||
body, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
content := string(body)
|
||||
results := make([]contracts.MainResult, 0)
|
||||
|
||||
// Split on b_algo result containers.
|
||||
parts := strings.Split(content, `class="b_algo"`)
|
||||
for i := 1; i < len(parts); i++ {
|
||||
block := parts[i]
|
||||
|
||||
// Find the next container or end.
|
||||
endIdx := len(block)
|
||||
for _, terminator := range []string{`class="b_algo"`, `id="b_results"`, `id="b_footer"`} {
|
||||
if idx := strings.Index(block, terminator); idx > 0 && idx < endIdx {
|
||||
endIdx = idx
|
||||
}
|
||||
}
|
||||
block = block[:endIdx]
|
||||
|
||||
// Extract title and URL from <h2><a href="...">
|
||||
title, href := extractBingLink(block)
|
||||
if title == "" || href == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Extract snippet from <p> or <div class="b_caption"><p>
|
||||
snippet := extractBingSnippet(block)
|
||||
|
||||
results = append(results, contracts.MainResult{
|
||||
Template: "default.html",
|
||||
Title: title,
|
||||
Content: snippet,
|
||||
URL: &href,
|
||||
Engine: "bing",
|
||||
Score: 0,
|
||||
Category: "general",
|
||||
Engines: []string{"bing"},
|
||||
})
|
||||
}
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
func extractBingLink(block string) (title, href string) {
|
||||
// Find <a href="...">
|
||||
hrefStart := strings.Index(block, `href="`)
|
||||
if hrefStart == -1 {
|
||||
return "", ""
|
||||
}
|
||||
hrefStart += 6
|
||||
hrefEnd := strings.Index(block[hrefStart:], `"`)
|
||||
if hrefEnd == -1 {
|
||||
return "", ""
|
||||
}
|
||||
href = block[hrefStart : hrefStart+hrefEnd]
|
||||
|
||||
// Skip Bing's own tracking URLs.
|
||||
if strings.Contains(href, "bing.com") && strings.Contains(href, "search?") {
|
||||
// Try to extract the real URL from u= parameter.
|
||||
if uIdx := strings.Index(href, "&u="); uIdx != -1 {
|
||||
encodedURL := href[uIdx+3:]
|
||||
if decoded, err := url.QueryUnescape(encodedURL); err == nil {
|
||||
href = decoded
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Title is between > and </a> after the href.
|
||||
titleStart := strings.Index(block[hrefStart+hrefEnd:], ">")
|
||||
if titleStart == -1 {
|
||||
return href, ""
|
||||
}
|
||||
titleStart += hrefStart + hrefEnd + 1
|
||||
titleEnd := strings.Index(block[titleStart:], "</a>")
|
||||
if titleEnd == -1 {
|
||||
return href, ""
|
||||
}
|
||||
title = stripHTML(block[titleStart : titleStart+titleEnd])
|
||||
title = strings.TrimSpace(title)
|
||||
|
||||
return title, href
|
||||
}
|
||||
|
||||
func extractBingSnippet(block string) string {
|
||||
// Try <div class="b_caption"><p> first.
|
||||
if idx := strings.Index(block, `class="b_caption"`); idx != -1 {
|
||||
caption := block[idx:]
|
||||
if pStart := strings.Index(caption, "<p"); pStart != -1 {
|
||||
snippet := caption[pStart:]
|
||||
if pEnd := strings.Index(snippet, "</p>"); pEnd != -1 {
|
||||
return stripHTML(snippet[:pEnd+4])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: any <p> tag.
|
||||
if pStart := strings.Index(block, "<p"); pStart != -1 {
|
||||
snippet := block[pStart:]
|
||||
if pEnd := strings.Index(snippet, "</p>"); pEnd != -1 {
|
||||
return stripHTML(snippet[:pEnd+4])
|
||||
}
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue