feat: add DuckDuckGo, GitHub, Reddit, and Bing engines

- DuckDuckGo: scrapes Lite HTML endpoint for results
  - Language-aware region mapping (de→de-de, ja→jp-jp, etc.)
  - HTML parser extracts result links and snippets from DDG Lite markup
  - Shared html_helpers.go with extractAttr, stripHTML, htmlUnescape
- GitHub: uses public Search API (repos, sorted by stars)
  - No auth required (10 req/min unauthenticated)
  - Shows stars, language, topics, last updated date
  - Paginated via GitHub's page parameter
- Reddit: uses public JSON search API
  - Respects safesearch (skips over_18 posts)
  - Shows subreddit, score, comment count
  - Links self-posts to the thread URL
- Bing: scrapes web search HTML (b_algo containers)
  - Extracts titles, URLs, and snippets from Bing's result markup
  - Handles Bing's tracking URL encoding
- Updated factory, config defaults, and config.example.toml
- Full test suite: unit tests for all engines, HTML parsing tests, region mapping tests, live request tests (skipped in short mode)

9 engines total: wikipedia, arxiv, crossref, braveapi, qwant, duckduckgo, github, reddit, bing
2026-03-21 16:52:11 +00:00 · 2026-03-21 16:52:11 +00:00 · df8fe9474b
commit df8fe9474b
parent 28b61ff251
14 changed files with 1030 additions and 5 deletions
--- a/internal/engines/bing.go
+++ b/internal/engines/bing.go
@ -0,0 +1,182 @@
+package engines
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"strings"
+
+	"github.com/ashie/gosearch/internal/contracts"
+)
+
+// BingEngine searches Bing via the public search endpoint.
+// Uses Bing's web search results page and extracts results from the HTML.
+// The zero value has no HTTP client; Search returns an error in that state.
+type BingEngine struct {
+	// client performs the upstream HTTP requests issued by Search.
+	client *http.Client
+}
+
+// Name returns the identifier of this engine, "bing".
+func (e *BingEngine) Name() string {
+	return "bing"
+}
+
+// Search queries Bing's HTML results page for req.Query and returns the
+// results extracted from the markup.
+//
+// A blank query yields an empty response without contacting Bing. An error is
+// returned when the engine has no HTTP client, when the request cannot be
+// built or sent, when Bing answers with a non-200 status, or when the response
+// body cannot be read.
+func (e *BingEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
+	if strings.TrimSpace(req.Query) == "" {
+		return contracts.SearchResponse{Query: req.Query}, nil
+	}
+	if e == nil || e.client == nil {
+		return contracts.SearchResponse{}, errors.New("bing engine not initialized")
+	}
+
+	// Treat a missing or non-positive page number as the first page so the
+	// computed result index can never go negative.
+	page := req.Pageno
+	if page < 1 {
+		page = 1
+	}
+
+	// Bing's results page paginates with the 1-based "first" parameter (index
+	// of the first result to show). "offset" is a parameter of the Bing Web
+	// Search JSON API, not of the HTML page.
+	endpoint := fmt.Sprintf(
+		"https://www.bing.com/search?q=%s&count=10&first=%d",
+		url.QueryEscape(req.Query),
+		(page-1)*10+1,
+	)
+
+	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
+	if err != nil {
+		return contracts.SearchResponse{}, fmt.Errorf("bing: build request: %w", err)
+	}
+	// Send a desktop-browser User-Agent; presumably this keeps Bing serving
+	// the regular desktop markup that parseBingHTML expects.
+	httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+
+	resp, err := e.client.Do(httpReq)
+	if err != nil {
+		return contracts.SearchResponse{}, fmt.Errorf("bing: send request: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		// Best effort: the body excerpt only enriches the error message, so a
+		// failed read is deliberately ignored.
+		body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+		return contracts.SearchResponse{}, fmt.Errorf("bing upstream error: status=%d body=%q", resp.StatusCode, string(body))
+	}
+
+	results, err := parseBingHTML(resp.Body)
+	if err != nil {
+		return contracts.SearchResponse{}, fmt.Errorf("bing: read response: %w", err)
+	}
+
+	// Optional sections are set to empty (non-nil) collections rather than
+	// left as nil.
+	return contracts.SearchResponse{
+		Query:               req.Query,
+		NumberOfResults:     len(results),
+		Results:             results,
+		Answers:             []map[string]any{},
+		Corrections:         []string{},
+		Infoboxes:           []map[string]any{},
+		Suggestions:         []string{},
+		UnresponsiveEngines: [][2]string{},
+	}, nil
+}
+
+// parseBingHTML reads the whole document from r and collects the search
+// results found in it.
+//
+// The markup is cut at every `class="b_algo"` marker; each piece after the
+// first is treated as one result container. A container is truncated at the
+// earliest terminator marker it contains, and is skipped when no title or no
+// link can be extracted from it. The returned slice is never nil on success.
+func parseBingHTML(r io.Reader) ([]contracts.MainResult, error) {
+	raw, err := io.ReadAll(r)
+	if err != nil {
+		return nil, err
+	}
+
+	// The piece before the first marker precedes any result container, so
+	// iteration starts at the second piece.
+	segments := strings.Split(string(raw), `class="b_algo"`)
+	stopMarkers := []string{`class="b_algo"`, `id="b_results"`, `id="b_footer"`}
+	results := make([]contracts.MainResult, 0)
+
+	for _, segment := range segments[1:] {
+		// Keep only the part of the segment that precedes the earliest
+		// terminator; a marker at position 0 is not treated as a terminator.
+		cut := len(segment)
+		for _, marker := range stopMarkers {
+			pos := strings.Index(segment, marker)
+			if pos > 0 && pos < cut {
+				cut = pos
+			}
+		}
+		segment = segment[:cut]
+
+		title, link := extractBingLink(segment)
+		if title == "" || link == "" {
+			continue
+		}
+
+		results = append(results, contracts.MainResult{
+			Template: "default.html",
+			Title:    title,
+			Content:  extractBingSnippet(segment),
+			URL:      &link, // link is declared per iteration, so each result owns its own string
+			Engine:   "bing",
+			Score:    0,
+			Category: "general",
+			Engines:  []string{"bing"},
+		})
+	}
+
+	return results, nil
+}
+
+func extractBingLink(block string) (title, href string) {
+	// Find <a href="...">
+	hrefStart := strings.Index(block, `href="`)
+	if hrefStart == -1 {
+		return "", ""
+	}
+	hrefStart += 6
+	hrefEnd := strings.Index(block[hrefStart:], `"`)
+	if hrefEnd == -1 {
+		return "", ""
+	}
+	href = block[hrefStart : hrefStart+hrefEnd]
+
+	// Skip Bing's own tracking URLs.
+	if strings.Contains(href, "bing.com") && strings.Contains(href, "search?") {
+		// Try to extract the real URL from u= parameter.
+		if uIdx := strings.Index(href, "&u="); uIdx != -1 {
+			encodedURL := href[uIdx+3:]
+			if decoded, err := url.QueryUnescape(encodedURL); err == nil {
+				href = decoded
+			}
+		}
+	}
+
+	// Title is between > and </a> after the href.
+	titleStart := strings.Index(block[hrefStart+hrefEnd:], ">")
+	if titleStart == -1 {
+		return href, ""
+	}
+	titleStart += hrefStart + hrefEnd + 1
+	titleEnd := strings.Index(block[titleStart:], "</a>")
+	if titleEnd == -1 {
+		return href, ""
+	}
+	title = stripHTML(block[titleStart : titleStart+titleEnd])
+	title = strings.TrimSpace(title)
+
+	return title, href
+}
+
+// extractBingSnippet returns the description text of a result block, or ""
+// when no paragraph can be found.
+//
+// Two scopes are searched in order: the markup from `class="b_caption"`
+// onward (when present), then the whole block. In each scope the span from the
+// first "<p" through the following "</p>" is handed to stripHTML; a scope
+// without such a complete span is skipped.
+func extractBingSnippet(block string) string {
+	scopes := make([]string, 0, 2)
+	if at := strings.Index(block, `class="b_caption"`); at != -1 {
+		scopes = append(scopes, block[at:])
+	}
+	scopes = append(scopes, block)
+
+	for _, scope := range scopes {
+		open := strings.Index(scope, "<p")
+		if open == -1 {
+			continue
+		}
+		paragraph := scope[open:]
+		closeAt := strings.Index(paragraph, "</p>")
+		if closeAt == -1 {
+			continue
+		}
+		// Include the closing tag itself in the span passed to stripHTML.
+		return stripHTML(paragraph[:closeAt+len("</p>")])
+	}
+
+	return ""
+}