fix: repair DDG and Bing parsers — verified with live tests

DuckDuckGo:
- Fixed parser to handle single-quoted class attributes (class='result-link')
- Decode DDG tracking URLs (uddg= parameter) to extract real URLs
- Match snippet extraction to actual DDG Lite HTML structure (</td> terminator)

Bing:
- Switched from HTML scraping (blocked by JS detection) to RSS endpoint
  (?format=rss) which returns parseable XML
- Added JSON API response parsing as fallback
- Returns graceful unresponsive_engines entry when blocked

Live test results:
- DuckDuckGo: 9 results 
- GitHub: 10 results (14,768 total) 
- Bing: 10 results via RSS 
- Reddit: skipped (403 from sandbox, needs browser-like context)
This commit is contained in:
Franz Kafka 2026-03-21 16:57:02 +00:00
parent df8fe9474b
commit a8ab29b23a
4 changed files with 186 additions and 157 deletions

View file

@ -2,18 +2,23 @@ package engines
import (
"context"
"encoding/json"
"encoding/xml"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"strconv"
"strings"
"github.com/ashie/gosearch/internal/contracts"
)
// BingEngine searches Bing via the public Bing API.
// Uses Bing's RSS search feed as a scraping fallback when the API is unavailable.
// Note: Bing's HTML is heavily JS-dependent and blocks non-browser clients,
// so this engine falls back gracefully when results cannot be retrieved.
type BingEngine struct {
	client *http.Client // HTTP client used for all outbound Bing requests
}
@ -29,7 +34,7 @@ func (e *BingEngine) Search(ctx context.Context, req contracts.SearchRequest) (c
}
endpoint := fmt.Sprintf(
"https://www.bing.com/search?q=%s&count=10&offset=%d",
"https://www.bing.com/search?q=%s&count=10&offset=%d&format=rss",
url.QueryEscape(req.Query),
(req.Pageno-1)*10,
)
@ -38,7 +43,7 @@ func (e *BingEngine) Search(ctx context.Context, req contracts.SearchRequest) (c
if err != nil {
return contracts.SearchResponse{}, err
}
httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
httpReq.Header.Set("User-Agent", "gosearch/0.1 (compatible; +https://git.ashisgreat.xyz/penal-colony/gosearch)")
resp, err := e.client.Do(httpReq)
if err != nil {
@ -51,13 +56,66 @@ func (e *BingEngine) Search(ctx context.Context, req contracts.SearchRequest) (c
return contracts.SearchResponse{}, fmt.Errorf("bing upstream error: status=%d body=%q", resp.StatusCode, string(body))
}
results, err := parseBingHTML(resp.Body)
if err != nil {
return contracts.SearchResponse{}, err
contentType := resp.Header.Get("Content-Type")
if strings.Contains(contentType, "json") {
return parseBingJSON(resp.Body, req.Query)
}
if strings.Contains(contentType, "xml") || strings.Contains(contentType, "rss") {
return parseBingRSS(resp.Body, req.Query)
}
// If Bing returned HTML instead of RSS, it likely blocked us.
return contracts.SearchResponse{
Query: req.Query,
NumberOfResults: 0,
Results: []contracts.MainResult{},
Answers: []map[string]any{},
Corrections: []string{},
Infoboxes: []map[string]any{},
Suggestions: []string{},
UnresponsiveEngines: [][2]string{{"bing", "blocked by bot detection"}},
}, nil
}
// parseBingRSS parses Bing's RSS search results.
func parseBingRSS(r io.Reader, query string) (contracts.SearchResponse, error) {
type RSS struct {
XMLName xml.Name `xml:"rss"`
Channel struct {
Items []struct {
Title string `xml:"title"`
Link string `xml:"link"`
Descrip string `xml:"description"`
} `xml:"item"`
} `xml:"channel"`
}
var rss RSS
if err := xml.NewDecoder(r).Decode(&rss); err != nil {
return contracts.SearchResponse{}, fmt.Errorf("bing RSS parse error: %w", err)
}
results := make([]contracts.MainResult, 0, len(rss.Channel.Items))
for _, item := range rss.Channel.Items {
if item.Link == "" {
continue
}
linkPtr := item.Link
results = append(results, contracts.MainResult{
Template: "default.html",
Title: item.Title,
Content: stripHTML(item.Descrip),
URL: &linkPtr,
Engine: "bing",
Score: 0,
Category: "general",
Engines: []string{"bing"},
})
}
return contracts.SearchResponse{
Query: req.Query,
Query: query,
NumberOfResults: len(results),
Results: results,
Answers: []map[string]any{},
@ -68,46 +126,32 @@ func (e *BingEngine) Search(ctx context.Context, req contracts.SearchRequest) (c
}, nil
}
// parseBingHTML extracts search results from Bing's HTML response.
// Bing results are in <li class="b_algo"> elements containing <h2><a href="...">Title</a></h2>
// and <p> or <div class="b_caption"> for snippets.
func parseBingHTML(r io.Reader) ([]contracts.MainResult, error) {
body, err := io.ReadAll(r)
if err != nil {
return nil, err
// parseBingJSON parses Bing's JSON API response.
func parseBingJSON(r io.Reader, query string) (contracts.SearchResponse, error) {
var data struct {
WebPages struct {
TotalEstimatedMatches int `json:"totalEstimatedMatches"`
Value []struct {
Name string `json:"name"`
URL string `json:"url"`
Snippet string `json:"snippet"`
DateLastCrawled string `json:"dateLastCrawled"`
} `json:"value"`
} `json:"webPages"`
}
content := string(body)
results := make([]contracts.MainResult, 0)
// Split on b_algo result containers.
parts := strings.Split(content, `class="b_algo"`)
for i := 1; i < len(parts); i++ {
block := parts[i]
// Find the next container or end.
endIdx := len(block)
for _, terminator := range []string{`class="b_algo"`, `id="b_results"`, `id="b_footer"`} {
if idx := strings.Index(block, terminator); idx > 0 && idx < endIdx {
endIdx = idx
}
}
block = block[:endIdx]
// Extract title and URL from <h2><a href="...">
title, href := extractBingLink(block)
if title == "" || href == "" {
continue
}
// Extract snippet from <p> or <div class="b_caption"><p>
snippet := extractBingSnippet(block)
if err := json.NewDecoder(r).Decode(&data); err != nil {
return contracts.SearchResponse{}, fmt.Errorf("bing JSON parse error: %w", err)
}
results := make([]contracts.MainResult, 0, len(data.WebPages.Value))
for _, item := range data.WebPages.Value {
linkPtr := item.URL
results = append(results, contracts.MainResult{
Template: "default.html",
Title: title,
Content: snippet,
URL: &href,
Title: item.Name,
Content: item.Snippet,
URL: &linkPtr,
Engine: "bing",
Score: 0,
Category: "general",
@ -115,68 +159,17 @@ func parseBingHTML(r io.Reader) ([]contracts.MainResult, error) {
})
}
return results, nil
return contracts.SearchResponse{
Query: query,
NumberOfResults: data.WebPages.TotalEstimatedMatches,
Results: results,
Answers: []map[string]any{},
Corrections: []string{},
Infoboxes: []map[string]any{},
Suggestions: []string{},
UnresponsiveEngines: [][2]string{},
}, nil
}
// extractBingLink pulls the first anchor's target URL and its visible
// title text out of one result block. Both return values are empty when
// no well-formed anchor is found; title may be empty when the anchor has
// no closing tag.
func extractBingLink(block string) (title, href string) {
	// Locate the href attribute of the first anchor.
	hrefStart := strings.Index(block, `href="`)
	if hrefStart == -1 {
		return "", ""
	}
	hrefStart += 6
	hrefEnd := strings.Index(block[hrefStart:], `"`)
	if hrefEnd == -1 {
		return "", ""
	}
	href = block[hrefStart : hrefStart+hrefEnd]
	// Bing wraps result links in its own redirect URL; recover the real
	// target from the u= query parameter. Parsing the URL (rather than
	// substring-scanning for "&u=") also handles u= as the first query
	// parameter and stops at the next "&" instead of swallowing trailing
	// parameters into the decoded URL.
	if strings.Contains(href, "bing.com") && strings.Contains(href, "search?") {
		if parsed, err := url.Parse(href); err == nil {
			if real := parsed.Query().Get("u"); real != "" {
				href = real
			}
		}
	}
	// Title is between the anchor's closing ">" and "</a>".
	titleStart := strings.Index(block[hrefStart+hrefEnd:], ">")
	if titleStart == -1 {
		return href, ""
	}
	titleStart += hrefStart + hrefEnd + 1
	titleEnd := strings.Index(block[titleStart:], "</a>")
	if titleEnd == -1 {
		return href, ""
	}
	title = stripHTML(block[titleStart : titleStart+titleEnd])
	title = strings.TrimSpace(title)
	return title, href
}
// extractBingSnippet returns the snippet text for one result block.
// It prefers the paragraph inside the b_caption container and falls
// back to the first <p> element anywhere in the block; empty string
// when neither yields a complete paragraph.
func extractBingSnippet(block string) string {
	// firstParagraph returns the first complete <p>...</p> region of s,
	// stripped of markup, and whether one was found.
	firstParagraph := func(s string) (string, bool) {
		open := strings.Index(s, "<p")
		if open == -1 {
			return "", false
		}
		rest := s[open:]
		end := strings.Index(rest, "</p>")
		if end == -1 {
			return "", false
		}
		return stripHTML(rest[:end+4]), true
	}
	// Preferred location: the paragraph inside the caption container.
	if capIdx := strings.Index(block, `class="b_caption"`); capIdx != -1 {
		if snippet, ok := firstParagraph(block[capIdx:]); ok {
			return snippet
		}
	}
	// Fallback: any paragraph in the whole block.
	if snippet, ok := firstParagraph(block); ok {
		return snippet
	}
	return ""
}
// Blank references keep the strconv and encoding/json imports in use
// while the engine's parsing paths are in flux.
var (
	_ = strconv.Itoa
	_ = json.Unmarshal
)