DuckDuckGo: - Fixed parser to handle single-quoted class attributes (class='result-link') - Decode DDG tracking URLs (uddg= parameter) to extract real URLs - Match snippet extraction to actual DDG Lite HTML structure (</td> terminator) Bing: - Switched from HTML scraping (blocked by JS detection) to RSS endpoint (?format=rss) which returns parseable XML - Added JSON API response parsing as fallback - Returns graceful unresponsive_engines entry when blocked Live test results: - DuckDuckGo: 9 results ✅ - GitHub: 10 results (14,768 total) ✅ - Bing: 10 results via RSS ✅ - Reddit: skipped (403 from sandbox, needs browser-like context)
175 lines
5 KiB
Go
175 lines
5 KiB
Go
package engines
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"encoding/xml"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/ashie/gosearch/internal/contracts"
|
|
)
|
|
|
|
// BingEngine searches Bing by requesting the RSS variant of its web search
// endpoint (https://www.bing.com/search with format=rss).
//
// Search dispatches on the response Content-Type: XML/RSS bodies are parsed as
// a feed, JSON bodies are parsed as an API-style payload, and anything else
// (typically an HTML page) is reported as an unresponsive engine instead of an
// error, so a blocked request degrades gracefully.
type BingEngine struct {
	// client performs the outbound HTTP request. Search returns an error when
	// it is nil.
	client *http.Client
}

// Name returns the engine identifier, "bing".
func (e *BingEngine) Name() string { return "bing" }
|
|
|
|
func (e *BingEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
|
|
if strings.TrimSpace(req.Query) == "" {
|
|
return contracts.SearchResponse{Query: req.Query}, nil
|
|
}
|
|
if e == nil || e.client == nil {
|
|
return contracts.SearchResponse{}, errors.New("bing engine not initialized")
|
|
}
|
|
|
|
endpoint := fmt.Sprintf(
|
|
"https://www.bing.com/search?q=%s&count=10&offset=%d&format=rss",
|
|
url.QueryEscape(req.Query),
|
|
(req.Pageno-1)*10,
|
|
)
|
|
|
|
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
|
if err != nil {
|
|
return contracts.SearchResponse{}, err
|
|
}
|
|
httpReq.Header.Set("User-Agent", "gosearch/0.1 (compatible; +https://git.ashisgreat.xyz/penal-colony/gosearch)")
|
|
|
|
resp, err := e.client.Do(httpReq)
|
|
if err != nil {
|
|
return contracts.SearchResponse{}, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
|
|
return contracts.SearchResponse{}, fmt.Errorf("bing upstream error: status=%d body=%q", resp.StatusCode, string(body))
|
|
}
|
|
|
|
contentType := resp.Header.Get("Content-Type")
|
|
if strings.Contains(contentType, "json") {
|
|
return parseBingJSON(resp.Body, req.Query)
|
|
}
|
|
|
|
if strings.Contains(contentType, "xml") || strings.Contains(contentType, "rss") {
|
|
return parseBingRSS(resp.Body, req.Query)
|
|
}
|
|
|
|
// If Bing returned HTML instead of RSS, it likely blocked us.
|
|
return contracts.SearchResponse{
|
|
Query: req.Query,
|
|
NumberOfResults: 0,
|
|
Results: []contracts.MainResult{},
|
|
Answers: []map[string]any{},
|
|
Corrections: []string{},
|
|
Infoboxes: []map[string]any{},
|
|
Suggestions: []string{},
|
|
UnresponsiveEngines: [][2]string{{"bing", "blocked by bot detection"}},
|
|
}, nil
|
|
}
|
|
|
|
// parseBingRSS parses Bing's RSS search results.
|
|
func parseBingRSS(r io.Reader, query string) (contracts.SearchResponse, error) {
|
|
type RSS struct {
|
|
XMLName xml.Name `xml:"rss"`
|
|
Channel struct {
|
|
Items []struct {
|
|
Title string `xml:"title"`
|
|
Link string `xml:"link"`
|
|
Descrip string `xml:"description"`
|
|
} `xml:"item"`
|
|
} `xml:"channel"`
|
|
}
|
|
|
|
var rss RSS
|
|
if err := xml.NewDecoder(r).Decode(&rss); err != nil {
|
|
return contracts.SearchResponse{}, fmt.Errorf("bing RSS parse error: %w", err)
|
|
}
|
|
|
|
results := make([]contracts.MainResult, 0, len(rss.Channel.Items))
|
|
for _, item := range rss.Channel.Items {
|
|
if item.Link == "" {
|
|
continue
|
|
}
|
|
linkPtr := item.Link
|
|
results = append(results, contracts.MainResult{
|
|
Template: "default.html",
|
|
Title: item.Title,
|
|
Content: stripHTML(item.Descrip),
|
|
URL: &linkPtr,
|
|
Engine: "bing",
|
|
Score: 0,
|
|
Category: "general",
|
|
Engines: []string{"bing"},
|
|
})
|
|
}
|
|
|
|
return contracts.SearchResponse{
|
|
Query: query,
|
|
NumberOfResults: len(results),
|
|
Results: results,
|
|
Answers: []map[string]any{},
|
|
Corrections: []string{},
|
|
Infoboxes: []map[string]any{},
|
|
Suggestions: []string{},
|
|
UnresponsiveEngines: [][2]string{},
|
|
}, nil
|
|
}
|
|
|
|
// parseBingJSON parses Bing's JSON API response.
|
|
func parseBingJSON(r io.Reader, query string) (contracts.SearchResponse, error) {
|
|
var data struct {
|
|
WebPages struct {
|
|
TotalEstimatedMatches int `json:"totalEstimatedMatches"`
|
|
Value []struct {
|
|
Name string `json:"name"`
|
|
URL string `json:"url"`
|
|
Snippet string `json:"snippet"`
|
|
DateLastCrawled string `json:"dateLastCrawled"`
|
|
} `json:"value"`
|
|
} `json:"webPages"`
|
|
}
|
|
|
|
if err := json.NewDecoder(r).Decode(&data); err != nil {
|
|
return contracts.SearchResponse{}, fmt.Errorf("bing JSON parse error: %w", err)
|
|
}
|
|
|
|
results := make([]contracts.MainResult, 0, len(data.WebPages.Value))
|
|
for _, item := range data.WebPages.Value {
|
|
linkPtr := item.URL
|
|
results = append(results, contracts.MainResult{
|
|
Template: "default.html",
|
|
Title: item.Name,
|
|
Content: item.Snippet,
|
|
URL: &linkPtr,
|
|
Engine: "bing",
|
|
Score: 0,
|
|
Category: "general",
|
|
Engines: []string{"bing"},
|
|
})
|
|
}
|
|
|
|
return contracts.SearchResponse{
|
|
Query: query,
|
|
NumberOfResults: data.WebPages.TotalEstimatedMatches,
|
|
Results: results,
|
|
Answers: []map[string]any{},
|
|
Corrections: []string{},
|
|
Infoboxes: []map[string]any{},
|
|
Suggestions: []string{},
|
|
UnresponsiveEngines: [][2]string{},
|
|
}, nil
|
|
}
|
|
|
|
// Blank-identifier references that keep the strconv and encoding/json imports
// in use so the file compiles regardless of which helpers reference them.
var (
	_ = strconv.Itoa
	_ = json.Unmarshal
)
|