diff --git a/internal/engines/bing.go b/internal/engines/bing.go index c96a996..1d46a26 100644 --- a/internal/engines/bing.go +++ b/internal/engines/bing.go @@ -2,18 +2,23 @@ package engines import ( "context" + "encoding/json" + "encoding/xml" "errors" "fmt" "io" "net/http" "net/url" + "strconv" "strings" "github.com/ashie/gosearch/internal/contracts" ) -// BingEngine searches Bing via the public search endpoint. -// Uses Bing's web search results page and extracts results from the HTML. +// BingEngine searches Bing via the public Bing API. +// Uses Bing's RSS search feed as a scraping fallback when the API is unavailable. +// Note: Bing's HTML is heavily JS-dependent and blocks non-browser clients, +// so this engine falls back gracefully when results cannot be retrieved. type BingEngine struct { client *http.Client } @@ -29,7 +34,7 @@ func (e *BingEngine) Search(ctx context.Context, req contracts.SearchRequest) (c } endpoint := fmt.Sprintf( - "https://www.bing.com/search?q=%s&count=10&offset=%d", + "https://www.bing.com/search?q=%s&count=10&offset=%d&format=rss", url.QueryEscape(req.Query), (req.Pageno-1)*10, ) @@ -38,7 +43,7 @@ func (e *BingEngine) Search(ctx context.Context, req contracts.SearchRequest) (c if err != nil { return contracts.SearchResponse{}, err } - httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") + httpReq.Header.Set("User-Agent", "gosearch/0.1 (compatible; +https://git.ashisgreat.xyz/penal-colony/gosearch)") resp, err := e.client.Do(httpReq) if err != nil { @@ -51,13 +56,66 @@ func (e *BingEngine) Search(ctx context.Context, req contracts.SearchRequest) (c return contracts.SearchResponse{}, fmt.Errorf("bing upstream error: status=%d body=%q", resp.StatusCode, string(body)) } - results, err := parseBingHTML(resp.Body) - if err != nil { - return contracts.SearchResponse{}, err + contentType := resp.Header.Get("Content-Type") + if strings.Contains(contentType, "json") { + return parseBingJSON(resp.Body, req.Query) + } + + if strings.Contains(contentType, "xml") || strings.Contains(contentType, "rss") { + return parseBingRSS(resp.Body, req.Query) + } + + // If Bing returned HTML instead of RSS, it likely blocked us. + return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: 0, + Results: []contracts.MainResult{}, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + UnresponsiveEngines: [][2]string{{"bing", "blocked by bot detection"}}, + }, nil +} + +// parseBingRSS parses Bing's RSS search results. +func parseBingRSS(r io.Reader, query string) (contracts.SearchResponse, error) { + type RSS struct { + XMLName xml.Name `xml:"rss"` + Channel struct { + Items []struct { + Title string `xml:"title"` + Link string `xml:"link"` + Descrip string `xml:"description"` + } `xml:"item"` + } `xml:"channel"` + } + + var rss RSS + if err := xml.NewDecoder(r).Decode(&rss); err != nil { + return contracts.SearchResponse{}, fmt.Errorf("bing RSS parse error: %w", err) + } + + results := make([]contracts.MainResult, 0, len(rss.Channel.Items)) + for _, item := range rss.Channel.Items { + if item.Link == "" { + continue + } + linkPtr := item.Link + results = append(results, contracts.MainResult{ + Template: "default.html", + Title: item.Title, + Content: stripHTML(item.Descrip), + URL: &linkPtr, + Engine: "bing", + Score: 0, + Category: "general", + Engines: []string{"bing"}, + }) } return contracts.SearchResponse{ - Query: req.Query, + Query: query, NumberOfResults: len(results), Results: results, Answers: []map[string]any{}, @@ -68,46 +126,32 @@ func (e *BingEngine) Search(ctx context.Context, req contracts.SearchRequest) (c }, nil } -// parseBingHTML extracts search results from Bing's HTML response. -// Bing results are in
or
or
- snippet := extractBingSnippet(block) + if err := json.NewDecoder(r).Decode(&data); err != nil { + return contracts.SearchResponse{}, fmt.Errorf("bing JSON parse error: %w", err) + } + results := make([]contracts.MainResult, 0, len(data.WebPages.Value)) + for _, item := range data.WebPages.Value { + linkPtr := item.URL results = append(results, contracts.MainResult{ Template: "default.html", - Title: title, - Content: snippet, - URL: &href, + Title: item.Name, + Content: item.Snippet, + URL: &linkPtr, Engine: "bing", Score: 0, Category: "general", @@ -115,68 +159,17 @@ func parseBingHTML(r io.Reader) ([]contracts.MainResult, error) { }) } - return results, nil + return contracts.SearchResponse{ + Query: query, + NumberOfResults: data.WebPages.TotalEstimatedMatches, + Results: results, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + UnresponsiveEngines: [][2]string{}, + }, nil } -func extractBingLink(block string) (title, href string) { - // Find - hrefStart := strings.Index(block, `href="`) - if hrefStart == -1 { - return "", "" - } - hrefStart += 6 - hrefEnd := strings.Index(block[hrefStart:], `"`) - if hrefEnd == -1 { - return "", "" - } - href = block[hrefStart : hrefStart+hrefEnd] - - // Skip Bing's own tracking URLs. - if strings.Contains(href, "bing.com") && strings.Contains(href, "search?") { - // Try to extract the real URL from u= parameter. - if uIdx := strings.Index(href, "&u="); uIdx != -1 { - encodedURL := href[uIdx+3:] - if decoded, err := url.QueryUnescape(encodedURL); err == nil { - href = decoded - } - } - } - - // Title is between > and after the href. - titleStart := strings.Index(block[hrefStart+hrefEnd:], ">") - if titleStart == -1 { - return href, "" - } - titleStart += hrefStart + hrefEnd + 1 - titleEnd := strings.Index(block[titleStart:], "") - if titleEnd == -1 { - return href, "" - } - title = stripHTML(block[titleStart : titleStart+titleEnd]) - title = strings.TrimSpace(title) - - return title, href -} - -func extractBingSnippet(block string) string { - // Try
first. - if idx := strings.Index(block, `class="b_caption"`); idx != -1 { - caption := block[idx:] - if pStart := strings.Index(caption, "
"); pEnd != -1 { - return stripHTML(snippet[:pEnd+4]) - } - } - } - - // Fallback: any
tag. - if pStart := strings.Index(block, "
"); pEnd != -1 { - return stripHTML(snippet[:pEnd+4]) - } - } - - return "" -} +var _ = strconv.Itoa +var _ = json.Unmarshal diff --git a/internal/engines/bing_test.go b/internal/engines/bing_test.go index abd4619..e5a043d 100644 --- a/internal/engines/bing_test.go +++ b/internal/engines/bing_test.go @@ -36,34 +36,6 @@ func TestBingEngine_Uninitialized(t *testing.T) { } } -func TestParseBingHTML(t *testing.T) { - html := `
This is a test snippet from Bing.
Another snippet