// kafka/internal/engines/bing.go

package engines

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"

	"github.com/ashie/gosearch/internal/contracts"
)

// BingEngine is a search engine backed by Bing's public web search page.
// It requests the HTML results page and scrapes the results out of the markup.
type BingEngine struct {
	// client performs the outbound HTTP requests; Search reports an error
	// when it is nil.
	client *http.Client
}

// Name returns the identifier of this engine, "bing".
func (e *BingEngine) Name() string {
	return "bing"
}

// Search runs req.Query against Bing's HTML results page and returns the
// results scraped from it.
//
// A blank (whitespace-only) query short-circuits to a response that only
// echoes the query; no request is sent. A nil receiver or nil HTTP client
// yields an error. Page numbers below 1 are treated as page 1.
//
// Request construction failures, transport failures, non-200 statuses (the
// error message carries up to 4 KiB of the response body) and parse failures
// are returned as errors together with a zero SearchResponse.
func (e *BingEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
	if strings.TrimSpace(req.Query) == "" {
		return contracts.SearchResponse{Query: req.Query}, nil
	}
	if e == nil || e.client == nil {
		return contracts.SearchResponse{}, errors.New("bing engine not initialized")
	}

	// Pageno is 1-based; an unset (zero) or negative value would otherwise
	// put a negative offset into the request URL.
	page := req.Pageno
	if page < 1 {
		page = 1
	}

	// NOTE(review): Bing's HTML results page is commonly paginated with
	// `first=` rather than `offset=` — confirm that `offset` is honored.
	endpoint := fmt.Sprintf(
		"https://www.bing.com/search?q=%s&count=10&offset=%d",
		url.QueryEscape(req.Query),
		(page-1)*10,
	)

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("bing: create request: %w", err)
	}
	// Present a desktop browser User-Agent on the request.
	httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

	resp, err := e.client.Do(httpReq)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("bing: send request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body excerpt only enriches the error message, so a
		// failed or partial read is deliberately ignored.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return contracts.SearchResponse{}, fmt.Errorf("bing upstream error: status=%d body=%q", resp.StatusCode, string(body))
	}

	results, err := parseBingHTML(resp.Body)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("bing: parse response: %w", err)
	}

	// The collection fields are set to empty, non-nil values rather than left
	// nil.
	return contracts.SearchResponse{
		Query:               req.Query,
		NumberOfResults:     len(results),
		Results:             results,
		Answers:             []map[string]any{},
		Corrections:         []string{},
		Infoboxes:           []map[string]any{},
		Suggestions:         []string{},
		UnresponsiveEngines: [][2]string{},
	}, nil
}

// parseBingHTML extracts search results from Bing's HTML response.
// Bing results are in <li class="b_algo"> elements containing <h2><a href="...">Title</a></h2>
// and <p> or <div class="b_caption"> for snippets.
//
// The document is scanned textually, not parsed as HTML: it is split on the
// literal `class="b_algo"` marker and every fragment after the first is
// treated as one candidate result. Fragments without both a title and a URL
// are skipped. The returned slice is never nil on success; an error is
// returned only when reading r fails.
func parseBingHTML(r io.Reader) ([]contracts.MainResult, error) {
	body, err := io.ReadAll(r)
	if err != nil {
		return nil, fmt.Errorf("read bing response body: %w", err)
	}

	// The first piece is whatever precedes the first result marker; each later
	// piece starts right after a result container's class attribute.
	fragments := strings.Split(string(body), `class="b_algo"`)[1:]
	results := make([]contracts.MainResult, 0, len(fragments))

	// strings.Split already ends every fragment at the next result marker, so
	// only these other markers can still be inside a fragment.
	terminators := []string{`id="b_results"`, `id="b_footer"`}

	for _, block := range fragments {
		// Cut the fragment at the earliest terminator, if any.
		end := len(block)
		for _, terminator := range terminators {
			if idx := strings.Index(block, terminator); idx > 0 && idx < end {
				end = idx
			}
		}
		block = block[:end]

		// Extract title and URL from <h2><a href="...">
		title, href := extractBingLink(block)
		if title == "" || href == "" {
			continue
		}

		results = append(results, contracts.MainResult{
			Template: "default.html",
			Title:    title,
			// Snippet comes from <p> or <div class="b_caption"><p>.
			Content: extractBingSnippet(block),
			// href is declared inside the loop body, so every result points
			// at its own string.
			URL:      &href,
			Engine:   "bing",
			Score:    0,
			Category: "general",
			Engines:  []string{"bing"},
		})
	}

	return results, nil
}

// extractBingLink pulls the result link out of a single result fragment.
//
// It returns the link text (passed through stripHTML, then trimmed of
// surrounding whitespace) as title and the link target as href. The search is
// anchored at the first <h2 when the fragment has one, so anchors that appear
// before the heading are not mistaken for the result link. Both results are
// empty when no complete `href="..." ... > ... </a>` sequence is found.
//
// When href looks like a Bing search redirect carrying a `&u=` parameter, the
// decoded value of that parameter is returned instead of the redirect itself.
func extractBingLink(block string) (title, href string) {
	if h2 := strings.Index(block, "<h2"); h2 != -1 {
		block = block[h2:]
	}

	// Find <a href="...">
	_, rest, ok := strings.Cut(block, `href="`)
	if !ok {
		return "", ""
	}
	href, rest, ok = strings.Cut(rest, `"`)
	if !ok {
		// Unterminated attribute value.
		return "", ""
	}

	// Skip Bing's own tracking URLs.
	if strings.Contains(href, "bing.com") && strings.Contains(href, "search?") {
		// Try to extract the real URL from u= parameter.
		if _, after, found := strings.Cut(href, "&u="); found {
			// Stop at the next parameter so trailing query arguments of the
			// redirect do not end up glued onto the decoded target.
			encoded, _, _ := strings.Cut(after, "&")
			if decoded, err := url.QueryUnescape(encoded); err == nil {
				href = decoded
			}
		}
	}

	// Title is between > and </a> after the href.
	_, rest, ok = strings.Cut(rest, ">")
	if !ok {
		return "", ""
	}
	inner, _, ok := strings.Cut(rest, "</a>")
	if !ok {
		return "", ""
	}
	title = strings.TrimSpace(stripHTML(inner))

	return title, href
}

// extractBingSnippet returns the snippet text of a result fragment.
//
// It looks for the first "<p" ... "</p>" span after a `class="b_caption"`
// marker and, failing that, for the first such span anywhere in the fragment.
// The span, tags included, is passed through stripHTML. An empty string is
// returned when no complete span exists.
func extractBingSnippet(block string) string {
	// firstParagraph returns the first "<p" ... "</p>" span of s, inclusive
	// of the closing tag.
	firstParagraph := func(s string) (string, bool) {
		open := strings.Index(s, "<p")
		if open < 0 {
			return "", false
		}
		s = s[open:]
		closeAt := strings.Index(s, "</p>")
		if closeAt < 0 {
			return "", false
		}
		return s[:closeAt+len("</p>")], true
	}

	// Preferred location: inside the caption container.
	if at := strings.Index(block, `class="b_caption"`); at >= 0 {
		if span, ok := firstParagraph(block[at:]); ok {
			return stripHTML(span)
		}
	}

	// Otherwise accept any paragraph in the fragment.
	if span, ok := firstParagraph(block); ok {
		return stripHTML(span)
	}

	return ""
}