package engines

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strings"

	"github.com/metamorphosis-dev/kafka/internal/contracts"
)

// gsaUserAgents is the pool of User-Agent strings sent to Google. Every entry
// is an iPhone Safari User-Agent carrying a "GSA/<version>" token.
//
// NOTE(review): an earlier comment expanded GSA as "Google Search Appliance"
// and stated that Google trusts these identifiers; neither claim can be
// verified from this file — confirm before relying on it.
var gsaUserAgents = []string{
	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_5_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
}

// gsaUA returns the User-Agent to send with a request. It currently always
// returns the first pool entry so requests are deterministic; rotation across
// the pool could be added here.
func gsaUA() string {
	return gsaUserAgents[0]
}

// GoogleEngine queries Google web search over HTTP and scrapes the HTML
// response. The client field must be set before Search is called with a
// non-empty query.
type GoogleEngine struct {
	client *http.Client
}

// Name returns the engine identifier, "google".
func (e *GoogleEngine) Name() string { return "google" }

// Search runs req.Query against Google and returns the parsed results.
//
// Behaviour:
//   - A blank query (empty or only whitespace) returns an empty response and
//     no error, without touching the network.
//   - A nil receiver or nil HTTP client returns an error instead of panicking.
//   - req.Pageno values below 1 are treated as page 1, so the "start" offset
//     sent to Google is never negative.
//   - If the final response URL is a Google "sorry" page, the response lists
//     google in UnresponsiveEngines and the error is nil.
//   - Any other non-200 status is returned as an error that includes up to
//     4 KiB of the response body.
//   - At most 128 KiB of a successful response body is read and parsed.
func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
	const (
		resultsPerPage = 10
		maxErrorBody   = 4096
		maxResultsBody = 128 * 1024
	)

	if strings.TrimSpace(req.Query) == "" {
		return contracts.SearchResponse{Query: req.Query}, nil
	}
	if e == nil || e.client == nil {
		return contracts.SearchResponse{}, fmt.Errorf("google engine not initialized")
	}

	// Clamp the page number: a zero-value or negative Pageno would otherwise
	// produce a negative start offset in the query string.
	page := req.Pageno
	if page < 1 {
		page = 1
	}
	start := (page - 1) * resultsPerPage
	query := url.QueryEscape(req.Query)

	// Build URL like SearXNG does.
	u := fmt.Sprintf(
		"https://www.google.com/search?q=%s&filter=0&start=%d&hl=%s&lr=%s&safe=%s",
		query,
		start,
		googleHL(req.Language),
		googleUILanguage(req.Language),
		googleSafeSearchLevel(req.Safesearch),
	)

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("google: building request: %w", err)
	}
	httpReq.Header.Set("User-Agent", gsaUA())
	httpReq.Header.Set("Accept", "*/*")
	httpReq.AddCookie(&http.Cookie{Name: "CONSENT", Value: "YES+"})

	resp, err := e.client.Do(httpReq)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("google: sending request: %w", err)
	}
	defer resp.Body.Close()

	// Check for Google block / CAPTCHA page. This is reported as an
	// unresponsive engine rather than an error.
	if detectGoogleSorry(resp) {
		return contracts.SearchResponse{
			Query:               req.Query,
			NumberOfResults:     0,
			Results:             nil,
			Answers:             []map[string]any{},
			Corrections:         []string{},
			Infoboxes:           []map[string]any{},
			Suggestions:         []string{},
			UnresponsiveEngines: [][2]string{{"google", "blocked by Google (CAPTCHA/sorry page)"}},
		}, nil
	}

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is only used to enrich the error message, so
		// a failed or partial read is deliberately ignored.
		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrorBody))
		return contracts.SearchResponse{}, fmt.Errorf("google error: status=%d body=%q", resp.StatusCode, string(snippet))
	}

	body, err := io.ReadAll(io.LimitReader(resp.Body, maxResultsBody))
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("google: reading response body: %w", err)
	}

	// Convert once; both the result parser and the suggestion extractor take
	// the page as a string.
	html := string(body)
	results := parseGoogleResults(html, req.Query)

	return contracts.SearchResponse{
		Query:               req.Query,
		NumberOfResults:     len(results),
		Results:             results,
		Answers:             []map[string]any{},
		Corrections:         []string{},
		Infoboxes:           []map[string]any{},
		Suggestions:         extractGoogleSuggestions(html),
		UnresponsiveEngines: [][2]string{},
	}, nil
}

// detectGoogleSorry returns true if the response is a Google block/CAPTCHA page.
//
// It inspects the URL of the request that produced resp: a host of
// "sorry.google.com" or a path beginning with "/sorry" counts as blocked.
// A nil response, request, or URL is reported as not blocked.
func detectGoogleSorry(resp *http.Response) bool {
	if resp == nil || resp.Request == nil || resp.Request.URL == nil {
		return false
	}
	final := resp.Request.URL
	return final.Host == "sorry.google.com" || strings.HasPrefix(final.Path, "/sorry")
}

// parseGoogleResults extracts search results from Google's HTML.
// Uses the same selectors as SearXNG: div.MjjYud for result containers. func parseGoogleResults(body, query string) []contracts.MainResult { var results []contracts.MainResult // SearXNG selector: .//div[contains(@class, "MjjYud")] // Each result block contains a title link and snippet. // We simulate the XPath matching with regex-based extraction. // Find all MjjYud div blocks. mjjPattern := regexp.MustCompile(`