package engines

import (
	"context"
	"fmt"
	"html"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strings"

	"github.com/metamorphosis-dev/kafka/internal/contracts"
)

// gsaUserAgents is a pool of iPhone User-Agent strings carrying a "GSA/..."
// token (the identifier sent by Google's own iOS search app). Requests using
// these are served the markup that the parsers in this file expect.
var gsaUserAgents = []string{
	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_5_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
}

// Regular expressions used by the Google parsers. They are compiled once at
// package scope; Go's RE2 engine has no lookahead, so result blocks are
// delimited by locating opening tags rather than with a `(?=...)` assertion.
var (
	// googleResultOpenRe matches the opening tag of a result container
	// (SearXNG XPath: .//div[contains(@class, "MjjYud")]).
	googleResultOpenRe = regexp.MustCompile(`<div[^>]*class="[^"]*MjjYud[^"]*"[^>]*>`)
	// googleResultURLRe captures the redirect target of a result link,
	// e.g. <a href="/url?q=https://example.com&sa=...">.
	googleResultURLRe = regexp.MustCompile(`<a[^>]+href="(/url\?q=[^"&]+)`)
	// googleTitleRe captures the text of the first element whose class list
	// contains "qrStP"; the element name is deliberately not constrained.
	googleTitleRe = regexp.MustCompile(`<[a-zA-Z][a-zA-Z0-9]*[^>]*class="[^"]*qrStP[^"]*"[^>]*>([^<]+)`)
	// googleLinkTitleRe is the title fallback: text of an <a role="link">.
	googleLinkTitleRe = regexp.MustCompile(`<a[^>]+role="link"[^>]*>([^<]+)<`)
	// googleSnippetRe captures snippet markup
	// (SearXNG XPath: .//div[contains(@data-sncf, "1")]).
	googleSnippetRe = regexp.MustCompile(`(?s)<div[^>]+data-sncf="1"[^>]*>(.*?)</div>`)
	// googleSuggestionRe captures suggestion link text
	// (SearXNG XPath: //div[contains(@class, "ouy7Mc")]//a).
	googleSuggestionRe = regexp.MustCompile(`(?s)<div[^>]*class="[^"]*ouy7Mc[^"]*"[^>]*>.*?<a[^>]*>([^<]+)</a>`)
	// googleTagStripRe matches any HTML tag.
	googleTagStripRe = regexp.MustCompile(`<[^>]*>`)
)

// gsaUA returns the User-Agent to send to Google.
func gsaUA() string {
	return gsaUserAgents[0] // deterministic for now; could rotate
}

// GoogleEngine queries Google web search by scraping the HTML result page.
type GoogleEngine struct {
	client *http.Client
}

// Name returns the engine identifier, "google".
func (e *GoogleEngine) Name() string { return "google" }

// Search runs req against Google and returns the parsed results.
//
// A blank query yields an empty response without any network traffic. When
// Google redirects to its "sorry"/CAPTCHA page the call still succeeds, but
// the response lists google under UnresponsiveEngines. Any other non-200
// status is returned as an error that includes the first 4 KiB of the body.
func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
	if strings.TrimSpace(req.Query) == "" {
		return contracts.SearchResponse{Query: req.Query}, nil
	}

	// Google pages are 10 results wide; clamp so page numbers below 1 do
	// not produce a negative offset.
	start := (req.Pageno - 1) * 10
	if start < 0 {
		start = 0
	}

	// Build URL like SearXNG does. hl/lr derive from caller-supplied input,
	// so they are escaped to keep them from injecting extra parameters.
	u := fmt.Sprintf(
		"https://www.google.com/search?q=%s&filter=0&start=%d&hl=%s&lr=%s&safe=%s",
		url.QueryEscape(req.Query),
		start,
		url.QueryEscape(googleHL(req.Language)),
		url.QueryEscape(googleUILanguage(req.Language)),
		googleSafeSearchLevel(req.Safesearch),
	)

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("google: building request: %w", err)
	}
	httpReq.Header.Set("User-Agent", gsaUA())
	httpReq.Header.Set("Accept", "*/*")
	httpReq.AddCookie(&http.Cookie{Name: "CONSENT", Value: "YES+"})

	resp, err := e.client.Do(httpReq)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("google: sending request: %w", err)
	}
	defer resp.Body.Close()

	// Check for Google block / CAPTCHA page.
	if detectGoogleSorry(resp) {
		return contracts.SearchResponse{
			Query:               req.Query,
			NumberOfResults:     0,
			Results:             nil,
			Answers:             []map[string]any{},
			Corrections:         []string{},
			Infoboxes:           []map[string]any{},
			Suggestions:         []string{},
			UnresponsiveEngines: [][2]string{{"google", "blocked by Google (CAPTCHA/sorry page)"}},
		}, nil
	}

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is only used to enrich the error message,
		// so a read failure here is deliberately ignored.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return contracts.SearchResponse{}, fmt.Errorf("google error: status=%d body=%q", resp.StatusCode, string(body))
	}

	body, err := io.ReadAll(io.LimitReader(resp.Body, 128*1024))
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("google: reading response: %w", err)
	}

	page := string(body)
	results := parseGoogleResults(page, req.Query)
	return contracts.SearchResponse{
		Query:               req.Query,
		NumberOfResults:     len(results),
		Results:             results,
		Answers:             []map[string]any{},
		Corrections:         []string{},
		Infoboxes:           []map[string]any{},
		Suggestions:         extractGoogleSuggestions(page),
		UnresponsiveEngines: [][2]string{},
	}, nil
}

// detectGoogleSorry reports whether resp is a Google block/CAPTCHA page,
// judged by the URL of the request that produced it (i.e. after redirects):
// host sorry.google.com or a path beginning with /sorry.
func detectGoogleSorry(resp *http.Response) bool {
	if resp == nil || resp.Request == nil || resp.Request.URL == nil {
		return false
	}
	final := resp.Request.URL
	return final.Host == "sorry.google.com" || strings.HasPrefix(final.Path, "/sorry")
}

// parseGoogleResults extracts search results from Google's HTML.
//
// Each result block runs from the end of a div.MjjYud opening tag to the
// start of the next one (or the end of body). Blocks without an http(s)
// /url?q= link are skipped. query is used as the title when none is found.
// Earlier blocks receive higher scores. The returned slice is nil when
// nothing was extracted.
func parseGoogleResults(body, query string) []contracts.MainResult {
	var results []contracts.MainResult

	opens := googleResultOpenRe.FindAllStringIndex(body, -1)
	for i, loc := range opens {
		end := len(body)
		if i+1 < len(opens) {
			end = opens[i+1][0]
		}
		block := body[loc[1]:end]

		urlMatch := googleResultURLRe.FindStringSubmatch(block)
		if urlMatch == nil {
			continue
		}

		// Remove /url?q= prefix and decode. The capture group already
		// stops at the first '&', so no further trimming is needed.
		actualURL := strings.TrimPrefix(urlMatch[1], "/url?q=")
		if decoded, err := url.QueryUnescape(actualURL); err == nil {
			actualURL = decoded
		}
		if !strings.HasPrefix(actualURL, "http") {
			continue
		}

		// Prefer the qrStP element, then an <a role="link">; keep the query
		// as title if neither yields visible text.
		title := query
		titleMatch := googleTitleRe.FindStringSubmatch(block)
		if titleMatch == nil {
			titleMatch = googleLinkTitleRe.FindStringSubmatch(block)
		}
		if titleMatch != nil {
			if t := stripTags(titleMatch[1]); t != "" {
				title = t
			}
		}

		resultURL := actualURL
		results = append(results, contracts.MainResult{
			Title:    title,
			URL:      &resultURL,
			Content:  extractGoogleSnippet(block),
			Engine:   "google",
			Score:    float64(len(opens) - i),
			Category: "general",
			Engines:  []string{"google"},
			Template: "default.html",
		})
	}
	return results
}

// extractGoogleSnippet returns the snippet text of a Google result block:
// the tag-stripped contents of every div with data-sncf="1", joined with
// single spaces. It returns "" when the block has no such div.
func extractGoogleSnippet(block string) string {
	var parts []string
	for _, m := range googleSnippetRe.FindAllStringSubmatch(block, -1) {
		if text := stripTags(m[1]); text != "" {
			parts = append(parts, text)
		}
	}
	return strings.Join(parts, " ")
}

// extractGoogleSuggestions returns the de-duplicated search suggestions found
// in body, in order of first appearance. The result is never nil, matching
// the empty-slice convention used for the other response fields.
func extractGoogleSuggestions(body string) []string {
	suggestions := []string{}
	seen := map[string]struct{}{}
	for _, m := range googleSuggestionRe.FindAllStringSubmatch(body, -1) {
		s := stripTags(m[1])
		if s == "" {
			continue
		}
		if _, dup := seen[s]; dup {
			continue
		}
		seen[s] = struct{}{}
		suggestions = append(suggestions, s)
	}
	return suggestions
}

// googleHL maps a SearXNG locale to Google's hl (host language) parameter.
// The locale is trimmed and lower-cased, e.g. "en-US" -> "en-us"; an empty
// or "auto" locale maps to "en".
func googleHL(lang string) string {
	lang = strings.ToLower(strings.TrimSpace(lang))
	if lang == "" || lang == "auto" {
		return "en"
	}
	return lang
}

// googleUILanguage maps a SearXNG language to Google's lr (language restrict)
// parameter using only the primary subtag, e.g. "en" -> "lang_en",
// "de-DE" -> "lang_de". An empty or "auto" language maps to "".
func googleUILanguage(lang string) string {
	primary, _, _ := strings.Cut(strings.TrimSpace(lang), "-")
	primary = strings.ToLower(strings.TrimSpace(primary))
	if primary == "" || primary == "auto" {
		return ""
	}
	return "lang_" + primary
}

// googleSafeSearchLevel maps safesearch (0-2) to Google's safe parameter.
// Values outside that range are treated as 1 ("medium").
func googleSafeSearchLevel(safesearch int) string {
	switch safesearch {
	case 0:
		return "off"
	case 2:
		return "high"
	default:
		return "medium"
	}
}

// stripTags removes HTML tags from s, decodes HTML entities in a single pass
// (so "&amp;quot;" becomes "&quot;", not `"`), turns non-breaking spaces into
// ordinary spaces, and trims surrounding whitespace.
func stripTags(s string) string {
	s = googleTagStripRe.ReplaceAllString(s, "")
	s = html.UnescapeString(s)
	s = strings.ReplaceAll(s, "\u00a0", " ")
	return strings.TrimSpace(s)
}