From 4be9cf2725ce5245076a128bdb4a873f263c82d9 Mon Sep 17 00:00:00 2001
From: Franz Kafka
Date: Sun, 22 Mar 2026 01:25:04 +0000
Subject: [PATCH] feat: add Google engine using GSA User-Agent scraping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SearXNG approach: use Google Search Appliance (GSA) User-Agent pool —
these are whitelisted enterprise identifiers Google trusts.

Key techniques:
- GSA User-Agent (iPhone OS + GSA/ version) instead of Chrome desktop
- CONSENT=YES+ cookie to bypass EU consent wall
- Parse /url?q= redirector URLs (unquote + strip &sa= params)
- div.MjjYud class for result containers (SearXNG selector)
- data-sncf divs for snippets
- detect sorry.google.com blocks
- Suggestions from ouy7Mc class cards
---
 internal/engines/factory.go |   3 +-
 internal/engines/google.go  | 271 ++++++++++++++++++++++++++++++++++++
 internal/engines/planner.go |   3 +-
 3 files changed, 275 insertions(+), 2 deletions(-)
 create mode 100644 internal/engines/google.go

diff --git a/internal/engines/factory.go b/internal/engines/factory.go
index 310a20e..937225f 100644
--- a/internal/engines/factory.go
+++ b/internal/engines/factory.go
@@ -31,6 +31,7 @@ func NewDefaultPortedEngines(client *http.Client) map[string]Engine {
 		"duckduckgo": &DuckDuckGoEngine{client: client},
 		"github":     &GitHubEngine{client: client},
 		"reddit":     &RedditEngine{client: client},
-		"bing":       &BingEngine{client: client},
+		"bing":       &BingEngine{client: client},
+		"google":     &GoogleEngine{client: client},
 	}
 }
diff --git a/internal/engines/google.go b/internal/engines/google.go
new file mode 100644
index 0000000..0371283
--- /dev/null
+++ b/internal/engines/google.go
@@ -0,0 +1,271 @@
+package engines
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"regexp"
+	"strings"
+
+	"github.com/metamorphosis-dev/kafka/internal/contracts"
+)
+
+// GSA User-Agent pool — these are Google Search Appliance identifiers
+// that Google trusts for enterprise search appliance traffic.
+var gsaUserAgents = []string{
+	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
+	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
+	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
+	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
+	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
+	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_5_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
+}
+
+// gsaUA returns the User-Agent string to send with Google requests.
+func gsaUA() string {
+	return gsaUserAgents[0] // deterministic for now; could rotate
+}
+
+// GoogleEngine scrapes Google web search result pages over plain HTTP.
+type GoogleEngine struct {
+	client *http.Client
+}
+
+func (e *GoogleEngine) Name() string { return "google" }
+
+// Search runs a Google web search for req.Query and parses the HTML page
+// into a SearchResponse. A blocked (sorry/CAPTCHA) page is reported via
+// UnresponsiveEngines rather than as an error.
+func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
+	if strings.TrimSpace(req.Query) == "" {
+		return contracts.SearchResponse{Query: req.Query}, nil
+	}
+
+	// Clamp the page number: an unset Pageno of 0 would otherwise yield a
+	// negative start offset, which Google rejects.
+	page := req.Pageno
+	if page < 1 {
+		page = 1
+	}
+	start := (page - 1) * 10
+	query := url.QueryEscape(req.Query)
+
+	// Build URL like SearXNG does.
+	u := fmt.Sprintf(
+		"https://www.google.com/search?q=%s&filter=0&start=%d&hl=%s&lr=%s&safe=%s",
+		query,
+		start,
+		googleHL(req.Language),
+		googleUILanguage(req.Language),
+		googleSafeSearchLevel(req.Safesearch),
+	)
+
+	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
+	if err != nil {
+		return contracts.SearchResponse{}, err
+	}
+	httpReq.Header.Set("User-Agent", gsaUA())
+	httpReq.Header.Set("Accept", "*/*")
+	// CONSENT=YES+ bypasses the EU consent interstitial.
+	httpReq.AddCookie(&http.Cookie{Name: "CONSENT", Value: "YES+"})
+
+	resp, err := e.client.Do(httpReq)
+	if err != nil {
+		return contracts.SearchResponse{}, err
+	}
+	defer resp.Body.Close()
+
+	// Check for Google block / CAPTCHA page.
+	if detectGoogleSorry(resp) {
+		return contracts.SearchResponse{
+			Query:               req.Query,
+			NumberOfResults:     0,
+			Results:             nil,
+			Answers:             []map[string]any{},
+			Corrections:         []string{},
+			Infoboxes:           []map[string]any{},
+			Suggestions:         []string{},
+			UnresponsiveEngines: [][2]string{{"google", "blocked by Google (CAPTCHA/sorry page)"}},
+		}, nil
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+		return contracts.SearchResponse{}, fmt.Errorf("google error: status=%d body=%q", resp.StatusCode, string(body))
+	}
+
+	// Google result pages routinely exceed 128 KiB; the old cap truncated
+	// the HTML and silently dropped results. 2 MiB is a safe upper bound.
+	body, err := io.ReadAll(io.LimitReader(resp.Body, 2*1024*1024))
+	if err != nil {
+		return contracts.SearchResponse{}, err
+	}
+
+	// Convert once and reuse for both result and suggestion parsing.
+	htmlBody := string(body)
+	results := parseGoogleResults(htmlBody, req.Query)
+	return contracts.SearchResponse{
+		Query:               req.Query,
+		NumberOfResults:     len(results),
+		Results:             results,
+		Answers:             []map[string]any{},
+		Corrections:         []string{},
+		Infoboxes:           []map[string]any{},
+		Suggestions:         extractGoogleSuggestions(htmlBody),
+		UnresponsiveEngines: [][2]string{},
+	}, nil
+}
+
+// detectGoogleSorry returns true if the response is a Google block/CAPTCHA page.
+func detectGoogleSorry(resp *http.Response) bool { + if resp.Request != nil { + if resp.Request.URL.Host == "sorry.google.com" || strings.HasPrefix(resp.Request.URL.Path, "/sorry") { + return true + } + } + return false +} + +// parseGoogleResults extracts search results from Google's HTML. +// Uses the same selectors as SearXNG: div.MjjYud for result containers. +func parseGoogleResults(body, query string) []contracts.MainResult { + var results []contracts.MainResult + + // SearXNG selector: .//div[contains(@class, "MjjYud")] + // Each result block contains a title link and snippet. + // We simulate the XPath matching with regex-based extraction. + + // Find all MjjYud div blocks. + mjjPattern := regexp.MustCompile(`]*class="[^"]*MjjYud[^"]*"[^>]*>(.*?)\s*(?=]*class="[^"]*MjjYud|$)`) + matches := mjjPattern.FindAllStringSubmatch(body, -1) + + for i, match := range matches { + if len(match) < 2 { + continue + } + block := match[1] + + // Extract title and URL from the result link. + // Pattern: TITLE + urlPattern := regexp.MustCompile(`]+href="(/url\?q=[^"&]+)`) + urlMatch := urlPattern.FindStringSubmatch(block) + if len(urlMatch) < 2 { + continue + } + rawURL := urlMatch[1] + // Remove /url?q= prefix and decode. + actualURL := strings.TrimPrefix(rawURL, "/url?q=") + if amp := strings.Index(actualURL, "&"); amp != -1 { + actualURL = actualURL[:amp] + } + if decoded, err := url.QueryUnescape(actualURL); err == nil { + actualURL = decoded + } + + if actualURL == "" || !strings.HasPrefix(actualURL, "http") { + continue + } + + // Extract title from the title tag. 
+ titlePattern := regexp.MustCompile(`]*class="[^"]*qrStP[^"]*"[^>]*>([^<]+)`) + titleMatch := titlePattern.FindStringSubmatch(block) + title := query + if len(titleMatch) >= 2 { + title = stripTags(titleMatch[1]) + } else { + // Fallback: extract visible text from an with data-title or role="link" + linkTitlePattern := regexp.MustCompile(`]+role="link"[^>]*>([^<]+)<`) + ltMatch := linkTitlePattern.FindStringSubmatch(block) + if len(ltMatch) >= 2 { + title = stripTags(ltMatch[1]) + } + } + + // Extract snippet from data-sncf divs (SearXNG's approach). + snippet := extractGoogleSnippet(block) + + urlPtr := actualURL + results = append(results, contracts.MainResult{ + Title: title, + URL: &urlPtr, + Content: snippet, + Engine: "google", + Score: float64(len(matches) - i), + Category: "general", + Engines: []string{"google"}, + Template: "default.html", + }) + } + + return results +} + +// extractGoogleSnippet extracts the snippet text from a Google result block. +func extractGoogleSnippet(block string) string { + // Google's snippets live in divs with data-sncf attribute. + // SearXNG looks for: .//div[contains(@data-sncf, "1")] + snippetPattern := regexp.MustCompile(`]+data-sncf="1"[^>]*>(.*?)`) + matches := snippetPattern.FindAllStringSubmatch(block, -1) + var parts []string + for _, m := range matches { + if len(m) < 2 { + continue + } + text := stripTags(m[1]) + if text != "" { + parts = append(parts, text) + } + } + return strings.Join(parts, " ") +} + +// extractGoogleSuggestions extracts search suggestions from Google result cards. 
+func extractGoogleSuggestions(body string) []string { + var suggestions []string + // SearXNG xpath: //div[contains(@class, "ouy7Mc")]//a + suggestionPattern := regexp.MustCompile(`]*class="[^"]*ouy7Mc[^"]*"[^>]*>.*?]*>([^<]+)`, regexp.DotAll) + matches := suggestionPattern.FindAllStringSubmatch(body, -1) + seen := map[string]bool{} + for _, m := range matches { + if len(m) < 2 { + continue + } + s := strings.TrimSpace(stripTags(m[1])) + if s != "" && !seen[s] { + seen[s] = true + suggestions = append(suggestions, s) + } + } + return suggestions +} + +// googleHL maps SearXNG locale to Google hl (host language) parameter. +// e.g. "en-US" -> "en-US" +func googleHL(lang string) string { + lang = strings.ToLower(strings.TrimSpace(lang)) + if lang == "" || lang == "auto" { + return "en" + } + return lang +} + +// googleUILanguage maps SearXNG language to Google lr (language restrict) parameter. +// e.g. "en" -> "lang_en", "de" -> "lang_de" +func googleUILanguage(lang string) string { + lang = strings.ToLower(strings.Split(lang, "-")[0]) + if lang == "" || lang == "auto" { + return "" + } + return "lang_" + lang +} + +// googleSafeSearchLevel maps safesearch (0-2) to Google's safe parameter. +func googleSafeSearchLevel(safesearch int) string { + switch safesearch { + case 0: + return "off" + case 1: + return "medium" + case 2: + return "high" + default: + return "medium" + } +} + +// stripTags removes HTML tags from a string. 
+func stripTags(s string) string {
+	stripper := regexp.MustCompile(`<[^>]*>`)
+	s = stripper.ReplaceAllString(s, "")
+	// Decode the handful of HTML entities Google emits in titles and
+	// snippets (done after tag stripping so decoded text is kept verbatim).
+	s = strings.ReplaceAll(s, "&amp;", "&")
+	s = strings.ReplaceAll(s, "&quot;", `"`)
+	s = strings.ReplaceAll(s, "&#39;", "'")
+	s = strings.ReplaceAll(s, "&nbsp;", " ")
+	return strings.TrimSpace(s)
+}
diff --git a/internal/engines/planner.go b/internal/engines/planner.go
index 543f253..08b0a27 100644
--- a/internal/engines/planner.go
+++ b/internal/engines/planner.go
@@ -91,6 +91,7 @@ func inferFromCategories(categories []string) []string {
 		set["qwant"] = true
 		set["duckduckgo"] = true
 		set["bing"] = true
+		set["google"] = true
 	case "science", "scientific publications":
 		set["arxiv"] = true
 		set["crossref"] = true
@@ -106,7 +107,7 @@ func inferFromCategories(categories []string) []string {
 		out = append(out, e)
 	}
 	// stable order
-	order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "arxiv": 5, "crossref": 6, "github": 7, "reddit": 8}
+	order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "google": 5, "arxiv": 6, "crossref": 7, "github": 8, "reddit": 9}
 	sortByOrder(out, order)
 	return out
 }