From 4be9cf2725ce5245076a128bdb4a873f263c82d9 Mon Sep 17 00:00:00 2001
From: Franz Kafka
Date: Sun, 22 Mar 2026 01:25:04 +0000
Subject: [PATCH] feat: add Google engine using GSA User-Agent scraping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SearXNG approach: use Google Search Appliance (GSA) User-Agent pool —
these are whitelisted enterprise identifiers Google trusts.

Key techniques:
- GSA User-Agent (iPhone OS + GSA/ version) instead of Chrome desktop
- CONSENT=YES+ cookie to bypass EU consent wall
- Parse /url?q= redirector URLs (unquote + strip &sa= params)
- div.MjjYud class for result containers (SearXNG selector)
- data-sncf divs for snippets
- detect sorry.google.com blocks
- Suggestions from ouy7Mc class cards
---
 internal/engines/factory.go |   3 +-
 internal/engines/google.go  | 271 ++++++++++++++++++++++++++++++++++++
 internal/engines/planner.go |   3 +-
 3 files changed, 275 insertions(+), 2 deletions(-)
 create mode 100644 internal/engines/google.go

diff --git a/internal/engines/factory.go b/internal/engines/factory.go
index 310a20e..937225f 100644
--- a/internal/engines/factory.go
+++ b/internal/engines/factory.go
@@ -31,6 +31,7 @@ func NewDefaultPortedEngines(client *http.Client) map[string]Engine {
 		"duckduckgo": &DuckDuckGoEngine{client: client},
 		"github":     &GitHubEngine{client: client},
 		"reddit":     &RedditEngine{client: client},
-		"bing":       &BingEngine{client: client},
+		"bing":       &BingEngine{client: client},
+		"google":     &GoogleEngine{client: client},
 	}
 }
diff --git a/internal/engines/google.go b/internal/engines/google.go
new file mode 100644
index 0000000..0371283
--- /dev/null
+++ b/internal/engines/google.go
@@ -0,0 +1,271 @@
+package engines
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"regexp"
+	"strings"
+
+	"github.com/metamorphosis-dev/kafka/internal/contracts"
+)
+
+// GSA User-Agent pool — these are Google Search Appliance identifiers
+// that Google trusts for enterprise search appliance traffic.
+var gsaUserAgents = []string{
+	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
+	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
+	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
+	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
+	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
+	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_5_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
+}
+
+// gsaUA returns the User-Agent string to send with Google requests.
+func gsaUA() string {
+	return gsaUserAgents[0] // deterministic for now; could rotate
+}
+
+// GoogleEngine scrapes Google web search result pages over plain HTTP.
+type GoogleEngine struct {
+	client *http.Client
+}
+
+func (e *GoogleEngine) Name() string { return "google" }
+
+// Search runs a Google web search for req.Query and parses the HTML page
+// into a SearchResponse. A blocked (sorry/CAPTCHA) page is reported via
+// UnresponsiveEngines rather than as an error.
+func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
+	if strings.TrimSpace(req.Query) == "" {
+		return contracts.SearchResponse{Query: req.Query}, nil
+	}
+
+	// Clamp the page number: an unset Pageno of 0 would otherwise yield a
+	// negative start offset, which Google rejects.
+	page := req.Pageno
+	if page < 1 {
+		page = 1
+	}
+	start := (page - 1) * 10
+	query := url.QueryEscape(req.Query)
+
+	// Build URL like SearXNG does.
+	u := fmt.Sprintf(
+		"https://www.google.com/search?q=%s&filter=0&start=%d&hl=%s&lr=%s&safe=%s",
+		query,
+		start,
+		googleHL(req.Language),
+		googleUILanguage(req.Language),
+		googleSafeSearchLevel(req.Safesearch),
+	)
+
+	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
+	if err != nil {
+		return contracts.SearchResponse{}, err
+	}
+	httpReq.Header.Set("User-Agent", gsaUA())
+	httpReq.Header.Set("Accept", "*/*")
+	// CONSENT=YES+ bypasses the EU consent interstitial.
+	httpReq.AddCookie(&http.Cookie{Name: "CONSENT", Value: "YES+"})
+
+	resp, err := e.client.Do(httpReq)
+	if err != nil {
+		return contracts.SearchResponse{}, err
+	}
+	defer resp.Body.Close()
+
+	// Check for Google block / CAPTCHA page.
+	if detectGoogleSorry(resp) {
+		return contracts.SearchResponse{
+			Query:               req.Query,
+			NumberOfResults:     0,
+			Results:             nil,
+			Answers:             []map[string]any{},
+			Corrections:         []string{},
+			Infoboxes:           []map[string]any{},
+			Suggestions:         []string{},
+			UnresponsiveEngines: [][2]string{{"google", "blocked by Google (CAPTCHA/sorry page)"}},
+		}, nil
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+		return contracts.SearchResponse{}, fmt.Errorf("google error: status=%d body=%q", resp.StatusCode, string(body))
+	}
+
+	// Google result pages routinely exceed 128 KiB; the old cap truncated
+	// the HTML and silently dropped results. 2 MiB is a safe upper bound.
+	body, err := io.ReadAll(io.LimitReader(resp.Body, 2*1024*1024))
+	if err != nil {
+		return contracts.SearchResponse{}, err
+	}
+
+	// Convert once and reuse for both result and suggestion parsing.
+	htmlBody := string(body)
+	results := parseGoogleResults(htmlBody, req.Query)
+	return contracts.SearchResponse{
+		Query:               req.Query,
+		NumberOfResults:     len(results),
+		Results:             results,
+		Answers:             []map[string]any{},
+		Corrections:         []string{},
+		Infoboxes:           []map[string]any{},
+		Suggestions:         extractGoogleSuggestions(htmlBody),
+		UnresponsiveEngines: [][2]string{},
+	}, nil
+}
+
+// detectGoogleSorry returns true if the response is a Google block/CAPTCHA page.
+func detectGoogleSorry(resp *http.Response) bool { + if resp.Request != nil { + if resp.Request.URL.Host == "sorry.google.com" || strings.HasPrefix(resp.Request.URL.Path, "/sorry") { + return true + } + } + return false +} + +// parseGoogleResults extracts search results from Google's HTML. +// Uses the same selectors as SearXNG: div.MjjYud for result containers. +func parseGoogleResults(body, query string) []contracts.MainResult { + var results []contracts.MainResult + + // SearXNG selector: .//div[contains(@class, "MjjYud")] + // Each result block contains a title link and snippet. + // We simulate the XPath matching with regex-based extraction. + + // Find all MjjYud div blocks. + mjjPattern := regexp.MustCompile(`]*class="[^"]*MjjYud[^"]*"[^>]*>(.*?)\s*(?=]*class="[^"]*MjjYud|$)`) + matches := mjjPattern.FindAllStringSubmatch(body, -1) + + for i, match := range matches { + if len(match) < 2 { + continue + } + block := match[1] + + // Extract title and URL from the result link. + // Pattern: TITLE + urlPattern := regexp.MustCompile(`]+href="(/url\?q=[^"&]+)`) + urlMatch := urlPattern.FindStringSubmatch(block) + if len(urlMatch) < 2 { + continue + } + rawURL := urlMatch[1] + // Remove /url?q= prefix and decode. + actualURL := strings.TrimPrefix(rawURL, "/url?q=") + if amp := strings.Index(actualURL, "&"); amp != -1 { + actualURL = actualURL[:amp] + } + if decoded, err := url.QueryUnescape(actualURL); err == nil { + actualURL = decoded + } + + if actualURL == "" || !strings.HasPrefix(actualURL, "http") { + continue + } + + // Extract title from the title tag. 
+ titlePattern := regexp.MustCompile(`]*class="[^"]*qrStP[^"]*"[^>]*>([^<]+)`) + titleMatch := titlePattern.FindStringSubmatch(block) + title := query + if len(titleMatch) >= 2 { + title = stripTags(titleMatch[1]) + } else { + // Fallback: extract visible text from an with data-title or role="link" + linkTitlePattern := regexp.MustCompile(`]+role="link"[^>]*>([^<]+)<`) + ltMatch := linkTitlePattern.FindStringSubmatch(block) + if len(ltMatch) >= 2 { + title = stripTags(ltMatch[1]) + } + } + + // Extract snippet from data-sncf divs (SearXNG's approach). + snippet := extractGoogleSnippet(block) + + urlPtr := actualURL + results = append(results, contracts.MainResult{ + Title: title, + URL: &urlPtr, + Content: snippet, + Engine: "google", + Score: float64(len(matches) - i), + Category: "general", + Engines: []string{"google"}, + Template: "default.html", + }) + } + + return results +} + +// extractGoogleSnippet extracts the snippet text from a Google result block. +func extractGoogleSnippet(block string) string { + // Google's snippets live in divs with data-sncf attribute. + // SearXNG looks for: .//div[contains(@data-sncf, "1")] + snippetPattern := regexp.MustCompile(`]+data-sncf="1"[^>]*>(.*?)`) + matches := snippetPattern.FindAllStringSubmatch(block, -1) + var parts []string + for _, m := range matches { + if len(m) < 2 { + continue + } + text := stripTags(m[1]) + if text != "" { + parts = append(parts, text) + } + } + return strings.Join(parts, " ") +} + +// extractGoogleSuggestions extracts search suggestions from Google result cards. 
+func extractGoogleSuggestions(body string) []string { + var suggestions []string + // SearXNG xpath: //div[contains(@class, "ouy7Mc")]//a + suggestionPattern := regexp.MustCompile(`]*class="[^"]*ouy7Mc[^"]*"[^>]*>.*?]*>([^<]+)`, regexp.DotAll) + matches := suggestionPattern.FindAllStringSubmatch(body, -1) + seen := map[string]bool{} + for _, m := range matches { + if len(m) < 2 { + continue + } + s := strings.TrimSpace(stripTags(m[1])) + if s != "" && !seen[s] { + seen[s] = true + suggestions = append(suggestions, s) + } + } + return suggestions +} + +// googleHL maps SearXNG locale to Google hl (host language) parameter. +// e.g. "en-US" -> "en-US" +func googleHL(lang string) string { + lang = strings.ToLower(strings.TrimSpace(lang)) + if lang == "" || lang == "auto" { + return "en" + } + return lang +} + +// googleUILanguage maps SearXNG language to Google lr (language restrict) parameter. +// e.g. "en" -> "lang_en", "de" -> "lang_de" +func googleUILanguage(lang string) string { + lang = strings.ToLower(strings.Split(lang, "-")[0]) + if lang == "" || lang == "auto" { + return "" + } + return "lang_" + lang +} + +// googleSafeSearchLevel maps safesearch (0-2) to Google's safe parameter. +func googleSafeSearchLevel(safesearch int) string { + switch safesearch { + case 0: + return "off" + case 1: + return "medium" + case 2: + return "high" + default: + return "medium" + } +} + +// stripTags removes HTML tags from a string. 
+func stripTags(s string) string {
+	stripper := regexp.MustCompile(`<[^>]*>`)
+	s = stripper.ReplaceAllString(s, "")
+	// Decode the handful of HTML entities Google emits in titles and
+	// snippets (done after tag stripping so decoded text is kept verbatim).
+	s = strings.ReplaceAll(s, "&amp;", "&")
+	s = strings.ReplaceAll(s, "&quot;", `"`)
+	s = strings.ReplaceAll(s, "&#39;", "'")
+	s = strings.ReplaceAll(s, "&nbsp;", " ")
+	return strings.TrimSpace(s)
+}
diff --git a/internal/engines/planner.go b/internal/engines/planner.go
index 543f253..08b0a27 100644
--- a/internal/engines/planner.go
+++ b/internal/engines/planner.go
@@ -91,6 +91,7 @@ func inferFromCategories(categories []string) []string {
 		set["qwant"] = true
 		set["duckduckgo"] = true
 		set["bing"] = true
+		set["google"] = true
 	case "science", "scientific publications":
 		set["arxiv"] = true
 		set["crossref"] = true
@@ -106,7 +107,7 @@ func inferFromCategories(categories []string) []string {
 		out = append(out, e)
 	}
 	// stable order
-	order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "arxiv": 5, "crossref": 6, "github": 7, "reddit": 8}
+	order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "google": 5, "arxiv": 6, "crossref": 7, "github": 8, "reddit": 9}
 	sortByOrder(out, order)
 	return out
 }