From 79e01e0de28e11d1bb50db4b31985b5b772576a0 Mon Sep 17 00:00:00 2001 From: Franz Kafka Date: Sun, 22 Mar 2026 01:25:04 +0000 Subject: [PATCH] feat: add Google engine (experimental, may be blocked) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Google blocks scrapers — results depend on whether Google serves a static page or a JS challenge. Set LOCAL_PORTED_ENGINES=google to enable. Without it, Google is proxied to upstream SearXNG. Closes #1 --- internal/engines/factory.go | 3 +- internal/engines/google.go | 192 ++++++++++++++++++++++++++++++++++++ internal/engines/planner.go | 3 +- 3 files changed, 196 insertions(+), 2 deletions(-) create mode 100644 internal/engines/google.go diff --git a/internal/engines/factory.go b/internal/engines/factory.go index 310a20e..937225f 100644 --- a/internal/engines/factory.go +++ b/internal/engines/factory.go @@ -31,6 +31,7 @@ func NewDefaultPortedEngines(client *http.Client) map[string]Engine { "duckduckgo": &DuckDuckGoEngine{client: client}, "github": &GitHubEngine{client: client}, "reddit": &RedditEngine{client: client}, - "bing": &BingEngine{client: client}, + "bing": &BingEngine{client: client}, + "google": &GoogleEngine{client: client}, } } diff --git a/internal/engines/google.go b/internal/engines/google.go new file mode 100644 index 0000000..2fbd00a --- /dev/null +++ b/internal/engines/google.go @@ -0,0 +1,192 @@ +package engines + +import ( + "context" + "fmt" + "io" + "net/http" + "net/url" + "strings" + + "github.com/metamorphosis-dev/kafka/internal/contracts" +) + +// GoogleEngine searches Google via direct HTTP. +// This may be blocked by Google (CAPTCHA/challenge). When blocked, +// the search service falls back to the configured upstream SearXNG +// if "google" is NOT in LOCAL_PORTED_ENGINES (i.e., it's treated as upstream). +// +// To use: add "google" to LOCAL_PORTED_ENGINES env var. +// Without that, Google is proxied to upstream SearXNG as normal. 
+type GoogleEngine struct {
+	client *http.Client
+}
+
+// Name returns the engine identifier used in LOCAL_PORTED_ENGINES.
+func (e *GoogleEngine) Name() string { return "google" }
+
+// Search performs a Google web search over plain HTTP and maps the
+// response into the shared contracts.SearchResponse shape. Best-effort:
+// Google frequently answers with a JS challenge instead of static HTML.
+func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
+	if strings.TrimSpace(req.Query) == "" {
+		return contracts.SearchResponse{Query: req.Query}, nil
+	}
+
+	u := fmt.Sprintf(
+		"https://www.google.com/search?q=%s&num=%d&hl=%s&safe=%s",
+		url.QueryEscape(req.Query),
+		10,
+		googleHL(req.Language),
+		googleSafeSearchLevel(req.Safesearch),
+	)
+
+	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
+	if err != nil {
+		return contracts.SearchResponse{}, err
+	}
+	// A desktop-browser UA makes Google more likely to serve static HTML
+	// instead of a JS challenge page.
+	httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36")
+	httpReq.Header.Set("Accept-Language", "en-US,en;q=0.9")
+	httpReq.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
+
+	resp, err := e.client.Do(httpReq)
+	if err != nil {
+		return contracts.SearchResponse{}, err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode == http.StatusMovedPermanently || resp.StatusCode == http.StatusFound {
+		// "I'm Feeling Lucky" redirect — treat the target as the top result.
+		// This branch is only reachable when the injected client does not
+		// follow redirects; the target then lives in the Location header.
+		// resp.Request.URL is the URL that *produced* the redirect, so it
+		// is only a fallback when Location is absent.
+		finalURL := resp.Header.Get("Location")
+		if finalURL == "" {
+			finalURL = resp.Request.URL.String()
+		}
+		urlPtr := finalURL
+		return contracts.SearchResponse{
+			Query:               req.Query,
+			NumberOfResults:     1,
+			UnresponsiveEngines: [][2]string{},
+			Results: []contracts.MainResult{
+				{
+					Title:    req.Query,
+					URL:      &urlPtr,
+					Content:  "Google result (direct redirect)",
+					Engine:   "google",
+					Score:    1.0,
+					Category: "general",
+					Engines:  []string{"google"},
+				},
+			},
+			Answers:     []map[string]any{},
+			Corrections: []string{},
+			Infoboxes:   []map[string]any{},
+			Suggestions: []string{},
+		}, nil
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+		return contracts.SearchResponse{}, fmt.Errorf("google error: status=%d body=%q", resp.StatusCode, string(body))
+	}
+
+	// Google result pages are commonly several hundred KB; capping at
+	// 64 KiB would silently drop most of the result markup, so allow 1 MiB.
+	body, err := io.ReadAll(io.LimitReader(resp.Body, 1024*1024))
+	if err != nil {
+		return contracts.SearchResponse{}, err
+	}
+
+	results := parseGoogleHTML(string(body), req.Query)
+	return contracts.SearchResponse{
+		Query:               req.Query,
+		NumberOfResults:     len(results),
+		Results:             results,
+		Answers:             []map[string]any{},
+		Corrections:         []string{},
+		Infoboxes:           []map[string]any{},
+		Suggestions:         []string{},
+		UnresponsiveEngines: [][2]string{},
+	}, nil
+}
+
+// parseGoogleHTML extracts results from Google's HTML.
+// This is best-effort — Google's results are JS-rendered and this parser
+// will often return empty results when Google doesn't serve a static page.
+func parseGoogleHTML(body, query string) []contracts.MainResult {
+	var results []contracts.MainResult
+
+	// Google occasionally serves result HTML in a static <div>-based
+	// structure. Look for result links there.
+	links := extractGoogleLinks(body)
+	for i, href := range links {
+		if href == "" || strings.HasPrefix(href, "/") || strings.Contains(href, "google.com") {
+			continue
+		}
+		urlPtr := href
+		results = append(results, contracts.MainResult{
+			Title:    fmt.Sprintf("Result %d for %s", i+1, query),
+			URL:      &urlPtr,
+			Content:  "",
+			Engine:   "google",
+			// Earlier links rank higher; score decreases with position.
+			Score:    float64(len(links) - i),
+			Category: "general",
+			Engines:  []string{"google"},
+		})
+	}
+
+	return results
+}
+
+// extractGoogleLinks finds result URLs in Google's HTML.
+// Google's result links typically have an href="/url?q=REAL_URL&..." shape.
+func extractGoogleLinks(body string) []string {
+	const marker = "/url?q="
+	var links []string
+	seen := map[string]bool{}
+
+	for {
+		idx := strings.Index(body, marker)
+		if idx == -1 {
+			break
+		}
+		// Skip exactly len(marker) (7) bytes. The previous code skipped 8,
+		// which dropped the first character of every extracted URL.
+		body = body[idx+len(marker):]
+		end := strings.Index(body, "&")
+		if end == -1 {
+			break
+		}
+		href, _ := url.QueryUnescape(body[:end])
+		body = body[end:]
+
+		if seen[href] {
+			continue
+		}
+		seen[href] = true
+		if strings.HasPrefix(href, "http") && !strings.Contains(href, "google.com") {
+			links = append(links, href)
+		}
+	}
+
+	return links
+}
+
+// googleHL maps language codes to Google hl parameter.
+func googleHL(lang string) string {
+	lang = strings.ToLower(strings.TrimSpace(lang))
+	if lang == "" || lang == "auto" {
+		return "en"
+	}
+	googleHLMap := map[string]string{
+		"en": "en", "de": "de", "fr": "fr", "es": "es", "pt": "pt",
+		"ru": "ru", "ja": "ja", "zh": "zh-CN", "ko": "ko", "it": "it",
+		"nl": "nl", "pl": "pl", "ar": "ar", "hi": "hi", "tr": "tr",
+	}
+	if h, ok := googleHLMap[lang]; ok {
+		return h
+	}
+	return "en"
+}
+
+// googleSafeSearchLevel maps safesearch (0-2) to Google's safe search string.
+func googleSafeSearchLevel(safesearch int) string {
+	// SearXNG convention: 0 = off, 1 = moderate, 2 = strict.
+	// Google's "safe" URL parameter only supports "off" and "active"
+	// (moderate filtering was retired), so 1 and 2 both map to "active".
+	// The previous mapping was inverted and used "images", which is not a
+	// valid safe-search value at all.
+	switch safesearch {
+	case 0:
+		return "off"
+	default:
+		return "active"
+	}
+}
diff --git a/internal/engines/planner.go b/internal/engines/planner.go
index 543f253..08b0a27 100644
--- a/internal/engines/planner.go
+++ b/internal/engines/planner.go
@@ -91,6 +91,7 @@ func inferFromCategories(categories []string) []string {
 		set["qwant"] = true
 		set["duckduckgo"] = true
 		set["bing"] = true
+		set["google"] = true
 	case "science", "scientific publications":
 		set["arxiv"] = true
 		set["crossref"] = true
@@ -106,7 +107,7 @@ func inferFromCategories(categories []string) []string {
 		out = append(out, e)
 	}
 	// stable order
-	order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "arxiv": 5, "crossref": 6, "github": 7, "reddit": 8}
+	order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "google": 5, "arxiv": 6, "crossref": 7, "github": 8, "reddit": 9}
 	sortByOrder(out, order)
 	return out
 }