package engines

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"

	"github.com/metamorphosis-dev/kafka/internal/contracts"
)

// GoogleEngine searches Google via direct HTTP.
//
// Google may block these requests (CAPTCHA/challenge pages), so results are
// best-effort. To use this engine, add "google" to the LOCAL_PORTED_ENGINES
// env var; without that, Google is proxied to the configured upstream
// SearXNG as normal.
type GoogleEngine struct {
	// client performs the outgoing requests. When nil, Search falls back to
	// http.DefaultClient.
	client *http.Client
}

// Name returns the engine identifier, "google".
func (e *GoogleEngine) Name() string { return "google" }

// Search queries www.google.com for req.Query and returns whatever result
// links can be scraped from the returned HTML.
//
// A blank query yields an empty response without any network traffic. A
// 301/302 answer is reported as a single result pointing at the redirect
// target. Any other non-200 status is returned as an error that carries the
// status code and the first 4 KiB of the body.
func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
	if strings.TrimSpace(req.Query) == "" {
		return contracts.SearchResponse{Query: req.Query}, nil
	}

	u := fmt.Sprintf(
		"https://www.google.com/search?q=%s&num=%d&hl=%s&safe=%s",
		url.QueryEscape(req.Query),
		10,
		googleHL(req.Language),
		googleSafeSearchLevel(req.Safesearch),
	)

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("google: building request: %w", err)
	}
	httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36")
	httpReq.Header.Set("Accept-Language", "en-US,en;q=0.9")
	httpReq.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")

	// A zero-value engine has no client; use the shared default instead of
	// dereferencing nil.
	client := e.client
	if client == nil {
		client = http.DefaultClient
	}

	resp, err := client.Do(httpReq)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("google: performing request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusMovedPermanently || resp.StatusCode == http.StatusFound {
		// "I'm Feeling Lucky" redirect — treat the destination as the top
		// result. A redirect status is only visible here when the client did
		// not follow it, in which case resp.Request.URL is still the search
		// URL; the destination is in the Location header.
		target := resp.Request.URL.String()
		if loc, locErr := resp.Location(); locErr == nil {
			target = loc.String()
		}
		return contracts.SearchResponse{
			Query:               req.Query,
			NumberOfResults:     1,
			UnresponsiveEngines: [][2]string{},
			Results: []contracts.MainResult{
				{
					Title:    req.Query,
					URL:      &target,
					Content:  "Google result (direct redirect)",
					Engine:   "google",
					Score:    1.0,
					Category: "general",
					Engines:  []string{"google"},
				},
			},
			Answers:     []map[string]any{},
			Corrections: []string{},
			Infoboxes:   []map[string]any{},
			Suggestions: []string{},
		}, nil
	}

	if resp.StatusCode != http.StatusOK {
		// The read error is deliberately ignored: the snippet only decorates
		// the error message, and the status code is reported regardless.
		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return contracts.SearchResponse{}, fmt.Errorf("google error: status=%d body=%q", resp.StatusCode, string(snippet))
	}

	// Only the first 64 KiB of the page is inspected.
	body, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("google: reading response body: %w", err)
	}

	results := parseGoogleHTML(string(body), req.Query)
	return contracts.SearchResponse{
		Query:               req.Query,
		NumberOfResults:     len(results),
		Results:             results,
		Answers:             []map[string]any{},
		Corrections:         []string{},
		Infoboxes:           []map[string]any{},
		Suggestions:         []string{},
		UnresponsiveEngines: [][2]string{},
	}, nil
}

// parseGoogleHTML extracts results from Google's HTML.
//
// This is best-effort — Google's results are JS-rendered and this parser
// will often return empty results when Google doesn't serve a static page.
// The returned slice is never nil, matching the other list fields that
// Search fills with empty slices.
func parseGoogleHTML(body, query string) []contracts.MainResult {
	// Google occasionally serves result links in static (non-JS) markup;
	// look for those links.
	links := extractGoogleLinks(body)

	results := make([]contracts.MainResult, 0, len(links))
	for i, href := range links {
		if href == "" || strings.HasPrefix(href, "/") || strings.Contains(href, "google.com") {
			continue
		}
		target := href
		results = append(results, contracts.MainResult{
			// No title is scraped, so a positional placeholder is used.
			Title:   fmt.Sprintf("Result %d for %s", i+1, query),
			URL:     &target,
			Content: "",
			Engine:  "google",
			// Earlier links receive higher scores.
			Score:    float64(len(links) - i),
			Category: "general",
			Engines:  []string{"google"},
		})
	}
	return results
}

// extractGoogleLinks finds result URLs in Google's HTML.
//
// It scans for href values of the form "/url?q=REAL_URL&...", unescapes
// REAL_URL, and returns each distinct URL that starts with "http" and does
// not contain "google.com", in order of first appearance.
func extractGoogleLinks(body string) []string {
	const marker = "/url?q="

	var links []string
	seen := map[string]bool{}

	for {
		idx := strings.Index(body, marker)
		if idx == -1 {
			break
		}
		// Skip exactly the marker so the URL keeps its first character.
		body = body[idx+len(marker):]

		end := strings.Index(body, "&")
		if end == -1 {
			break
		}
		raw := body[:end]
		body = body[end:]

		href, err := url.QueryUnescape(raw)
		if err != nil {
			// Malformed escape sequence: not a usable link.
			continue
		}
		if seen[href] {
			continue
		}
		seen[href] = true

		if strings.HasPrefix(href, "http") && !strings.Contains(href, "google.com") {
			links = append(links, href)
		}
	}
	return links
}

// googleHL maps language codes to Google hl parameter.
//
// Matching is case-insensitive and ignores a region suffix ("de-DE" and
// "pt_BR" map like "de" and "pt"). Chinese maps to "zh-CN", except "zh-TW"
// which maps to "zh-TW". Empty, "auto", and unsupported codes yield "en".
func googleHL(lang string) string {
	lang = strings.ToLower(strings.TrimSpace(lang))
	if lang == "" || lang == "auto" {
		return "en"
	}

	lang = strings.ReplaceAll(lang, "_", "-")
	if lang == "zh-tw" {
		return "zh-TW"
	}

	primary, _, _ := strings.Cut(lang, "-")
	switch primary {
	case "zh":
		return "zh-CN"
	case "en", "de", "fr", "es", "pt", "ru", "ja", "ko", "it", "nl", "pl", "ar", "hi", "tr":
		return primary
	}
	return "en"
}

// googleSafeSearchLevel maps safesearch (0-2) to Google's safe search string.
//
// 0 (no filtering) maps to "off", 1 (moderate) to "images", and 2 (strict)
// to "active". Out-of-range values fall back to the moderate setting.
func googleSafeSearchLevel(safesearch int) string {
	switch safesearch {
	case 0:
		return "off"
	case 1:
		return "images"
	case 2:
		return "active"
	default:
		return "images"
	}
}