diff --git a/internal/engines/factory.go b/internal/engines/factory.go index 937225f..310a20e 100644 --- a/internal/engines/factory.go +++ b/internal/engines/factory.go @@ -31,7 +31,6 @@ func NewDefaultPortedEngines(client *http.Client) map[string]Engine { "duckduckgo": &DuckDuckGoEngine{client: client}, "github": &GitHubEngine{client: client}, "reddit": &RedditEngine{client: client}, - "bing": &BingEngine{client: client}, - "google": &GoogleEngine{client: client}, + "bing": &BingEngine{client: client}, } } diff --git a/internal/engines/google.go b/internal/engines/google.go deleted file mode 100644 index 0371283..0000000 --- a/internal/engines/google.go +++ /dev/null @@ -1,271 +0,0 @@ -package engines - -import ( - "context" - "fmt" - "io" - "net/http" - "net/url" - "regexp" - "strings" - - "github.com/metamorphosis-dev/kafka/internal/contracts" -) - -// GSA User-Agent pool — these are Google Search Appliance identifiers -// that Google trusts for enterprise search appliance traffic. -var gsaUserAgents = []string{ - "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPhone; CPU iPhone OS 17_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPhone; CPU iPhone OS 18_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPhone; CPU iPhone OS 18_5_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", -} - -func gsaUA() string { - return gsaUserAgents[0] // 
deterministic for now; could rotate -} - -type GoogleEngine struct { - client *http.Client -} - -func (e *GoogleEngine) Name() string { return "google" } - -func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) { - if strings.TrimSpace(req.Query) == "" { - return contracts.SearchResponse{Query: req.Query}, nil - } - - start := (req.Pageno - 1) * 10 - query := url.QueryEscape(req.Query) - - // Build URL like SearXNG does. - u := fmt.Sprintf( - "https://www.google.com/search?q=%s&filter=0&start=%d&hl=%s&lr=%s&safe=%s", - query, - start, - googleHL(req.Language), - googleUILanguage(req.Language), - googleSafeSearchLevel(req.Safesearch), - ) - - httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) - if err != nil { - return contracts.SearchResponse{}, err - } - httpReq.Header.Set("User-Agent", gsaUA()) - httpReq.Header.Set("Accept", "*/*") - httpReq.AddCookie(&http.Cookie{Name: "CONSENT", Value: "YES+"}) - - resp, err := e.client.Do(httpReq) - if err != nil { - return contracts.SearchResponse{}, err - } - defer resp.Body.Close() - - // Check for Google block / CAPTCHA page. 
- if detectGoogleSorry(resp) { - return contracts.SearchResponse{ - Query: req.Query, - NumberOfResults: 0, - Results: nil, - Answers: []map[string]any{}, - Corrections: []string{}, - Infoboxes: []map[string]any{}, - Suggestions: []string{}, - UnresponsiveEngines: [][2]string{{"google", "blocked by Google (CAPTCHA/sorry page)"}}, - }, nil - } - - if resp.StatusCode != http.StatusOK { - body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) - return contracts.SearchResponse{}, fmt.Errorf("google error: status=%d body=%q", resp.StatusCode, string(body)) - } - - body, err := io.ReadAll(io.LimitReader(resp.Body, 128*1024)) - if err != nil { - return contracts.SearchResponse{}, err - } - - results := parseGoogleResults(string(body), req.Query) - return contracts.SearchResponse{ - Query: req.Query, - NumberOfResults: len(results), - Results: results, - Answers: []map[string]any{}, - Corrections: []string{}, - Infoboxes: []map[string]any{}, - Suggestions: extractGoogleSuggestions(string(body)), - UnresponsiveEngines: [][2]string{}, - }, nil -} - -// detectGoogleSorry returns true if the response is a Google block/CAPTCHA page. -func detectGoogleSorry(resp *http.Response) bool { - if resp.Request != nil { - if resp.Request.URL.Host == "sorry.google.com" || strings.HasPrefix(resp.Request.URL.Path, "/sorry") { - return true - } - } - return false -} - -// parseGoogleResults extracts search results from Google's HTML. -// Uses the same selectors as SearXNG: div.MjjYud for result containers. -func parseGoogleResults(body, query string) []contracts.MainResult { - var results []contracts.MainResult - - // SearXNG selector: .//div[contains(@class, "MjjYud")] - // Each result block contains a title link and snippet. - // We simulate the XPath matching with regex-based extraction. - - // Find all MjjYud div blocks. 
- mjjPattern := regexp.MustCompile(`<div[^>]*class="[^"]*MjjYud[^"]*"[^>]*>(.*?)\s*(?=<div[^>]*class="[^"]*MjjYud|$)`) - matches := mjjPattern.FindAllStringSubmatch(body, -1) - - for i, match := range matches { - if len(match) < 2 { - continue - } - block := match[1] - - // Extract title and URL from the result link. - // Pattern: <a href="/url?q=..."><h3>TITLE</h3></a> - urlPattern := regexp.MustCompile(`<a[^>]+href="(/url\?q=[^"&]+)`) - urlMatch := urlPattern.FindStringSubmatch(block) - if len(urlMatch) < 2 { - continue - } - rawURL := urlMatch[1] - // Remove /url?q= prefix and decode. - actualURL := strings.TrimPrefix(rawURL, "/url?q=") - if amp := strings.Index(actualURL, "&"); amp != -1 { - actualURL = actualURL[:amp] - } - if decoded, err := url.QueryUnescape(actualURL); err == nil { - actualURL = decoded - } - - if actualURL == "" || !strings.HasPrefix(actualURL, "http") { - continue - } - - // Extract title from the title tag. - titlePattern := regexp.MustCompile(`<h3[^>]*class="[^"]*qrStP[^"]*"[^>]*>([^<]+)`) - titleMatch := titlePattern.FindStringSubmatch(block) - title := query - if len(titleMatch) >= 2 { - title = stripTags(titleMatch[1]) - } else { - // Fallback: extract visible text from an <a> with data-title or role="link" - linkTitlePattern := regexp.MustCompile(`<a[^>]+role="link"[^>]*>([^<]+)<`) - ltMatch := linkTitlePattern.FindStringSubmatch(block) - if len(ltMatch) >= 2 { - title = stripTags(ltMatch[1]) - } - } - - // Extract snippet from data-sncf divs (SearXNG's approach). - snippet := extractGoogleSnippet(block) - - urlPtr := actualURL - results = append(results, contracts.MainResult{ - Title: title, - URL: &urlPtr, - Content: snippet, - Engine: "google", - Score: float64(len(matches) - i), - Category: "general", - Engines: []string{"google"}, - Template: "default.html", - }) - } - - return results -} - -// extractGoogleSnippet extracts the snippet text from a Google result block. -func extractGoogleSnippet(block string) string { - // Google's snippets live in divs with data-sncf attribute. 
- // SearXNG looks for: .//div[contains(@data-sncf, "1")] - snippetPattern := regexp.MustCompile(`<div[^>]+data-sncf="1"[^>]*>(.*?)</div>`) - matches := snippetPattern.FindAllStringSubmatch(block, -1) - var parts []string - for _, m := range matches { - if len(m) < 2 { - continue - } - text := stripTags(m[1]) - if text != "" { - parts = append(parts, text) - } - } - return strings.Join(parts, " ") -} - -// extractGoogleSuggestions extracts search suggestions from Google result cards. -func extractGoogleSuggestions(body string) []string { - var suggestions []string - // SearXNG xpath: //div[contains(@class, "ouy7Mc")]//a - suggestionPattern := regexp.MustCompile(`(?s)<div[^>]*class="[^"]*ouy7Mc[^"]*"[^>]*>.*?<a[^>]*>([^<]+)</a>`) - matches := suggestionPattern.FindAllStringSubmatch(body, -1) - seen := map[string]bool{} - for _, m := range matches { - if len(m) < 2 { - continue - } - s := strings.TrimSpace(stripTags(m[1])) - if s != "" && !seen[s] { - seen[s] = true - suggestions = append(suggestions, s) - } - } - return suggestions -} - -// googleHL maps SearXNG locale to Google hl (host language) parameter. -// e.g. "en-US" -> "en-US" -func googleHL(lang string) string { - lang = strings.ToLower(strings.TrimSpace(lang)) - if lang == "" || lang == "auto" { - return "en" - } - return lang -} - -// googleUILanguage maps SearXNG language to Google lr (language restrict) parameter. -// e.g. "en" -> "lang_en", "de" -> "lang_de" -func googleUILanguage(lang string) string { - lang = strings.ToLower(strings.Split(lang, "-")[0]) - if lang == "" || lang == "auto" { - return "" - } - return "lang_" + lang -} - -// googleSafeSearchLevel maps safesearch (0-2) to Google's safe parameter. -func googleSafeSearchLevel(safesearch int) string { - switch safesearch { - case 0: - return "off" - case 1: - return "medium" - case 2: - return "high" - default: - return "medium" - } -} - -// stripTags removes HTML tags from a string. 
-func stripTags(s string) string { - stripper := regexp.MustCompile(`<[^>]*>`) - s = stripper.ReplaceAllString(s, "") - s = strings.ReplaceAll(s, "&amp;", "&") - s = strings.ReplaceAll(s, "&quot;", `"`) - s = strings.ReplaceAll(s, "&#39;", "'") - s = strings.ReplaceAll(s, "&nbsp;", " ") - return strings.TrimSpace(s) -} diff --git a/internal/engines/planner.go b/internal/engines/planner.go index 24af031..56df656 100644 --- a/internal/engines/planner.go +++ b/internal/engines/planner.go @@ -91,7 +91,6 @@ func inferFromCategories(categories []string) []string { set["qwant"] = true set["duckduckgo"] = true set["bing"] = true - set["google"] = true case "science", "scientific publications": set["arxiv"] = true set["crossref"] = true @@ -107,7 +106,7 @@ func inferFromCategories(categories []string) []string { out = append(out, e) } // stable order - order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "google": 5, "arxiv": 6, "crossref": 7, "github": 8, "reddit": 9} + order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "arxiv": 5, "crossref": 6, "github": 7, "reddit": 8} sortByOrder(out, order) return out }