diff --git a/internal/engines/factory.go b/internal/engines/factory.go index 937225f..310a20e 100644 --- a/internal/engines/factory.go +++ b/internal/engines/factory.go @@ -31,7 +31,6 @@ func NewDefaultPortedEngines(client *http.Client) map[string]Engine { "duckduckgo": &DuckDuckGoEngine{client: client}, "github": &GitHubEngine{client: client}, "reddit": &RedditEngine{client: client}, - "bing": &BingEngine{client: client}, - "google": &GoogleEngine{client: client}, + "bing": &BingEngine{client: client}, } } diff --git a/internal/engines/google.go b/internal/engines/google.go deleted file mode 100644 index 0371283..0000000 --- a/internal/engines/google.go +++ /dev/null @@ -1,271 +0,0 @@ -package engines - -import ( - "context" - "fmt" - "io" - "net/http" - "net/url" - "regexp" - "strings" - - "github.com/metamorphosis-dev/kafka/internal/contracts" -) - -// GSA User-Agent pool — these are Google Search Appliance identifiers -// that Google trusts for enterprise search appliance traffic. -var gsaUserAgents = []string{ - "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPhone; CPU iPhone OS 17_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPhone; CPU iPhone OS 18_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPhone; CPU iPhone OS 18_5_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", -} - -func gsaUA() string { - return gsaUserAgents[0] // deterministic for now; could rotate -} - -type GoogleEngine struct { - client *http.Client -} - -func (e *GoogleEngine) Name() string { return "google" } - -func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) { - if strings.TrimSpace(req.Query) == "" { - return contracts.SearchResponse{Query: req.Query}, nil - } - - start := (req.Pageno - 1) * 10 - query := url.QueryEscape(req.Query) - - // Build URL like SearXNG does. - u := fmt.Sprintf( - "https://www.google.com/search?q=%s&filter=0&start=%d&hl=%s&lr=%s&safe=%s", - query, - start, - googleHL(req.Language), - googleUILanguage(req.Language), - googleSafeSearchLevel(req.Safesearch), - ) - - httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) - if err != nil { - return contracts.SearchResponse{}, err - } - httpReq.Header.Set("User-Agent", gsaUA()) - httpReq.Header.Set("Accept", "*/*") - httpReq.AddCookie(&http.Cookie{Name: "CONSENT", Value: "YES+"}) - - resp, err := e.client.Do(httpReq) - if err != nil { - return contracts.SearchResponse{}, err - } - defer resp.Body.Close() - - // Check for Google block / CAPTCHA page. - if detectGoogleSorry(resp) { - return contracts.SearchResponse{ - Query: req.Query, - NumberOfResults: 0, - Results: nil, - Answers: []map[string]any{}, - Corrections: []string{}, - Infoboxes: []map[string]any{}, - Suggestions: []string{}, - UnresponsiveEngines: [][2]string{{"google", "blocked by Google (CAPTCHA/sorry page)"}}, - }, nil - } - - if resp.StatusCode != http.StatusOK { - body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) - return contracts.SearchResponse{}, fmt.Errorf("google error: status=%d body=%q", resp.StatusCode, string(body)) - } - - body, err := io.ReadAll(io.LimitReader(resp.Body, 128*1024)) - if err != nil { - return contracts.SearchResponse{}, err - } - - results := parseGoogleResults(string(body), req.Query) - return contracts.SearchResponse{ - Query: req.Query, - NumberOfResults: len(results), - Results: results, - Answers: []map[string]any{}, - Corrections: []string{}, - Infoboxes: []map[string]any{}, - Suggestions: extractGoogleSuggestions(string(body)), - UnresponsiveEngines: [][2]string{}, - }, nil -} - -// detectGoogleSorry returns true if the response is a Google block/CAPTCHA page. -func detectGoogleSorry(resp *http.Response) bool { - if resp.Request != nil { - if resp.Request.URL.Host == "sorry.google.com" || strings.HasPrefix(resp.Request.URL.Path, "/sorry") { - return true - } - } - return false -} - -// parseGoogleResults extracts search results from Google's HTML. -// Uses the same selectors as SearXNG: div.MjjYud for result containers. -func parseGoogleResults(body, query string) []contracts.MainResult { - var results []contracts.MainResult - - // SearXNG selector: .//div[contains(@class, "MjjYud")] - // Each result block contains a title link and snippet. - // We simulate the XPath matching with regex-based extraction. - - // Find all MjjYud div blocks. - mjjPattern := regexp.MustCompile(`