// gsaUserAgents is a pool of iOS Google app User-Agent strings.
// The "GSA/…" token identifies the Google Search App on iPhone;
// Google serves these clients a plain-HTML results page that does
// not require JavaScript, which is what makes server-side scraping
// of the SERP feasible (the same technique SearXNG relies on).
//
// NOTE(review): the previous comment described these as "Google
// Search Appliance" enterprise identifiers — that is incorrect;
// they are mobile Google *app* user agents.
var gsaUserAgents = []string{
	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_5_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
}

// gsaUA returns the User-Agent header value to send with Google
// requests. Deterministic for now (always the first pool entry);
// could rotate per request to spread traffic across identities.
func gsaUA() string {
	return gsaUserAgents[0]
}
for now; could rotate +} + +type GoogleEngine struct { + client *http.Client +} + +func (e *GoogleEngine) Name() string { return "google" } + +func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) { + if strings.TrimSpace(req.Query) == "" { + return contracts.SearchResponse{Query: req.Query}, nil + } + + start := (req.Pageno - 1) * 10 + query := url.QueryEscape(req.Query) + + // Build URL like SearXNG does. + u := fmt.Sprintf( + "https://www.google.com/search?q=%s&filter=0&start=%d&hl=%s&lr=%s&safe=%s", + query, + start, + googleHL(req.Language), + googleUILanguage(req.Language), + googleSafeSearchLevel(req.Safesearch), + ) + + httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return contracts.SearchResponse{}, err + } + httpReq.Header.Set("User-Agent", gsaUA()) + httpReq.Header.Set("Accept", "*/*") + httpReq.AddCookie(&http.Cookie{Name: "CONSENT", Value: "YES+"}) + + resp, err := e.client.Do(httpReq) + if err != nil { + return contracts.SearchResponse{}, err + } + defer resp.Body.Close() + + // Check for Google block / CAPTCHA page. 
+ if detectGoogleSorry(resp) { + return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: 0, + Results: nil, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + UnresponsiveEngines: [][2]string{{"google", "blocked by Google (CAPTCHA/sorry page)"}}, + }, nil + } + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return contracts.SearchResponse{}, fmt.Errorf("google error: status=%d body=%q", resp.StatusCode, string(body)) + } + + body, err := io.ReadAll(io.LimitReader(resp.Body, 128*1024)) + if err != nil { + return contracts.SearchResponse{}, err + } + + results := parseGoogleResults(string(body), req.Query) + return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: len(results), + Results: results, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: extractGoogleSuggestions(string(body)), + UnresponsiveEngines: [][2]string{}, + }, nil +} + +// detectGoogleSorry returns true if the response is a Google block/CAPTCHA page. +func detectGoogleSorry(resp *http.Response) bool { + if resp.Request != nil { + if resp.Request.URL.Host == "sorry.google.com" || strings.HasPrefix(resp.Request.URL.Path, "/sorry") { + return true + } + } + return false +} + +// parseGoogleResults extracts search results from Google's HTML. +// Uses the same selectors as SearXNG: div.MjjYud for result containers. +func parseGoogleResults(body, query string) []contracts.MainResult { + var results []contracts.MainResult + + // SearXNG selector: .//div[contains(@class, "MjjYud")] + // Each result block contains a title link and snippet. + // We simulate the XPath matching with regex-based extraction. + + // Find all MjjYud div blocks. + mjjPattern := regexp.MustCompile(`