Compare commits
2 commits
21b77f25bf
...
fc6e6ada68
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fc6e6ada68 | ||
|
|
7d23f13dfa |
3 changed files with 275 additions and 2 deletions
|
|
@ -31,6 +31,7 @@ func NewDefaultPortedEngines(client *http.Client) map[string]Engine {
|
|||
"duckduckgo": &DuckDuckGoEngine{client: client},
|
||||
"github": &GitHubEngine{client: client},
|
||||
"reddit": &RedditEngine{client: client},
|
||||
"bing": &BingEngine{client: client},
|
||||
"bing": &BingEngine{client: client},
|
||||
"google": &GoogleEngine{client: client},
|
||||
}
|
||||
}
|
||||
|
|
|
|||
271
internal/engines/google.go
Normal file
271
internal/engines/google.go
Normal file
|
|
@ -0,0 +1,271 @@
|
|||
package engines
|
||||
|
||||
import (
	"context"
	"fmt"
	"html"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strings"

	"github.com/metamorphosis-dev/kafka/internal/contracts"
)
|
||||
|
||||
// gsaUserAgents is a pool of iOS "GSA" User-Agent strings.
// NOTE(review): the original comment called these "Google Search Appliance"
// identifiers; in iPhone UA strings GSA denotes the Google Search App —
// confirm which is intended before relying on this for trust heuristics.
var gsaUserAgents = []string{
	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_5_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
}

// gsaUA returns the User-Agent string to send to Google. It is currently
// deterministic — always the first pool entry; rotation is a possible
// future change.
func gsaUA() string {
	return gsaUserAgents[0]
}
|
||||
|
||||
// GoogleEngine scrapes Google web search over HTTP.
type GoogleEngine struct {
	client *http.Client // shared HTTP client injected by the engine registry
}

// Name reports the registry key for this engine.
func (e *GoogleEngine) Name() string {
	return "google"
}
|
||||
|
||||
func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
|
||||
if strings.TrimSpace(req.Query) == "" {
|
||||
return contracts.SearchResponse{Query: req.Query}, nil
|
||||
}
|
||||
|
||||
start := (req.Pageno - 1) * 10
|
||||
query := url.QueryEscape(req.Query)
|
||||
|
||||
// Build URL like SearXNG does.
|
||||
u := fmt.Sprintf(
|
||||
"https://www.google.com/search?q=%s&filter=0&start=%d&hl=%s&lr=%s&safe=%s",
|
||||
query,
|
||||
start,
|
||||
googleHL(req.Language),
|
||||
googleUILanguage(req.Language),
|
||||
googleSafeSearchLevel(req.Safesearch),
|
||||
)
|
||||
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
httpReq.Header.Set("User-Agent", gsaUA())
|
||||
httpReq.Header.Set("Accept", "*/*")
|
||||
httpReq.AddCookie(&http.Cookie{Name: "CONSENT", Value: "YES+"})
|
||||
|
||||
resp, err := e.client.Do(httpReq)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Check for Google block / CAPTCHA page.
|
||||
if detectGoogleSorry(resp) {
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
NumberOfResults: 0,
|
||||
Results: nil,
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: []string{},
|
||||
UnresponsiveEngines: [][2]string{{"google", "blocked by Google (CAPTCHA/sorry page)"}},
|
||||
}, nil
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
|
||||
return contracts.SearchResponse{}, fmt.Errorf("google error: status=%d body=%q", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, 128*1024))
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
|
||||
results := parseGoogleResults(string(body), req.Query)
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
NumberOfResults: len(results),
|
||||
Results: results,
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: extractGoogleSuggestions(string(body)),
|
||||
UnresponsiveEngines: [][2]string{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// detectGoogleSorry returns true if the response is a Google block/CAPTCHA page.
|
||||
func detectGoogleSorry(resp *http.Response) bool {
|
||||
if resp.Request != nil {
|
||||
if resp.Request.URL.Host == "sorry.google.com" || strings.HasPrefix(resp.Request.URL.Path, "/sorry") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// parseGoogleResults extracts search results from Google's HTML.
|
||||
// Uses the same selectors as SearXNG: div.MjjYud for result containers.
|
||||
func parseGoogleResults(body, query string) []contracts.MainResult {
|
||||
var results []contracts.MainResult
|
||||
|
||||
// SearXNG selector: .//div[contains(@class, "MjjYud")]
|
||||
// Each result block contains a title link and snippet.
|
||||
// We simulate the XPath matching with regex-based extraction.
|
||||
|
||||
// Find all MjjYud div blocks.
|
||||
mjjPattern := regexp.MustCompile(`<div[^>]*class="[^"]*MjjYud[^"]*"[^>]*>(.*?)</div>\s*(?=<div[^>]*class="[^"]*MjjYud|$)`)
|
||||
matches := mjjPattern.FindAllStringSubmatch(body, -1)
|
||||
|
||||
for i, match := range matches {
|
||||
if len(match) < 2 {
|
||||
continue
|
||||
}
|
||||
block := match[1]
|
||||
|
||||
// Extract title and URL from the result link.
|
||||
// Pattern: <a href="/url?q=ACTUAL_URL&sa=..." ...>TITLE</a>
|
||||
urlPattern := regexp.MustCompile(`<a[^>]+href="(/url\?q=[^"&]+)`)
|
||||
urlMatch := urlPattern.FindStringSubmatch(block)
|
||||
if len(urlMatch) < 2 {
|
||||
continue
|
||||
}
|
||||
rawURL := urlMatch[1]
|
||||
// Remove /url?q= prefix and decode.
|
||||
actualURL := strings.TrimPrefix(rawURL, "/url?q=")
|
||||
if amp := strings.Index(actualURL, "&"); amp != -1 {
|
||||
actualURL = actualURL[:amp]
|
||||
}
|
||||
if decoded, err := url.QueryUnescape(actualURL); err == nil {
|
||||
actualURL = decoded
|
||||
}
|
||||
|
||||
if actualURL == "" || !strings.HasPrefix(actualURL, "http") {
|
||||
continue
|
||||
}
|
||||
|
||||
// Extract title from the title tag.
|
||||
titlePattern := regexp.MustCompile(`<span[^>]*class="[^"]*qrStP[^"]*"[^>]*>([^<]+)</span>`)
|
||||
titleMatch := titlePattern.FindStringSubmatch(block)
|
||||
title := query
|
||||
if len(titleMatch) >= 2 {
|
||||
title = stripTags(titleMatch[1])
|
||||
} else {
|
||||
// Fallback: extract visible text from an <a> with data-title or role="link"
|
||||
linkTitlePattern := regexp.MustCompile(`<a[^>]+role="link"[^>]*>([^<]+)<`)
|
||||
ltMatch := linkTitlePattern.FindStringSubmatch(block)
|
||||
if len(ltMatch) >= 2 {
|
||||
title = stripTags(ltMatch[1])
|
||||
}
|
||||
}
|
||||
|
||||
// Extract snippet from data-sncf divs (SearXNG's approach).
|
||||
snippet := extractGoogleSnippet(block)
|
||||
|
||||
urlPtr := actualURL
|
||||
results = append(results, contracts.MainResult{
|
||||
Title: title,
|
||||
URL: &urlPtr,
|
||||
Content: snippet,
|
||||
Engine: "google",
|
||||
Score: float64(len(matches) - i),
|
||||
Category: "general",
|
||||
Engines: []string{"google"},
|
||||
Template: "default.html",
|
||||
})
|
||||
}
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
// extractGoogleSnippet extracts the snippet text from a Google result block.
|
||||
func extractGoogleSnippet(block string) string {
|
||||
// Google's snippets live in divs with data-sncf attribute.
|
||||
// SearXNG looks for: .//div[contains(@data-sncf, "1")]
|
||||
snippetPattern := regexp.MustCompile(`<div[^>]+data-sncf="1"[^>]*>(.*?)</div>`)
|
||||
matches := snippetPattern.FindAllStringSubmatch(block, -1)
|
||||
var parts []string
|
||||
for _, m := range matches {
|
||||
if len(m) < 2 {
|
||||
continue
|
||||
}
|
||||
text := stripTags(m[1])
|
||||
if text != "" {
|
||||
parts = append(parts, text)
|
||||
}
|
||||
}
|
||||
return strings.Join(parts, " ")
|
||||
}
|
||||
|
||||
// extractGoogleSuggestions extracts search suggestions from Google result cards.
|
||||
func extractGoogleSuggestions(body string) []string {
|
||||
var suggestions []string
|
||||
// SearXNG xpath: //div[contains(@class, "ouy7Mc")]//a
|
||||
suggestionPattern := regexp.MustCompile(`<div[^>]*class="[^"]*ouy7Mc[^"]*"[^>]*>.*?<a[^>]*>([^<]+)</a>`, regexp.DotAll)
|
||||
matches := suggestionPattern.FindAllStringSubmatch(body, -1)
|
||||
seen := map[string]bool{}
|
||||
for _, m := range matches {
|
||||
if len(m) < 2 {
|
||||
continue
|
||||
}
|
||||
s := strings.TrimSpace(stripTags(m[1]))
|
||||
if s != "" && !seen[s] {
|
||||
seen[s] = true
|
||||
suggestions = append(suggestions, s)
|
||||
}
|
||||
}
|
||||
return suggestions
|
||||
}
|
||||
|
||||
// googleHL maps a SearXNG locale to Google's hl (host language) parameter.
// The value is trimmed and lowercased, e.g. "en-US" -> "en-us"; empty or
// "auto" falls back to "en".
func googleHL(lang string) string {
	switch l := strings.ToLower(strings.TrimSpace(lang)); l {
	case "", "auto":
		return "en"
	default:
		return l
	}
}
|
||||
|
||||
// googleUILanguage maps a SearXNG language to Google's lr (language
// restrict) parameter using only the primary subtag, e.g. "en" -> "lang_en",
// "de-DE" -> "lang_de". Empty or "auto" yields "" (no restriction).
func googleUILanguage(lang string) string {
	base, _, _ := strings.Cut(strings.ToLower(lang), "-")
	if base == "" || base == "auto" {
		return ""
	}
	return "lang_" + base
}
|
||||
|
||||
// googleSafeSearchLevel maps the SearXNG safesearch level (0-2) to Google's
// safe parameter. Out-of-range levels default to "medium".
func googleSafeSearchLevel(safesearch int) string {
	if safesearch >= 0 && safesearch <= 2 {
		return [...]string{"off", "medium", "high"}[safesearch]
	}
	return "medium"
}
|
||||
|
||||
// htmlTagRE matches any HTML tag. Compiled once at package scope instead of
// on every stripTags call.
var htmlTagRE = regexp.MustCompile(`<[^>]*>`)

// stripTags removes HTML tags from a string, decodes HTML entities
// (&amp;, &quot;, &#39;, &nbsp;, and the rest), and trims surrounding
// whitespace. The original hand-rolled a few entity replacements whose
// search strings had degenerated into no-ops (e.g. replacing "&" with "&");
// html.UnescapeString covers the full entity set correctly.
func stripTags(s string) string {
	s = htmlTagRE.ReplaceAllString(s, "")
	s = html.UnescapeString(s)
	// Normalize non-breaking spaces (from &nbsp;) to plain spaces,
	// preserving the original's intent of mapping &nbsp; to " ".
	s = strings.ReplaceAll(s, "\u00a0", " ")
	return strings.TrimSpace(s)
}
|
||||
|
|
@ -91,6 +91,7 @@ func inferFromCategories(categories []string) []string {
|
|||
set["qwant"] = true
|
||||
set["duckduckgo"] = true
|
||||
set["bing"] = true
|
||||
set["google"] = true
|
||||
case "science", "scientific publications":
|
||||
set["arxiv"] = true
|
||||
set["crossref"] = true
|
||||
|
|
@ -106,7 +107,7 @@ func inferFromCategories(categories []string) []string {
|
|||
out = append(out, e)
|
||||
}
|
||||
// stable order
|
||||
order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "arxiv": 5, "crossref": 6, "github": 7, "reddit": 8}
|
||||
order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "google": 5, "arxiv": 6, "crossref": 7, "github": 8, "reddit": 9}
|
||||
sortByOrder(out, order)
|
||||
return out
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue