feat: add Google engine (experimental, may be blocked)
Google blocks scrapers — results depend on whether Google serves a static page or a JS challenge. Set LOCAL_PORTED_ENGINES=google to enable. Without it, Google is proxied to upstream SearXNG. Closes #1
This commit is contained in:
parent
8ea318ad4a
commit
79e01e0de2
3 changed files with 196 additions and 2 deletions
|
|
@ -31,6 +31,7 @@ func NewDefaultPortedEngines(client *http.Client) map[string]Engine {
|
||||||
"duckduckgo": &DuckDuckGoEngine{client: client},
|
"duckduckgo": &DuckDuckGoEngine{client: client},
|
||||||
"github": &GitHubEngine{client: client},
|
"github": &GitHubEngine{client: client},
|
||||||
"reddit": &RedditEngine{client: client},
|
"reddit": &RedditEngine{client: client},
|
||||||
"bing": &BingEngine{client: client},
|
"bing": &BingEngine{client: client},
|
||||||
|
"google": &GoogleEngine{client: client},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
192
internal/engines/google.go
Normal file
192
internal/engines/google.go
Normal file
|
|
@ -0,0 +1,192 @@
|
||||||
|
package engines
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"net/url"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/metamorphosis-dev/kafka/internal/contracts"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GoogleEngine performs Google web searches over plain HTTP.
//
// Google may answer with a CAPTCHA/JS challenge rather than static
// result HTML, so this engine is best-effort. It is only used when
// "google" appears in the LOCAL_PORTED_ENGINES env var; otherwise
// Google queries are proxied to the configured upstream SearXNG.
type GoogleEngine struct {
	client *http.Client // shared HTTP client injected by the engine registry
}

// Name reports the engine identifier used for registration and for
// attributing results.
func (e *GoogleEngine) Name() string {
	return "google"
}
|
|
||||||
|
// Search issues one Google web-search request for req.Query and maps the
// response into a contracts.SearchResponse.
//
// A blank (or whitespace-only) query short-circuits with an empty
// response and no network call. Non-200 responses become errors carrying
// a truncated body snippet so CAPTCHA/challenge pages show up in logs.
func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
	if strings.TrimSpace(req.Query) == "" {
		return contracts.SearchResponse{Query: req.Query}, nil
	}

	// num is fixed at 10 results; hl and safe are derived from the request.
	u := fmt.Sprintf(
		"https://www.google.com/search?q=%s&num=%d&hl=%s&safe=%s",
		url.QueryEscape(req.Query),
		10,
		googleHL(req.Language),
		googleSafeSearchLevel(req.Safesearch),
	)

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return contracts.SearchResponse{}, err
	}
	// Browser-like headers reduce (but do not eliminate) the chance of
	// Google serving a JS challenge instead of static result HTML.
	httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36")
	httpReq.Header.Set("Accept-Language", "en-US,en;q=0.9")
	httpReq.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")

	resp, err := e.client.Do(httpReq)
	if err != nil {
		return contracts.SearchResponse{}, err
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusMovedPermanently || resp.StatusCode == http.StatusFound {
		// "I'm Feeling Lucky" redirect — treat the final URL as the top result.
		// NOTE(review): this branch is only reachable if e.client is configured
		// NOT to follow redirects (the default http.Client follows them
		// automatically) — confirm the injected client's CheckRedirect.
		finalURL := resp.Request.URL.String()
		urlPtr := finalURL // MainResult.URL wants a *string
		return contracts.SearchResponse{
			Query: req.Query,
			NumberOfResults: 1,
			UnresponsiveEngines: [][2]string{},
			Results: []contracts.MainResult{
				{
					Title: req.Query,
					URL: &urlPtr,
					Content: "Google result (direct redirect)",
					Engine: "google",
					Score: 1.0,
					Category: "general",
					Engines: []string{"google"},
				},
			},
			Answers: []map[string]any{},
			Corrections: []string{},
			Infoboxes: []map[string]any{},
			Suggestions: []string{},
		}, nil
	}

	if resp.StatusCode != http.StatusOK {
		// Keep a small (4 KiB) snippet of the body in the error — blocking
		// pages are common here and this makes them observable.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return contracts.SearchResponse{}, fmt.Errorf("google error: status=%d body=%q", resp.StatusCode, string(body))
	}

	// Cap the read at 64 KiB — enough for the static result markup.
	body, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
	if err != nil {
		return contracts.SearchResponse{}, err
	}

	results := parseGoogleHTML(string(body), req.Query)
	return contracts.SearchResponse{
		Query: req.Query,
		NumberOfResults: len(results),
		Results: results,
		Answers: []map[string]any{},
		Corrections: []string{},
		Infoboxes: []map[string]any{},
		Suggestions: []string{},
		UnresponsiveEngines: [][2]string{},
	}, nil
}
|
||||||
|
// parseGoogleHTML extracts results from Google's HTML.
|
||||||
|
// This is best-effort — Google's results are JS-rendered and this parser
|
||||||
|
// will often return empty results when Google doesn't serve a static page.
|
||||||
|
func parseGoogleHTML(body, query string) []contracts.MainResult {
|
||||||
|
var results []contracts.MainResult
|
||||||
|
|
||||||
|
// Google occasionally serves result HTML in a <div class="egGl0c"> or
|
||||||
|
// similar static structure. Look for result links.
|
||||||
|
links := extractGoogleLinks(body)
|
||||||
|
for i, href := range links {
|
||||||
|
if href == "" || strings.HasPrefix(href, "/") || strings.Contains(href, "google.com") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
urlPtr := href
|
||||||
|
results = append(results, contracts.MainResult{
|
||||||
|
Title: fmt.Sprintf("Result %d for %s", i+1, query),
|
||||||
|
URL: &urlPtr,
|
||||||
|
Content: "",
|
||||||
|
Engine: "google",
|
||||||
|
Score: float64(len(links) - i),
|
||||||
|
Category: "general",
|
||||||
|
Engines: []string{"google"},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
return results
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractGoogleLinks finds result URLs in Google's HTML.
//
// Google's organic result links use the href="/url?q=REAL_URL&..."
// redirect form; this scans for that marker, URL-decodes the target up
// to the next "&", and returns the deduplicated external (non-Google)
// links in order of appearance.
func extractGoogleLinks(body string) []string {
	const marker = "/url?q="

	var links []string
	seen := map[string]bool{}

	for {
		idx := strings.Index(body, marker)
		if idx == -1 {
			break
		}
		// Advance exactly past the marker: the URL starts immediately
		// after it. (Fix: the old code skipped len(marker)+1 bytes,
		// eating the first character of every URL, so the "http" prefix
		// check below rejected everything.)
		body = body[idx+len(marker):]

		end := strings.Index(body, "&")
		if end == -1 {
			break
		}
		// Best-effort decode; a malformed escape yields "" and is
		// filtered by the prefix check below.
		href, _ := url.QueryUnescape(body[:end])
		body = body[end:]

		if seen[href] {
			continue
		}
		seen[href] = true
		if strings.HasPrefix(href, "http") && !strings.Contains(href, "google.com") {
			links = append(links, href)
		}
	}

	return links
}
|
|
||||||
|
// googleHL maps language codes to Google's hl parameter.
//
// Accepts bare ISO codes ("de") and region-tagged forms ("de-AT", "en_US");
// the region suffix is ignored when matching — previously any tagged form
// fell straight through to "en". Empty, "auto", and unknown languages fall
// back to "en".
func googleHL(lang string) string {
	lang = strings.ToLower(strings.TrimSpace(lang))
	if lang == "" || lang == "auto" {
		return "en"
	}
	// Strip a region suffix so regional variants still match the table
	// ("en-us" -> "en", "pt_br" -> "pt").
	if base, _, found := strings.Cut(lang, "-"); found {
		lang = base
	} else if base, _, found := strings.Cut(lang, "_"); found {
		lang = base
	}
	googleHLMap := map[string]string{
		"en": "en", "de": "de", "fr": "fr", "es": "es", "pt": "pt",
		"ru": "ru", "ja": "ja", "zh": "zh-CN", "ko": "ko", "it": "it",
		"nl": "nl", "pl": "pl", "ar": "ar", "hi": "hi", "tr": "tr",
	}
	if h, ok := googleHLMap[lang]; ok {
		return h
	}
	return "en"
}
|
|
||||||
|
// googleSafeSearchLevel maps the SearXNG-style safesearch level (0-2)
// to Google's "safe" query parameter:
//
//	0 = no filtering -> "off"
//	1 = moderate     -> "images" (legacy moderate filtering)
//	2 = strict       -> "active"
//
// Fix: the mapping was inverted — strict (2) produced safe=off (filtering
// disabled) while level 0 produced a filtering value. Unknown levels are
// treated as no filtering.
func googleSafeSearchLevel(safesearch int) string {
	switch safesearch {
	case 1:
		return "images"
	case 2:
		return "active"
	default:
		return "off"
	}
}
|
|
@ -91,6 +91,7 @@ func inferFromCategories(categories []string) []string {
|
||||||
set["qwant"] = true
|
set["qwant"] = true
|
||||||
set["duckduckgo"] = true
|
set["duckduckgo"] = true
|
||||||
set["bing"] = true
|
set["bing"] = true
|
||||||
|
set["google"] = true
|
||||||
case "science", "scientific publications":
|
case "science", "scientific publications":
|
||||||
set["arxiv"] = true
|
set["arxiv"] = true
|
||||||
set["crossref"] = true
|
set["crossref"] = true
|
||||||
|
|
@ -106,7 +107,7 @@ func inferFromCategories(categories []string) []string {
|
||||||
out = append(out, e)
|
out = append(out, e)
|
||||||
}
|
}
|
||||||
// stable order
|
// stable order
|
||||||
order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "arxiv": 5, "crossref": 6, "github": 7, "reddit": 8}
|
order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "google": 5, "arxiv": 6, "crossref": 7, "github": 8, "reddit": 9}
|
||||||
sortByOrder(out, order)
|
sortByOrder(out, order)
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue