kafka/internal/engines/google.go
Franz Kafka 7be03b4017 license: change from MIT to AGPLv3
Update LICENSE file and add AGPL header to all source files.

AGPLv3 ensures that if someone runs Kafka as a network service and
modifies it, they must release their source code under the same license.
2026-03-22 08:27:23 +00:00

287 lines
9.2 KiB
Go

// kafka — a privacy-respecting metasearch engine
// Copyright (C) 2026-present metamorphosis-dev
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package engines
import (
"context"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"github.com/metamorphosis-dev/kafka/internal/contracts"
)
// gsaUserAgents is the pool of User-Agent strings available for requests to
// Google. Every entry is an iPhone/Safari-style string carrying a
// "GSA/<version>" token; the entries differ only in iOS and GSA version.
// NOTE(review): "GSA" in these strings most likely denotes the Google app for
// iOS rather than the Google Search Appliance — confirm before documenting
// either reading as fact.
// Only the first entry is currently used (see gsaUA).
var gsaUserAgents = []string{
"Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (iPhone; CPU iPhone OS 17_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (iPhone; CPU iPhone OS 18_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (iPhone; CPU iPhone OS 18_5_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
}
// gsaUA picks the User-Agent header value for outgoing Google requests.
// The choice is intentionally fixed to the first pool entry so requests are
// deterministic; rotation through the pool is not implemented.
func gsaUA() string {
	const fixedIndex = 0
	return gsaUserAgents[fixedIndex]
}
// GoogleEngine is the search engine backed by Google web search. It fetches
// the HTML results page over HTTP and scrapes results out of the markup.
type GoogleEngine struct {
// client performs the outgoing HTTP requests. Search calls client.Do
// unconditionally, so it must be non-nil.
client *http.Client
}
// Name returns the engine's identifier, the constant string "google".
func (e *GoogleEngine) Name() string {
	const engineName = "google"
	return engineName
}
// Search queries Google web search for req.Query and returns the results
// scraped from the response HTML.
//
// A blank query short-circuits to an empty response without any network
// traffic. When the response comes from Google's "sorry"/CAPTCHA page (as
// reported by detectGoogleSorry) the call still succeeds, but the response
// carries no results and lists google in UnresponsiveEngines. Request
// construction failures, transport failures, non-200 statuses and body read
// failures are returned as errors; underlying errors are wrapped with %w so
// errors.Is / errors.As keep working (e.g. for context cancellation).
func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
	if strings.TrimSpace(req.Query) == "" {
		return contracts.SearchResponse{Query: req.Query}, nil
	}

	// Pages are addressed by an offset of 10 results per page. Clamp the
	// offset so a page number below 1 can never yield a negative start value.
	start := (req.Pageno - 1) * 10
	if start < 0 {
		start = 0
	}

	// Build URL like SearXNG does. The language-derived values are escaped
	// as well, so an unexpected character in req.Language cannot inject or
	// corrupt query parameters; ordinary codes such as "en-us" or "lang_en"
	// pass through unchanged.
	u := fmt.Sprintf(
		"https://www.google.com/search?q=%s&filter=0&start=%d&hl=%s&lr=%s&safe=%s",
		url.QueryEscape(req.Query),
		start,
		url.QueryEscape(googleHL(req.Language)),
		url.QueryEscape(googleUILanguage(req.Language)),
		googleSafeSearchLevel(req.Safesearch),
	)

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("google: building request: %w", err)
	}
	httpReq.Header.Set("User-Agent", gsaUA())
	httpReq.Header.Set("Accept", "*/*")
	// NOTE(review): the CONSENT cookie is presumably sent to skip Google's
	// cookie-consent interstitial — confirm.
	httpReq.AddCookie(&http.Cookie{Name: "CONSENT", Value: "YES+"})

	resp, err := e.client.Do(httpReq)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("google: sending request: %w", err)
	}
	defer resp.Body.Close()

	// Check for Google block / CAPTCHA page. This is reported as an
	// unresponsive engine rather than as an error.
	if detectGoogleSorry(resp) {
		return contracts.SearchResponse{
			Query:               req.Query,
			NumberOfResults:     0,
			Results:             nil,
			Answers:             []map[string]any{},
			Corrections:         []string{},
			Infoboxes:           []map[string]any{},
			Suggestions:         []string{},
			UnresponsiveEngines: [][2]string{{"google", "blocked by Google (CAPTCHA/sorry page)"}},
		}, nil
	}

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body excerpt only enriches the error message, so
		// a failure while reading it is deliberately ignored.
		excerpt, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return contracts.SearchResponse{}, fmt.Errorf("google error: status=%d body=%q", resp.StatusCode, string(excerpt))
	}

	// Only the first 128 KiB of the page are read and parsed.
	raw, err := io.ReadAll(io.LimitReader(resp.Body, 128*1024))
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("google: reading response body: %w", err)
	}

	// Convert once; both extractors work on the same string.
	page := string(raw)
	results := parseGoogleResults(page, req.Query)
	return contracts.SearchResponse{
		Query:               req.Query,
		NumberOfResults:     len(results),
		Results:             results,
		Answers:             []map[string]any{},
		Corrections:         []string{},
		Infoboxes:           []map[string]any{},
		Suggestions:         extractGoogleSuggestions(page),
		UnresponsiveEngines: [][2]string{},
	}, nil
}
// detectGoogleSorry reports whether resp was served by Google's block/CAPTCHA
// ("sorry") page. The decision is based solely on the URL of the request that
// produced the response: either the host is sorry.google.com or the path
// starts with /sorry. A response with no associated request is never treated
// as blocked.
func detectGoogleSorry(resp *http.Response) bool {
	origin := resp.Request
	if origin == nil {
		return false
	}
	loc := origin.URL
	return loc.Host == "sorry.google.com" || strings.HasPrefix(loc.Path, "/sorry")
}
// Patterns used by parseGoogleResults, compiled once at package scope.
var (
	// googleResultOpenRe matches the opening tag of a result container
	// (SearXNG selector: .//div[contains(@class, "MjjYud")]).
	googleResultOpenRe = regexp.MustCompile(`<div[^>]*class="[^"]*MjjYud[^"]*"[^>]*>`)
	// googleResultURLRe captures a redirect link of the form
	// <a href="/url?q=ACTUAL_URL&amp;sa=...">; the capture stops before the
	// first '&' or '"'.
	googleResultURLRe = regexp.MustCompile(`<a[^>]+href="(/url\?q=[^"&]+)`)
	// googleResultTitleRe captures the text of the title span.
	googleResultTitleRe = regexp.MustCompile(`<span[^>]*class="[^"]*qrStP[^"]*"[^>]*>([^<]+)</span>`)
	// googleResultLinkTitleRe is the title fallback: the leading text of an
	// <a role="link"> element.
	googleResultLinkTitleRe = regexp.MustCompile(`<a[^>]+role="link"[^>]*>([^<]+)<`)
)

// parseGoogleResults extracts search results from Google's HTML.
// Uses the same selectors as SearXNG: div.MjjYud for result containers.
//
// Go's regexp package (RE2) has no lookahead, so containers are not matched
// with a single expression. Instead every MjjYud opening tag is located and
// the text between one opening tag and the next (or the end of the document
// for the last one) is treated as that result's block.
//
// A block is skipped when it has no /url?q= link or when the decoded target
// does not start with "http". The title falls back to query when no title
// markup is found. Score decreases with position: the first container gets
// the number of containers found, the last gets 1. The returned slice is nil
// when nothing was extracted.
func parseGoogleResults(body, query string) []contracts.MainResult {
	var results []contracts.MainResult

	opens := googleResultOpenRe.FindAllStringIndex(body, -1)
	for i, loc := range opens {
		// The block runs from just after this opening tag up to the start
		// of the next container.
		end := len(body)
		if i+1 < len(opens) {
			end = opens[i+1][0]
		}
		block := body[loc[1]:end]

		urlMatch := googleResultURLRe.FindStringSubmatch(block)
		if urlMatch == nil {
			continue
		}
		// Remove /url?q= prefix and decode. If decoding fails the raw value
		// is kept and validated as-is.
		actualURL := strings.TrimPrefix(urlMatch[1], "/url?q=")
		if decoded, err := url.QueryUnescape(actualURL); err == nil {
			actualURL = decoded
		}
		// Also rejects the empty string.
		if !strings.HasPrefix(actualURL, "http") {
			continue
		}

		title := query
		if m := googleResultTitleRe.FindStringSubmatch(block); m != nil {
			title = stripTags(m[1])
		} else if m := googleResultLinkTitleRe.FindStringSubmatch(block); m != nil {
			title = stripTags(m[1])
		}

		// Extract snippet from data-sncf divs (SearXNG's approach).
		snippet := extractGoogleSnippet(block)

		// Fresh variable per result so every MainResult points at its own URL.
		target := actualURL
		results = append(results, contracts.MainResult{
			Title:    title,
			URL:      &target,
			Content:  snippet,
			Engine:   "google",
			Score:    float64(len(opens) - i),
			Category: "general",
			Engines:  []string{"google"},
			Template: "default.html",
		})
	}
	return results
}
// googleSnippetRe matches a div whose data-sncf attribute contains "1" and
// captures its content up to the first closing </div>. It mirrors SearXNG's
// .//div[contains(@data-sncf, "1")]. The (?s) flag lets the capture span
// line breaks.
var googleSnippetRe = regexp.MustCompile(`(?s)<div[^>]+data-sncf="[^"]*1[^"]*"[^>]*>(.*?)</div>`)

// extractGoogleSnippet extracts the snippet text from a Google result block.
//
// Every matching data-sncf div contributes its tag-stripped text; non-empty
// parts are joined with a single space. The result is "" when the block has
// no snippet div. Because the capture ends at the first </div>, a snippet
// that itself contains nested divs is cut at the innermost closing tag.
func extractGoogleSnippet(block string) string {
	var parts []string
	for _, m := range googleSnippetRe.FindAllStringSubmatch(block, -1) {
		if text := stripTags(m[1]); text != "" {
			parts = append(parts, text)
		}
	}
	return strings.Join(parts, " ")
}
// googleSuggestionRe captures the text of the first anchor that follows the
// opening tag of a div whose class contains "ouy7Mc"
// (SearXNG xpath: //div[contains(@class, "ouy7Mc")]//a).
var googleSuggestionRe = regexp.MustCompile(`(?s)<div[^>]*class="[^"]*ouy7Mc[^"]*"[^>]*>.*?<a[^>]*>([^<]+)</a>`)

// extractGoogleSuggestions extracts search suggestions from Google result cards.
//
// Suggestions are returned in document order with duplicates and blank
// entries removed. The result is always non-nil: when nothing matches, an
// empty slice is returned, consistent with the empty []string{} that Search
// uses for the same field on its blocked-response path.
func extractGoogleSuggestions(body string) []string {
	suggestions := []string{}
	seen := make(map[string]struct{})
	for _, m := range googleSuggestionRe.FindAllStringSubmatch(body, -1) {
		s := strings.TrimSpace(stripTags(m[1]))
		if s == "" {
			continue
		}
		if _, dup := seen[s]; dup {
			continue
		}
		seen[s] = struct{}{}
		suggestions = append(suggestions, s)
	}
	return suggestions
}
// googleHL derives the value of Google's hl (host language) parameter from
// the requested locale. The locale is trimmed and lower-cased; an empty
// locale or "auto" falls back to "en".
// e.g. "en-US" -> "en-us", "" -> "en"
func googleHL(lang string) string {
	switch normalized := strings.ToLower(strings.TrimSpace(lang)); normalized {
	case "", "auto":
		return "en"
	default:
		return normalized
	}
}
// googleUILanguage maps the requested language to Google's lr (language
// restrict) parameter.
// e.g. "en" -> "lang_en", "de-AT" -> "lang_de", "zh-TW" -> "lang_zh-TW"
//
// The input is trimmed and matched case-insensitively, in line with googleHL.
// An empty language or "auto" yields "", meaning no restriction. Chinese is
// the one case where the region is kept: Google's lr values distinguish
// lang_zh-CN (Simplified) from lang_zh-TW (Traditional), so collapsing them
// to the bare language code would lose the restriction. Every other locale
// is reduced to the part before the first '-'.
func googleUILanguage(lang string) string {
	lang = strings.ToLower(strings.TrimSpace(lang))
	switch lang {
	case "", "auto":
		return ""
	case "zh-cn":
		return "lang_zh-CN"
	case "zh-tw":
		return "lang_zh-TW"
	}

	base, _, _ := strings.Cut(lang, "-")
	if base == "" || base == "auto" {
		return ""
	}
	return "lang_" + base
}
// googleSafeSearchLevel translates the numeric safesearch setting into the
// value of Google's safe parameter: 0 is "off", 2 is "high", and everything
// else (including 1 and out-of-range values) is "medium".
func googleSafeSearchLevel(safesearch int) string {
	if safesearch == 0 {
		return "off"
	}
	if safesearch == 2 {
		return "high"
	}
	return "medium"
}
var (
	// stripTagsTagRe matches anything shaped like an HTML tag: '<' up to
	// the next '>'.
	stripTagsTagRe = regexp.MustCompile(`<[^>]*>`)
	// stripTagsEntities decodes the handful of entities stripTags supports
	// in a single left-to-right pass, so text produced by one replacement is
	// never decoded a second time.
	stripTagsEntities = strings.NewReplacer(
		"&amp;", "&",
		"&quot;", `"`,
		"&#39;", "'",
		"&nbsp;", " ",
	)
)

// stripTags removes HTML tags from a string.
//
// After the tags are removed, the entities &amp;, &quot;, &#39; and &nbsp;
// are decoded and surrounding whitespace is trimmed. Decoding happens exactly
// once per entity: "&amp;quot;" becomes the literal text "&quot;" rather than
// a double quote. Other entities are left untouched.
func stripTags(s string) string {
	s = stripTagsTagRe.ReplaceAllString(s, "")
	s = stripTagsEntities.Replace(s)
	return strings.TrimSpace(s)
}