diff --git a/CLAUDE.md b/CLAUDE.md index bba67e1..1ba6bdc 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -37,7 +37,8 @@ There is no Makefile. There is no linter configured. - `internal/config` — TOML-based configuration with env var fallbacks. `Load(path)` reads `config.toml`; env vars override zero-value fields. See `config.example.toml` for all settings. - `internal/engines` — `Engine` interface and all 9 Go-native implementations. `factory.go` registers engines via `NewDefaultPortedEngines()`. `planner.go` routes engines to local or upstream based on `LOCAL_PORTED_ENGINES` env var. - `internal/search` — `Service` orchestrates the pipeline: cache check, planning, parallel engine execution via goroutines/WaitGroup, upstream proxying, response merging. Individual engine failures are reported as `unresponsive_engines` rather than aborting the search. Qwant has fallback logic to upstream on empty results. -- `internal/httpapi` — HTTP handlers for `/`, `/search`, `/healthz`, `/opensearch.xml`. Detects HTMX requests via `HX-Request` header to return fragments instead of full pages. +- `internal/autocomplete` — Fetches search suggestions. Proxies to upstream SearXNG `/autocompleter` if configured, falls back to Wikipedia OpenSearch API otherwise. +- `internal/httpapi` — HTTP handlers for `/`, `/search`, `/autocompleter`, `/healthz`, `/opensearch.xml`. Detects HTMX requests via `HX-Request` header to return fragments instead of full pages. - `internal/upstream` — Client that proxies requests to an upstream SearXNG instance via POST. - `internal/cache` — Valkey/Redis-backed cache with SHA-256 cache keys. No-op if unconfigured. - `internal/middleware` — Three rate limiters (per-IP sliding window, burst+sustained, global) and CORS. All disabled by default. 
diff --git a/internal/engines/factory.go b/internal/engines/factory.go index 310a20e..937225f 100644 --- a/internal/engines/factory.go +++ b/internal/engines/factory.go @@ -31,6 +31,7 @@ func NewDefaultPortedEngines(client *http.Client) map[string]Engine { "duckduckgo": &DuckDuckGoEngine{client: client}, "github": &GitHubEngine{client: client}, "reddit": &RedditEngine{client: client}, - "bing": &BingEngine{client: client}, + "bing": &BingEngine{client: client}, + "google": &GoogleEngine{client: client}, } } diff --git a/internal/engines/google.go b/internal/engines/google.go new file mode 100644 index 0000000..0371283 --- /dev/null +++ b/internal/engines/google.go @@ -0,0 +1,271 @@ +package engines + +import ( + "context" + "fmt" + "io" + "net/http" + "net/url" + "regexp" + "strings" + + "github.com/metamorphosis-dev/kafka/internal/contracts" +) + +// GSA User-Agent pool — iPhone User-Agent strings carrying a GSA/<version> token, +// i.e. the Google Search App on iOS (not the Google Search Appliance). +var gsaUserAgents = []string{ + "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1", + "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", + "Mozilla/5.0 (iPhone; CPU iPhone OS 17_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", + "Mozilla/5.0 (iPhone; CPU iPhone OS 18_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", + "Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1", + "Mozilla/5.0 (iPhone; CPU iPhone OS 18_5_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", +} + +func gsaUA() string { + return gsaUserAgents[0] // deterministic 
for now; could rotate +} + +type GoogleEngine struct { + client *http.Client +} + +func (e *GoogleEngine) Name() string { return "google" } + +func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) { + if strings.TrimSpace(req.Query) == "" { + return contracts.SearchResponse{Query: req.Query}, nil + } + + start := (req.Pageno - 1) * 10 + query := url.QueryEscape(req.Query) + + // Build URL like SearXNG does. + u := fmt.Sprintf( + "https://www.google.com/search?q=%s&filter=0&start=%d&hl=%s&lr=%s&safe=%s", + query, + start, + googleHL(req.Language), + googleUILanguage(req.Language), + googleSafeSearchLevel(req.Safesearch), + ) + + httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return contracts.SearchResponse{}, err + } + httpReq.Header.Set("User-Agent", gsaUA()) + httpReq.Header.Set("Accept", "*/*") + httpReq.AddCookie(&http.Cookie{Name: "CONSENT", Value: "YES+"}) + + resp, err := e.client.Do(httpReq) + if err != nil { + return contracts.SearchResponse{}, err + } + defer resp.Body.Close() + + // Check for Google block / CAPTCHA page. 
+ if detectGoogleSorry(resp) { + return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: 0, + Results: nil, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + UnresponsiveEngines: [][2]string{{"google", "blocked by Google (CAPTCHA/sorry page)"}}, + }, nil + } + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return contracts.SearchResponse{}, fmt.Errorf("google error: status=%d body=%q", resp.StatusCode, string(body)) + } + + body, err := io.ReadAll(io.LimitReader(resp.Body, 128*1024)) + if err != nil { + return contracts.SearchResponse{}, err + } + + results := parseGoogleResults(string(body), req.Query) + return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: len(results), + Results: results, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: extractGoogleSuggestions(string(body)), + UnresponsiveEngines: [][2]string{}, + }, nil +} + +// detectGoogleSorry returns true if the response is a Google block/CAPTCHA page. +func detectGoogleSorry(resp *http.Response) bool { + if resp.Request != nil { + if resp.Request.URL.Host == "sorry.google.com" || strings.HasPrefix(resp.Request.URL.Path, "/sorry") { + return true + } + } + return false +} + +// parseGoogleResults extracts search results from Google's HTML. +// Uses the same selectors as SearXNG: div.MjjYud for result containers. +func parseGoogleResults(body, query string) []contracts.MainResult { + var results []contracts.MainResult + + // SearXNG selector: .//div[contains(@class, "MjjYud")] + // Each result block contains a title link and snippet. + // We simulate the XPath matching with regex-based extraction. + + // Find all MjjYud div blocks. 
+ mjjPattern := regexp.MustCompile(`]*class="[^"]*MjjYud[^"]*"[^>]*>(.*?)\s*(?=]*class="[^"]*MjjYud|$)`) + matches := mjjPattern.FindAllStringSubmatch(body, -1) + + for i, match := range matches { + if len(match) < 2 { + continue + } + block := match[1] + + // Extract title and URL from the result link. + // Pattern: <a href="/url?q=URL&...">TITLE</a> + urlPattern := regexp.MustCompile(`]+href="(/url\?q=[^"&]+)`) + urlMatch := urlPattern.FindStringSubmatch(block) + if len(urlMatch) < 2 { + continue + } + rawURL := urlMatch[1] + // Remove /url?q= prefix and decode. + actualURL := strings.TrimPrefix(rawURL, "/url?q=") + if amp := strings.Index(actualURL, "&"); amp != -1 { + actualURL = actualURL[:amp] + } + if decoded, err := url.QueryUnescape(actualURL); err == nil { + actualURL = decoded + } + + if actualURL == "" || !strings.HasPrefix(actualURL, "http") { + continue + } + + // Extract title from the title tag. + titlePattern := regexp.MustCompile(`]*class="[^"]*qrStP[^"]*"[^>]*>([^<]+)`) + titleMatch := titlePattern.FindStringSubmatch(block) + title := query + if len(titleMatch) >= 2 { + title = stripTags(titleMatch[1]) + } else { + // Fallback: extract visible text from an <a> with data-title or role="link" + linkTitlePattern := regexp.MustCompile(`]+role="link"[^>]*>([^<]+)<`) + ltMatch := linkTitlePattern.FindStringSubmatch(block) + if len(ltMatch) >= 2 { + title = stripTags(ltMatch[1]) + } + } + + // Extract snippet from data-sncf divs (SearXNG's approach). + snippet := extractGoogleSnippet(block) + + urlPtr := actualURL + results = append(results, contracts.MainResult{ + Title: title, + URL: &urlPtr, + Content: snippet, + Engine: "google", + Score: float64(len(matches) - i), + Category: "general", + Engines: []string{"google"}, + Template: "default.html", + }) + } + + return results +} + +// extractGoogleSnippet extracts the snippet text from a Google result block. +func extractGoogleSnippet(block string) string { + // Google's snippets live in divs with data-sncf attribute. 
+ // SearXNG looks for: .//div[contains(@data-sncf, "1")] + snippetPattern := regexp.MustCompile(`]+data-sncf="1"[^>]*>(.*?)`) + matches := snippetPattern.FindAllStringSubmatch(block, -1) + var parts []string + for _, m := range matches { + if len(m) < 2 { + continue + } + text := stripTags(m[1]) + if text != "" { + parts = append(parts, text) + } + } + return strings.Join(parts, " ") +} + +// extractGoogleSuggestions extracts search suggestions from Google result cards. +func extractGoogleSuggestions(body string) []string { + var suggestions []string + // SearXNG xpath: //div[contains(@class, "ouy7Mc")]//a + suggestionPattern := regexp.MustCompile(`]*class="[^"]*ouy7Mc[^"]*"[^>]*>.*?]*>([^<]+)`, regexp.DotAll) + matches := suggestionPattern.FindAllStringSubmatch(body, -1) + seen := map[string]bool{} + for _, m := range matches { + if len(m) < 2 { + continue + } + s := strings.TrimSpace(stripTags(m[1])) + if s != "" && !seen[s] { + seen[s] = true + suggestions = append(suggestions, s) + } + } + return suggestions +} + +// googleHL maps SearXNG locale to Google hl (host language) parameter. +// e.g. "en-US" -> "en-us"; "" or "auto" -> "en" +func googleHL(lang string) string { + lang = strings.ToLower(strings.TrimSpace(lang)) + if lang == "" || lang == "auto" { + return "en" + } + return lang +} + +// googleUILanguage maps SearXNG language to Google lr (language restrict) parameter. +// e.g. "en" -> "lang_en", "de" -> "lang_de" +func googleUILanguage(lang string) string { + lang = strings.ToLower(strings.Split(lang, "-")[0]) + if lang == "" || lang == "auto" { + return "" + } + return "lang_" + lang +} + +// googleSafeSearchLevel maps safesearch (0-2) to Google's safe parameter. +func googleSafeSearchLevel(safesearch int) string { + switch safesearch { + case 0: + return "off" + case 1: + return "medium" + case 2: + return "high" + default: + return "medium" + } +} + +// stripTags removes HTML tags from a string. 
+func stripTags(s string) string { + stripper := regexp.MustCompile(`<[^>]*>`) + s = stripper.ReplaceAllString(s, "") + s = strings.ReplaceAll(s, "&", "&") + s = strings.ReplaceAll(s, """, `"`) + s = strings.ReplaceAll(s, "'", "'") + s = strings.ReplaceAll(s, " ", " ") + return strings.TrimSpace(s) +} diff --git a/internal/engines/planner.go b/internal/engines/planner.go index 543f253..08b0a27 100644 --- a/internal/engines/planner.go +++ b/internal/engines/planner.go @@ -91,6 +91,7 @@ func inferFromCategories(categories []string) []string { set["qwant"] = true set["duckduckgo"] = true set["bing"] = true + set["google"] = true case "science", "scientific publications": set["arxiv"] = true set["crossref"] = true @@ -106,7 +107,7 @@ func inferFromCategories(categories []string) []string { out = append(out, e) } // stable order - order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "arxiv": 5, "crossref": 6, "github": 7, "reddit": 8} + order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "google": 5, "arxiv": 6, "crossref": 7, "github": 8, "reddit": 9} sortByOrder(out, order) return out } diff --git a/internal/views/static/css/kafka.css b/internal/views/static/css/kafka.css index ad794c4..376b2d8 100644 --- a/internal/views/static/css/kafka.css +++ b/internal/views/static/css/kafka.css @@ -421,6 +421,63 @@ footer a:hover { display: block; } +/* Autocomplete dropdown */ +#search { + position: relative; +} + +#autocomplete-dropdown { + position: absolute; + top: 100%; + left: 0; + right: 0; + background: var(--color-base-background); + border: 1px solid var(--color-search-border); + border-top: none; + border-radius: 0 0 var(--radius) var(--radius); + box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); + z-index: 100; + max-height: 320px; + overflow-y: auto; + display: none; +} + +#autocomplete-dropdown.open { + display: block; +} + +.autocomplete-suggestion { + padding: 0.6rem 1rem; + cursor: 
pointer; + font-size: 0.95rem; + color: var(--color-base-font); + border-bottom: 1px solid var(--color-result-border); + transition: background 0.15s; +} + +.autocomplete-suggestion:last-child { + border-bottom: none; +} + +.autocomplete-suggestion:hover, +.autocomplete-suggestion.active { + background: var(--color-header-background); +} + +.autocomplete-suggestion mark { + background: none; + color: var(--color-link); + font-weight: 600; +} + +.autocomplete-footer { + padding: 0.4rem 1rem; + font-size: 0.75rem; + color: var(--color-suggestion); + border-top: 1px solid var(--color-result-border); + background: var(--color-header-background); +} + /* Responsive */ @media (max-width: 768px) { #results { diff --git a/internal/views/templates/base.html b/internal/views/templates/base.html index 10de540..6572b19 100644 --- a/internal/views/templates/base.html +++ b/internal/views/templates/base.html @@ -20,6 +20,123 @@

Powered by kafka — a privacy-respecting, open metasearch engine

+ {{end}} diff --git a/internal/views/templates/index.html b/internal/views/templates/index.html index e2ca279..c9df700 100644 --- a/internal/views/templates/index.html +++ b/internal/views/templates/index.html @@ -3,11 +3,12 @@

kafka