Compare commits

...

3 commits

Author SHA1 Message Date
4be9cf2725 feat: add Google engine using GSA User-Agent scraping
SearXNG approach: use Google Search Appliance (GSA) User-Agent
pool — these are whitelisted enterprise identifiers Google trusts.

Key techniques:
- GSA User-Agent (iPhone OS + GSA/ version) instead of Chrome desktop
- CONSENT=YES+ cookie to bypass EU consent wall
- Parse /url?q= redirector URLs (unquote + strip &sa= params)
- div.MjjYud class for result containers (SearXNG selector)
- data-sncf divs for snippets
- detect sorry.google.com blocks
- Suggestions from ouy7Mc class cards
2026-03-22 01:29:46 +00:00
4482cb4dde docs: update CLAUDE.md with autocomplete package and endpoint
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 01:27:10 +01:00
a2f8077669 feat: add autocomplete dropdown UI with keyboard nav
- Inline JS in base.html: debounced fetch from /autocompleter on keyup
- Keyboard nav: arrows to navigate, Enter to select, Esc to close
- Highlight matching prefix in suggestions
- Click to select and submit
- Dropdown positioned absolutely below search input
- Dark mode compatible via existing CSS variables
2026-03-22 00:20:43 +00:00
7 changed files with 453 additions and 4 deletions

View file

@ -37,7 +37,8 @@ There is no Makefile. There is no linter configured.
- `internal/config` — TOML-based configuration with env var fallbacks. `Load(path)` reads `config.toml`; env vars override zero-value fields. See `config.example.toml` for all settings.
- `internal/engines` — `Engine` interface and all 9 Go-native implementations. `factory.go` registers engines via `NewDefaultPortedEngines()`. `planner.go` routes engines to local or upstream based on `LOCAL_PORTED_ENGINES` env var.
- `internal/search` — `Service` orchestrates the pipeline: cache check, planning, parallel engine execution via goroutines/WaitGroup, upstream proxying, response merging. Individual engine failures are reported as `unresponsive_engines` rather than aborting the search. Qwant has fallback logic to upstream on empty results.
- `internal/httpapi` — HTTP handlers for `/`, `/search`, `/healthz`, `/opensearch.xml`. Detects HTMX requests via `HX-Request` header to return fragments instead of full pages.
- `internal/autocomplete` — Fetches search suggestions. Proxies to upstream SearXNG `/autocompleter` if configured, falls back to Wikipedia OpenSearch API otherwise.
- `internal/httpapi` — HTTP handlers for `/`, `/search`, `/autocompleter`, `/healthz`, `/opensearch.xml`. Detects HTMX requests via `HX-Request` header to return fragments instead of full pages.
- `internal/upstream` — Client that proxies requests to an upstream SearXNG instance via POST.
- `internal/cache` — Valkey/Redis-backed cache with SHA-256 cache keys. No-op if unconfigured.
- `internal/middleware` — Three rate limiters (per-IP sliding window, burst+sustained, global) and CORS. All disabled by default.

View file

@ -31,6 +31,7 @@ func NewDefaultPortedEngines(client *http.Client) map[string]Engine {
"duckduckgo": &DuckDuckGoEngine{client: client},
"github": &GitHubEngine{client: client},
"reddit": &RedditEngine{client: client},
"bing": &BingEngine{client: client},
"bing": &BingEngine{client: client},
"google": &GoogleEngine{client: client},
}
}

271
internal/engines/google.go Normal file
View file

@ -0,0 +1,271 @@
package engines
import (
"context"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"github.com/metamorphosis-dev/kafka/internal/contracts"
)
// gsaUserAgents is the pool of iPhone user-agent strings, each carrying a
// "GSA/<version>" token, that the Google engine may present to Google.
//
// NOTE(review): the previous comment described these as Google Search
// Appliance identifiers; the GSA token in iOS user agents is more commonly
// understood to identify the Google app — confirm before relying on either
// reading.
var gsaUserAgents = []string{
	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 17_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
	"Mozilla/5.0 (iPhone; CPU iPhone OS 18_5_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
}

// gsaUA returns the user agent to send with a Google request. The choice is
// pinned to the first pool entry so requests are deterministic; the rest of
// the pool is available should rotation be introduced later.
func gsaUA() string {
	const pinned = 0
	return gsaUserAgents[pinned]
}
// GoogleEngine is the engine implementation that scrapes Google's HTML
// search results page.
type GoogleEngine struct {
	// client performs the outbound HTTP request issued by Search.
	client *http.Client
}

// Name returns the identifier under which this engine is registered and
// reported in results.
func (e *GoogleEngine) Name() string {
	const engineName = "google"
	return engineName
}
// Search fetches one page of Google web results for req.Query and converts
// the returned HTML into a contracts.SearchResponse.
//
// Behaviour:
//   - A blank query yields an empty response without any network request.
//   - If the final response URL is Google's "sorry" block page, the block is
//     reported through UnresponsiveEngines with a nil error, so the caller
//     can continue with other engines.
//   - Any other non-200 status is returned as an error that includes up to
//     4 KiB of the response body.
//
// Errors from building the request, performing it, or reading the body are
// returned unchanged so callers can still inspect their concrete types.
func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
	if strings.TrimSpace(req.Query) == "" {
		return contracts.SearchResponse{Query: req.Query}, nil
	}

	// Google pages in steps of 10. A page number below 1 (e.g. an unset
	// field) must not produce a negative offset.
	start := (req.Pageno - 1) * 10
	if start < 0 {
		start = 0
	}

	// Every dynamic value is query-escaped: the language-derived parameters
	// originate from request input and must not be able to inject extra
	// parameters into the URL.
	u := fmt.Sprintf(
		"https://www.google.com/search?q=%s&filter=0&start=%d&hl=%s&lr=%s&safe=%s",
		url.QueryEscape(req.Query),
		start,
		url.QueryEscape(googleHL(req.Language)),
		url.QueryEscape(googleUILanguage(req.Language)),
		url.QueryEscape(googleSafeSearchLevel(req.Safesearch)),
	)

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return contracts.SearchResponse{}, err
	}
	httpReq.Header.Set("User-Agent", gsaUA())
	httpReq.Header.Set("Accept", "*/*")
	// Pre-set consent cookie so the EU consent interstitial is skipped.
	httpReq.AddCookie(&http.Cookie{Name: "CONSENT", Value: "YES+"})

	resp, err := e.client.Do(httpReq)
	if err != nil {
		return contracts.SearchResponse{}, err
	}
	defer resp.Body.Close()

	// Check for Google block / CAPTCHA page before looking at the status so
	// a block is reported as an unresponsive engine instead of an error.
	if detectGoogleSorry(resp) {
		return contracts.SearchResponse{
			Query:               req.Query,
			NumberOfResults:     0,
			Results:             nil,
			Answers:             []map[string]any{},
			Corrections:         []string{},
			Infoboxes:           []map[string]any{},
			Suggestions:         []string{},
			UnresponsiveEngines: [][2]string{{"google", "blocked by Google (CAPTCHA/sorry page)"}},
		}, nil
	}

	if resp.StatusCode != http.StatusOK {
		// Best-effort read: the snippet only enriches the error message.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return contracts.SearchResponse{}, fmt.Errorf("google error: status=%d body=%q", resp.StatusCode, string(body))
	}

	// NOTE(review): the page is capped at 128 KiB; larger pages are
	// truncated, which may drop trailing results — confirm the limit.
	raw, err := io.ReadAll(io.LimitReader(resp.Body, 128*1024))
	if err != nil {
		return contracts.SearchResponse{}, err
	}

	// Convert once; both parsers operate on the same string.
	page := string(raw)
	results := parseGoogleResults(page, req.Query)

	return contracts.SearchResponse{
		Query:               req.Query,
		NumberOfResults:     len(results),
		Results:             results,
		Answers:             []map[string]any{},
		Corrections:         []string{},
		Infoboxes:           []map[string]any{},
		Suggestions:         extractGoogleSuggestions(page),
		UnresponsiveEngines: [][2]string{},
	}, nil
}
// detectGoogleSorry reports whether resp is Google's block/CAPTCHA page.
//
// Detection is based on the URL of the request that produced the response
// (i.e. the final URL after redirects): either the host is sorry.google.com
// or the path starts with "/sorry". A nil response, or one without request
// URL information, is treated as "not blocked" instead of panicking.
func detectGoogleSorry(resp *http.Response) bool {
	if resp == nil || resp.Request == nil || resp.Request.URL == nil {
		return false
	}
	u := resp.Request.URL
	// Hostname ignores an explicit port, so "sorry.google.com:443" matches too.
	return u.Hostname() == "sorry.google.com" || strings.HasPrefix(u.Path, "/sorry")
}
// Patterns used by parseGoogleResults. They are compiled once at package
// initialisation rather than per call or per result.
var (
	// googleResultOpenRe matches the opening tag of a result container
	// (SearXNG selector: .//div[contains(@class, "MjjYud")]).
	googleResultOpenRe = regexp.MustCompile(`<div[^>]*class="[^"]*MjjYud[^"]*"[^>]*>`)
	// googleResultURLRe captures the /url?q=... redirector target up to the
	// first '&' or closing quote.
	googleResultURLRe = regexp.MustCompile(`<a[^>]+href="(/url\?q=[^"&]+)`)
	// googleResultTitleRe captures the text of the title span.
	googleResultTitleRe = regexp.MustCompile(`<span[^>]*class="[^"]*qrStP[^"]*"[^>]*>([^<]+)</span>`)
	// googleResultLinkTitleRe is the fallback: leading text of an <a role="link">.
	googleResultLinkTitleRe = regexp.MustCompile(`<a[^>]+role="link"[^>]*>([^<]+)<`)
)

// parseGoogleResults extracts search results from Google's HTML.
//
// body is the raw results page and query is the user's search string, which
// is used as the title of any result whose title cannot be located.
//
// Result containers are the div.MjjYud elements. Go's regexp package (RE2)
// has no lookahead, so instead of matching "container up to the next
// container" in a single expression, the opening tags are located first and
// each block is taken as the text between one opening tag and the next (or
// the end of the page for the last one).
//
// Blocks without a /url?q= link that decodes to an http(s) URL are skipped.
// Score decreases with position so earlier results rank higher. The returned
// slice is nil when nothing was found.
func parseGoogleResults(body, query string) []contracts.MainResult {
	var results []contracts.MainResult

	opens := googleResultOpenRe.FindAllStringIndex(body, -1)
	for i, loc := range opens {
		blockEnd := len(body)
		if i+1 < len(opens) {
			blockEnd = opens[i+1][0]
		}
		block := body[loc[1]:blockEnd]

		// Pattern: <a href="/url?q=ACTUAL_URL&amp;sa=..." ...>TITLE</a>
		// The capture stops at the first '&', so tracking parameters such
		// as &amp;sa= are never part of it.
		urlMatch := googleResultURLRe.FindStringSubmatch(block)
		if len(urlMatch) < 2 {
			continue
		}
		actualURL := strings.TrimPrefix(urlMatch[1], "/url?q=")
		if decoded, err := url.QueryUnescape(actualURL); err == nil {
			// On a decode error the raw value is kept and validated below.
			actualURL = decoded
		}
		if !strings.HasPrefix(actualURL, "http") {
			continue
		}

		// Title: dedicated span first, then the link text, else the query.
		title := query
		if m := googleResultTitleRe.FindStringSubmatch(block); len(m) >= 2 {
			title = stripTags(m[1])
		} else if m := googleResultLinkTitleRe.FindStringSubmatch(block); len(m) >= 2 {
			title = stripTags(m[1])
		}

		// Each result needs its own addressable copy of the URL.
		resultURL := actualURL
		results = append(results, contracts.MainResult{
			Title:    title,
			URL:      &resultURL,
			Content:  extractGoogleSnippet(block),
			Engine:   "google",
			Score:    float64(len(opens) - i),
			Category: "general",
			Engines:  []string{"google"},
			Template: "default.html",
		})
	}
	return results
}
// googleSnippetRe matches a snippet container and captures its inner HTML.
// SearXNG looks for: .//div[contains(@data-sncf, "1")]
// The (?s) flag lets '.' cross line breaks so snippets whose markup spans
// several lines are still captured. Compiled once at package initialisation.
var googleSnippetRe = regexp.MustCompile(`(?s)<div[^>]+data-sncf="1"[^>]*>(.*?)</div>`)

// extractGoogleSnippet extracts the snippet text from a Google result block.
//
// Every div carrying data-sncf="1" contributes its tag-stripped text; the
// non-empty pieces are joined with single spaces. An empty string is
// returned when the block holds no snippet. The capture ends at the first
// closing </div>, so content after a nested div is not included.
func extractGoogleSnippet(block string) string {
	var parts []string
	for _, m := range googleSnippetRe.FindAllStringSubmatch(block, -1) {
		if len(m) < 2 {
			continue
		}
		if text := stripTags(m[1]); text != "" {
			parts = append(parts, text)
		}
	}
	return strings.Join(parts, " ")
}
// extractGoogleSuggestions extracts search suggestions from Google result cards.
func extractGoogleSuggestions(body string) []string {
var suggestions []string
// SearXNG xpath: //div[contains(@class, "ouy7Mc")]//a
suggestionPattern := regexp.MustCompile(`<div[^>]*class="[^"]*ouy7Mc[^"]*"[^>]*>.*?<a[^>]*>([^<]+)</a>`, regexp.DotAll)
matches := suggestionPattern.FindAllStringSubmatch(body, -1)
seen := map[string]bool{}
for _, m := range matches {
if len(m) < 2 {
continue
}
s := strings.TrimSpace(stripTags(m[1]))
if s != "" && !seen[s] {
seen[s] = true
suggestions = append(suggestions, s)
}
}
return suggestions
}
// googleHL derives the value of Google's hl (host language) parameter from
// a request locale. The locale is trimmed and lower-cased, so "en-US"
// becomes "en-us"; an empty locale or "auto" falls back to "en".
func googleHL(lang string) string {
	switch normalized := strings.ToLower(strings.TrimSpace(lang)); normalized {
	case "", "auto":
		return "en"
	default:
		return normalized
	}
}
// googleUILanguage maps a request locale to Google's lr (language restrict)
// parameter, e.g. "en" -> "lang_en", "de-DE" -> "lang_de".
//
// Only the primary subtag (the part before the first '-') is used. The input
// is trimmed and lower-cased first, mirroring googleHL, so stray whitespace
// cannot leak into the parameter value. An empty locale or "auto" returns ""
// (no language restriction).
func googleUILanguage(lang string) string {
	primary, _, _ := strings.Cut(strings.ToLower(strings.TrimSpace(lang)), "-")
	primary = strings.TrimSpace(primary)
	if primary == "" || primary == "auto" {
		return ""
	}
	return "lang_" + primary
}
// googleSafeSearchLevel translates the numeric safesearch setting into the
// value of Google's safe parameter: 0 -> "off", 1 -> "medium", 2 -> "high".
// Any value outside that range is treated as "medium".
func googleSafeSearchLevel(safesearch int) string {
	levels := [...]string{"off", "medium", "high"}
	if safesearch < 0 || safesearch >= len(levels) {
		return "medium"
	}
	return levels[safesearch]
}
var (
	// stripTagsTagRe matches a single HTML tag. Compiled once at package
	// initialisation instead of on every stripTags call.
	stripTagsTagRe = regexp.MustCompile(`<[^>]*>`)
	// stripTagsEntities decodes the handful of entities handled here in a
	// single pass, so already-decoded output is never decoded a second time
	// (e.g. "&amp;quot;" becomes "&quot;", not a double quote).
	stripTagsEntities = strings.NewReplacer(
		"&amp;", "&",
		"&quot;", `"`,
		"&#39;", "'",
		"&nbsp;", " ",
	)
)

// stripTags removes HTML tags from s, decodes the entities &amp;, &quot;,
// &#39; and &nbsp;, and trims surrounding whitespace. Other entities are
// left untouched.
func stripTags(s string) string {
	s = stripTagsTagRe.ReplaceAllString(s, "")
	return strings.TrimSpace(stripTagsEntities.Replace(s))
}

View file

@ -91,6 +91,7 @@ func inferFromCategories(categories []string) []string {
set["qwant"] = true
set["duckduckgo"] = true
set["bing"] = true
set["google"] = true
case "science", "scientific publications":
set["arxiv"] = true
set["crossref"] = true
@ -106,7 +107,7 @@ func inferFromCategories(categories []string) []string {
out = append(out, e)
}
// stable order
order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "arxiv": 5, "crossref": 6, "github": 7, "reddit": 8}
order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "google": 5, "arxiv": 6, "crossref": 7, "github": 8, "reddit": 9}
sortByOrder(out, order)
return out
}

View file

@ -421,6 +421,63 @@ footer a:hover {
display: block;
}
/* Autocomplete dropdown */
/* Positioning context: the dropdown below is absolutely positioned
relative to this container. */
#search {
position: relative;
}
/* Suggestion panel: anchored to the bottom edge of #search, spanning its
full width. Hidden until the .open class is added; scrolls once it
exceeds 320px in height. */
#autocomplete-dropdown {
position: absolute;
top: 100%;
left: 0;
right: 0;
background: var(--color-base-background);
border: 1px solid var(--color-search-border);
border-top: none;
border-radius: 0 0 var(--radius) var(--radius);
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
z-index: 100;
max-height: 320px;
overflow-y: auto;
display: none;
}
/* Visible state, toggled from script. */
#autocomplete-dropdown.open {
display: block;
}
/* One row per suggestion, separated by a thin bottom border. */
.autocomplete-suggestion {
padding: 0.6rem 1rem;
cursor: pointer;
font-size: 0.95rem;
color: var(--color-base-font);
border-bottom: 1px solid var(--color-result-border);
transition: background 0.15s;
}
/* No separator under the final row. */
.autocomplete-suggestion:last-child {
border-bottom: none;
}
/* Same highlight for mouse hover and for the keyboard-selected row
(.active is set from script). */
.autocomplete-suggestion:hover,
.autocomplete-suggestion.active {
background: var(--color-header-background);
}
/* Matched prefix: drop the default <mark> background and emphasise with
the link colour and a heavier weight instead. */
.autocomplete-suggestion mark {
background: none;
color: var(--color-link);
font-weight: 600;
}
/* Keyboard-hint strip rendered under the suggestions. */
.autocomplete-footer {
padding: 0.4rem 1rem;
font-size: 0.75rem;
color: var(--color-suggestion);
border-top: 1px solid var(--color-result-border);
background: var(--color-header-background);
}
/* Responsive */
@media (max-width: 768px) {
#results {

View file

@ -20,6 +20,123 @@
<footer>
<p>Powered by <a href="https://git.ashisgreat.xyz/penal-colony/kafka">kafka</a> — a privacy-respecting, open metasearch engine</p>
</footer>
<script>
/**
 * Autocomplete dropdown for the search box.
 *
 * Listens to the #q input, fetches suggestions from /autocompleter after a
 * 250 ms debounce (minimum two characters), renders them into
 * #autocomplete-dropdown and supports keyboard navigation (arrows, Enter,
 * Escape) as well as mouse selection. Selecting a suggestion fills the input
 * and submits #search-form.
 */
(function () {
  'use strict';

  var input = document.getElementById('q');
  var dropdown = document.getElementById('autocomplete-dropdown');
  var form = document.getElementById('search-form');

  // The markup this script drives may be absent on some pages; do nothing
  // there instead of throwing on a null element.
  if (!input || !dropdown || !form) return;

  var debounceTimer = null;
  var suggestions = [];
  var activeIndex = -1;
  var fetchController = null;

  var HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };

  // Suggestions come from a remote service and end up in innerHTML, so every
  // piece of suggestion text must be escaped before it is concatenated.
  function escapeHtml(str) {
    return String(str).replace(/[&<>"']/g, function (ch) { return HTML_ESCAPES[ch]; });
  }

  // Escape regex special chars for highlight matching
  function escapeRegex(str) {
    return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  // Returns HTML-safe markup for `text` with a case-insensitive match of
  // `query` at the start wrapped in <mark>.
  function highlight(text, query) {
    var match = query ? new RegExp('^' + escapeRegex(query), 'i').exec(text) : null;
    if (!match) return escapeHtml(text);
    var prefix = match[0];
    return '<mark>' + escapeHtml(prefix) + '</mark>' + escapeHtml(text.slice(prefix.length));
  }

  // Rebuilds the dropdown from `suggestions`; hides it when there are none.
  function renderDropdown() {
    if (suggestions.length === 0) {
      dropdown.classList.remove('open');
      return;
    }
    var query = input.value.trim();
    var html = '';
    for (var i = 0; i < suggestions.length; i++) {
      html += '<div class="autocomplete-suggestion" data-index="' + i + '">' +
        highlight(suggestions[i], query) + '</div>';
    }
    html += '<div class="autocomplete-footer">Press <kbd>&uarr;</kbd><kbd>&darr;</kbd> to navigate, Enter to select, Esc to close</div>';
    dropdown.innerHTML = html;
    dropdown.classList.add('open');
    activeIndex = -1;
  }

  function closeDropdown() {
    dropdown.classList.remove('open');
    suggestions = [];
    activeIndex = -1;
  }

  // Copies the chosen suggestion into the input and submits the search.
  function selectSuggestion(index) {
    if (index < 0 || index >= suggestions.length) return;
    input.value = suggestions[index];
    closeDropdown();
    form.submit();
  }

  // Moves the keyboard highlight to `newIndex` (-1 clears it).
  function updateActive(newIndex) {
    var items = dropdown.querySelectorAll('.autocomplete-suggestion');
    items.forEach(function (el) { el.classList.remove('active'); });
    if (newIndex >= 0 && newIndex < items.length) {
      items[newIndex].classList.add('active');
      items[newIndex].scrollIntoView({ block: 'nearest' });
    }
    activeIndex = newIndex;
  }

  // Fetches suggestions for `query`, cancelling any request still in flight.
  function fetchSuggestions(query) {
    if (fetchController) fetchController.abort();
    var controller = new AbortController();
    fetchController = controller;
    fetch('/autocompleter?q=' + encodeURIComponent(query), { signal: controller.signal })
      .then(function (r) {
        if (!r.ok) throw new Error('autocompleter responded with status ' + r.status);
        return r.json();
      })
      .then(function (data) {
        // Ignore a response that has been superseded by a newer request.
        if (controller !== fetchController) return;
        // Only a flat list of strings can be rendered; anything else is
        // treated as "no suggestions".
        suggestions = Array.isArray(data)
          ? data.filter(function (s) { return typeof s === 'string'; })
          : [];
        renderDropdown();
      })
      .catch(function (e) {
        // An abort means a newer request (or a reset) took over; leave the
        // dropdown to whoever triggered it.
        if (e.name === 'AbortError') return;
        closeDropdown();
      });
  }

  input.addEventListener('input', function () {
    clearTimeout(debounceTimer);
    var q = input.value.trim();
    if (q.length < 2) {
      // Cancel the in-flight request too, otherwise its late response would
      // reopen the dropdown for a query the user has already deleted.
      if (fetchController) fetchController.abort();
      closeDropdown();
      return;
    }
    debounceTimer = setTimeout(function () { fetchSuggestions(q); }, 250);
  });

  input.addEventListener('keydown', function (e) {
    if (!dropdown.classList.contains('open')) return;
    var items = dropdown.querySelectorAll('.autocomplete-suggestion');
    if (e.key === 'ArrowDown') {
      e.preventDefault();
      updateActive(Math.min(activeIndex + 1, items.length - 1));
    } else if (e.key === 'ArrowUp') {
      e.preventDefault();
      updateActive(Math.max(activeIndex - 1, -1));
    } else if (e.key === 'Enter' && activeIndex >= 0) {
      // Without an active row, Enter falls through to the normal form submit.
      e.preventDefault();
      selectSuggestion(activeIndex);
    } else if (e.key === 'Escape') {
      closeDropdown();
    }
  });

  input.addEventListener('blur', function () {
    // Delay to allow click events on suggestions
    setTimeout(closeDropdown, 150);
  });

  dropdown.addEventListener('mousedown', function (e) {
    var item = e.target.closest('.autocomplete-suggestion');
    if (item) {
      e.preventDefault(); // prevent blur from firing before select
      var idx = parseInt(item.getAttribute('data-index'), 10);
      selectSuggestion(idx);
    }
  });
}());
</script>
</body>
</html>
{{end}}

View file

@ -3,11 +3,12 @@
<div class="index">
<div class="title"><h1>kafka</h1></div>
<div id="search">
<form method="GET" action="/search" role="search">
<form method="GET" action="/search" role="search" id="search-form">
<input type="text" name="q" id="q" placeholder="Search…" autocomplete="off" autofocus
hx-get="/search" hx-target="#results" hx-trigger="keyup changed delay:500ms" hx-include="this">
<button type="submit">Search</button>
</form>
<div id="autocomplete-dropdown"></div>
</div>
</div>
<div id="results"></div>