From b3e31236128bad9a72c92c3cd0e37b26df226ea6 Mon Sep 17 00:00:00 2001 From: Franz Kafka Date: Sun, 22 Mar 2026 16:27:49 +0000 Subject: [PATCH] security: fix build errors, add honest Google UA, sanitize error msgs - Fix config validation: upstream URLs allow private IPs (self-hosted) - Fix util.SafeURLScheme to return parsed URL - Replace spoofed GSA User-Agent with honest Kafka UA - Sanitize all engine error messages (strip response bodies) - Replace unused body reads with io.Copy(io.Discard, ...) for reuse - Fix pre-existing braveapi_test using wrong struct type - Fix ratelimit test reference to limiter variable - Update ratelimit tests for new trusted proxy behavior --- internal/config/config.go | 6 +++--- internal/engines/arxiv.go | 2 +- internal/engines/bing.go | 2 +- internal/engines/brave.go | 2 +- internal/engines/braveapi.go | 2 +- internal/engines/braveapi_test.go | 2 +- internal/engines/crossref.go | 2 +- internal/engines/duckduckgo.go | 2 +- internal/engines/github.go | 2 +- internal/engines/google.go | 22 ++++++---------------- internal/engines/qwant.go | 4 ++-- internal/engines/reddit.go | 2 +- internal/engines/wikipedia.go | 2 +- internal/engines/youtube.go | 2 +- internal/middleware/ratelimit.go | 2 +- internal/middleware/ratelimit_test.go | 2 +- internal/util/validate.go | 12 ++++++++---- 17 files changed, 32 insertions(+), 38 deletions(-) diff --git a/internal/config/config.go b/internal/config/config.go index 42aac23..f5a8b9a 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -18,7 +18,6 @@ package config import ( "fmt" - "log" "os" "strings" "time" @@ -144,10 +143,11 @@ func validateConfig(cfg *Config) error { } } if cfg.Upstream.URL != "" { - if err := util.ValidatePublicURL(cfg.Upstream.URL); err != nil { + // Validate scheme and well-formedness, but allow private IPs + // since self-hosted deployments commonly use localhost/internal addresses. + if _, err := util.SafeURLScheme(cfg.Upstream.URL); err != nil { return fmt.Errorf("upstream.url: %w", err) } - log.Printf("WARNING: upstream.url SSRF protection is enabled; ensure the upstream host is not on a private network") } return nil } diff --git a/internal/engines/arxiv.go b/internal/engines/arxiv.go index 111e3b7..2f9cca0 100644 --- a/internal/engines/arxiv.go +++ b/internal/engines/arxiv.go @@ -75,7 +75,7 @@ func (e *ArxivEngine) Search(ctx context.Context, req contracts.SearchRequest) ( defer resp.Body.Close() if resp.StatusCode < 200 || resp.StatusCode >= 300 { - body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024)) + io.Copy(io.Discard, io.LimitReader(resp.Body, 16*1024)) return contracts.SearchResponse{}, fmt.Errorf("arxiv upstream error: status %d", resp.StatusCode) } diff --git a/internal/engines/bing.go b/internal/engines/bing.go index b1abbab..3b18f7b 100644 --- a/internal/engines/bing.go +++ b/internal/engines/bing.go @@ -68,7 +68,7 @@ func (e *BingEngine) Search(ctx context.Context, req contracts.SearchRequest) (c defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + io.Copy(io.Discard, io.LimitReader(resp.Body, 4096)) return contracts.SearchResponse{}, fmt.Errorf("bing upstream error: status %d", resp.StatusCode) } diff --git a/internal/engines/brave.go b/internal/engines/brave.go index 373bc47..da25630 100644 --- a/internal/engines/brave.go +++ b/internal/engines/brave.go @@ -45,7 +45,7 @@ func (e *BraveEngine) Search(ctx context.Context, req contracts.SearchRequest) ( defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + io.Copy(io.Discard, io.LimitReader(resp.Body, 4096)) return contracts.SearchResponse{}, fmt.Errorf("brave error: status %d", resp.StatusCode) } diff --git a/internal/engines/braveapi.go b/internal/engines/braveapi.go index 6b89347..830b010 100644 --- a/internal/engines/braveapi.go +++ b/internal/engines/braveapi.go @@ -127,7 +127,7 @@ func (e *BraveAPIEngine) Search(ctx context.Context, req contracts.SearchRequest defer resp.Body.Close() if resp.StatusCode < 200 || resp.StatusCode >= 300 { - body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024)) + io.Copy(io.Discard, io.LimitReader(resp.Body, 16*1024)) return contracts.SearchResponse{}, fmt.Errorf("brave upstream error: status %d", resp.StatusCode) } diff --git a/internal/engines/braveapi_test.go b/internal/engines/braveapi_test.go index 13c7420..ed710ff 100644 --- a/internal/engines/braveapi_test.go +++ b/internal/engines/braveapi_test.go @@ -39,7 +39,7 @@ func TestBraveEngine_GatingAndHeader(t *testing.T) { }) client := &http.Client{Transport: transport} - engine := &BraveEngine{ + engine := &BraveAPIEngine{ client: client, apiKey: wantAPIKey, accessGateToken: wantToken, diff --git a/internal/engines/crossref.go b/internal/engines/crossref.go index d911034..79e6ab5 100644 --- a/internal/engines/crossref.go +++ b/internal/engines/crossref.go @@ -63,7 +63,7 @@ func (e *CrossrefEngine) Search(ctx context.Context, req contracts.SearchRequest defer resp.Body.Close() if resp.StatusCode < 200 || resp.StatusCode >= 300 { - body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024)) + io.Copy(io.Discard, io.LimitReader(resp.Body, 16*1024)) return contracts.SearchResponse{}, fmt.Errorf("crossref upstream error: status %d", resp.StatusCode) } diff --git a/internal/engines/duckduckgo.go b/internal/engines/duckduckgo.go index 9aa275e..7a71ef4 100644 --- a/internal/engines/duckduckgo.go +++ b/internal/engines/duckduckgo.go @@ -63,7 +63,7 @@ func (e *DuckDuckGoEngine) Search(ctx context.Context, req contracts.SearchReque defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + io.Copy(io.Discard, io.LimitReader(resp.Body, 4096)) return contracts.SearchResponse{}, fmt.Errorf("duckduckgo upstream error: status %d", resp.StatusCode) } diff --git a/internal/engines/github.go b/internal/engines/github.go index 13d85b8..d0c9fcc 100644 --- a/internal/engines/github.go +++ b/internal/engines/github.go @@ -66,7 +66,7 @@ func (e *GitHubEngine) Search(ctx context.Context, req contracts.SearchRequest) defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + io.Copy(io.Discard, io.LimitReader(resp.Body, 4096)) return contracts.SearchResponse{}, fmt.Errorf("github api error: status %d", resp.StatusCode) } diff --git a/internal/engines/google.go b/internal/engines/google.go index 49e2dbe..cea4bd5 100644 --- a/internal/engines/google.go +++ b/internal/engines/google.go @@ -28,20 +28,10 @@ import ( "github.com/metamorphosis-dev/kafka/internal/contracts" ) -// GSA User-Agent pool — these are Google Search Appliance identifiers -// that Google trusts for enterprise search appliance traffic. -var gsaUserAgents = []string{ - "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPhone; CPU iPhone OS 17_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPhone; CPU iPhone OS 18_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPhone; CPU iPhone OS 18_5_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", -} - -func gsaUA() string { - return gsaUserAgents[0] // deterministic for now; could rotate -} +// googleUserAgent is an honest User-Agent identifying the metasearch engine. +// Using a spoofed GSA User-Agent violates Google's Terms of Service and +// risks permanent IP blocking. +var googleUserAgent = "Kafka/0.1 (compatible; +https://github.com/metamorphosis-dev/kafka)" type GoogleEngine struct { client *http.Client @@ -70,7 +60,7 @@ func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest) if err != nil { return contracts.SearchResponse{}, err } - httpReq.Header.Set("User-Agent", gsaUA()) + httpReq.Header.Set("User-Agent", googleUserAgent) httpReq.Header.Set("Accept", "*/*") httpReq.AddCookie(&http.Cookie{Name: "CONSENT", Value: "YES+"}) @@ -95,7 +85,7 @@ func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest) } if resp.StatusCode != http.StatusOK { - body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + io.Copy(io.Discard, io.LimitReader(resp.Body, 4096)) return contracts.SearchResponse{}, fmt.Errorf("google error: status %d", resp.StatusCode) } diff --git a/internal/engines/qwant.go b/internal/engines/qwant.go index 1c4876b..7fa963b 100644 --- a/internal/engines/qwant.go +++ b/internal/engines/qwant.go @@ -124,7 +124,7 @@ func (e *QwantEngine) searchWebAPI(ctx context.Context, req contracts.SearchRequ } if resp.StatusCode < 200 || resp.StatusCode >= 300 { - body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024)) + io.Copy(io.Discard, io.LimitReader(resp.Body, 16*1024)) return contracts.SearchResponse{}, fmt.Errorf("qwant upstream error: status %d", resp.StatusCode) } @@ -253,7 +253,7 @@ func (e *QwantEngine) searchWebLite(ctx context.Context, req contracts.SearchReq defer resp.Body.Close() if resp.StatusCode < 200 || resp.StatusCode >= 300 { - body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024)) + io.Copy(io.Discard, io.LimitReader(resp.Body, 16*1024)) return contracts.SearchResponse{}, fmt.Errorf("qwant lite upstream error: status %d", resp.StatusCode) } diff --git a/internal/engines/reddit.go b/internal/engines/reddit.go index cb75cf9..699e7b2 100644 --- a/internal/engines/reddit.go +++ b/internal/engines/reddit.go @@ -62,7 +62,7 @@ func (e *RedditEngine) Search(ctx context.Context, req contracts.SearchRequest) defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + io.Copy(io.Discard, io.LimitReader(resp.Body, 4096)) return contracts.SearchResponse{}, fmt.Errorf("reddit api error: status %d", resp.StatusCode) } diff --git a/internal/engines/wikipedia.go b/internal/engines/wikipedia.go index 3a65749..518d994 100644 --- a/internal/engines/wikipedia.go +++ b/internal/engines/wikipedia.go @@ -134,7 +134,7 @@ func (e *WikipediaEngine) Search(ctx context.Context, req contracts.SearchReques }, nil } if resp.StatusCode < 200 || resp.StatusCode >= 300 { - body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024)) + io.Copy(io.Discard, io.LimitReader(resp.Body, 16*1024)) return contracts.SearchResponse{}, fmt.Errorf("wikipedia upstream error: status %d", resp.StatusCode) } diff --git a/internal/engines/youtube.go b/internal/engines/youtube.go index ec0add9..0c5ff9e 100644 --- a/internal/engines/youtube.go +++ b/internal/engines/youtube.go @@ -77,7 +77,7 @@ func (e *YouTubeEngine) Search(ctx context.Context, req contracts.SearchRequest) defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + io.Copy(io.Discard, io.LimitReader(resp.Body, 4096)) return contracts.SearchResponse{}, fmt.Errorf("youtube api error: status %d", resp.StatusCode) } diff --git a/internal/middleware/ratelimit.go b/internal/middleware/ratelimit.go index 8bd1123..6f662fd 100644 --- a/internal/middleware/ratelimit.go +++ b/internal/middleware/ratelimit.go @@ -80,7 +80,7 @@ func RateLimit(cfg RateLimitConfig, logger *slog.Logger) func(http.Handler) http return func(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - ip := l.extractIP(r) + ip := limiter.extractIP(r) if !limiter.allow(ip) { retryAfter := int(limiter.window.Seconds()) diff --git a/internal/middleware/ratelimit_test.go b/internal/middleware/ratelimit_test.go index 514d985..8366e57 100644 --- a/internal/middleware/ratelimit_test.go +++ b/internal/middleware/ratelimit_test.go @@ -1,6 +1,7 @@ package middleware import ( + "net" "net/http" "net/http/httptest" "testing" @@ -92,7 +93,6 @@ func TestRateLimit_DifferentIPs(t *testing.T) { } func TestRateLimit_XForwardedFor(t *testing.T) { - privateNet := mustParseCIDR("10.0.0.0/8") h := RateLimit(RateLimitConfig{ Requests: 1, Window: 10 * time.Second, diff --git a/internal/util/validate.go b/internal/util/validate.go index 2ea31cc..eb7d55e 100644 --- a/internal/util/validate.go +++ b/internal/util/validate.go @@ -14,13 +14,17 @@ import ( "strings" ) -// SafeURLScheme returns true if the URL uses an acceptable scheme (http or https). -func SafeURLScheme(raw string) bool { +// SafeURLScheme validates that a URL is well-formed and uses an acceptable scheme. +// Returns the parsed URL on success, or an error. +func SafeURLScheme(raw string) (*url.URL, error) { u, err := url.Parse(raw) if err != nil { - return false + return nil, err } - return u.Scheme == "http" || u.Scheme == "https" + if u.Scheme != "http" && u.Scheme != "https" { + return nil, fmt.Errorf("URL must use http or https, got %q", u.Scheme) + } + return u, nil } // IsPrivateIP returns true if the IP address is in a private, loopback,