security: fix build errors, add honest Google UA, sanitize error msgs

- Fix config validation: upstream URLs allow private IPs (self-hosted)
- Fix util.SafeURLScheme to return parsed URL
- Replace spoofed GSA User-Agent with honest Kafka UA
- Sanitize all engine error messages (strip response bodies)
- Replace unused body reads with io.Copy(io.Discard, ...) so the drained connection can be reused
- Fix pre-existing braveapi_test using wrong struct type
- Fix ratelimit test reference to limiter variable
- Update ratelimit tests for new trusted proxy behavior
This commit is contained in:
Franz Kafka 2026-03-22 16:27:49 +00:00
parent da367a1bfd
commit b3e3123612
17 changed files with 32 additions and 38 deletions

View file

@ -75,7 +75,7 @@ func (e *ArxivEngine) Search(ctx context.Context, req contracts.SearchRequest) (
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
io.Copy(io.Discard, io.LimitReader(resp.Body, 16*1024))
return contracts.SearchResponse{}, fmt.Errorf("arxiv upstream error: status %d", resp.StatusCode)
}

View file

@ -68,7 +68,7 @@ func (e *BingEngine) Search(ctx context.Context, req contracts.SearchRequest) (c
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
io.Copy(io.Discard, io.LimitReader(resp.Body, 4096))
return contracts.SearchResponse{}, fmt.Errorf("bing upstream error: status %d", resp.StatusCode)
}

View file

@ -45,7 +45,7 @@ func (e *BraveEngine) Search(ctx context.Context, req contracts.SearchRequest) (
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
io.Copy(io.Discard, io.LimitReader(resp.Body, 4096))
return contracts.SearchResponse{}, fmt.Errorf("brave error: status %d", resp.StatusCode)
}

View file

@ -127,7 +127,7 @@ func (e *BraveAPIEngine) Search(ctx context.Context, req contracts.SearchRequest
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
io.Copy(io.Discard, io.LimitReader(resp.Body, 16*1024))
return contracts.SearchResponse{}, fmt.Errorf("brave upstream error: status %d", resp.StatusCode)
}

View file

@ -39,7 +39,7 @@ func TestBraveEngine_GatingAndHeader(t *testing.T) {
})
client := &http.Client{Transport: transport}
engine := &BraveEngine{
engine := &BraveAPIEngine{
client: client,
apiKey: wantAPIKey,
accessGateToken: wantToken,

View file

@ -63,7 +63,7 @@ func (e *CrossrefEngine) Search(ctx context.Context, req contracts.SearchRequest
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
io.Copy(io.Discard, io.LimitReader(resp.Body, 16*1024))
return contracts.SearchResponse{}, fmt.Errorf("crossref upstream error: status %d", resp.StatusCode)
}

View file

@ -63,7 +63,7 @@ func (e *DuckDuckGoEngine) Search(ctx context.Context, req contracts.SearchReque
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
io.Copy(io.Discard, io.LimitReader(resp.Body, 4096))
return contracts.SearchResponse{}, fmt.Errorf("duckduckgo upstream error: status %d", resp.StatusCode)
}

View file

@ -66,7 +66,7 @@ func (e *GitHubEngine) Search(ctx context.Context, req contracts.SearchRequest)
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
io.Copy(io.Discard, io.LimitReader(resp.Body, 4096))
return contracts.SearchResponse{}, fmt.Errorf("github api error: status %d", resp.StatusCode)
}

View file

@ -28,20 +28,10 @@ import (
"github.com/metamorphosis-dev/kafka/internal/contracts"
)
// gsaUserAgents is a pool of iPhone/Mobile Safari User-Agent strings that
// each carry a "GSA/<version>" token.
//
// NOTE(review): the previous comment described these as Google Search
// Appliance identifiers that Google trusts for enterprise traffic. The strings
// themselves are iOS mobile browser UAs, so the GSA token more likely
// identifies the Google app for iOS — confirm before relying on either reading.
var gsaUserAgents = []string{
"Mozilla/5.0 (iPhone; CPU iPhone OS 17_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (iPhone; CPU iPhone OS 17_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (iPhone; CPU iPhone OS 18_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/399.2.845414227 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (iPhone; CPU iPhone OS 18_5_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
}
// gsaUA returns the User-Agent string to send with a request.
// It always returns the first entry of gsaUserAgents; no rotation across the
// pool is performed here.
func gsaUA() string {
return gsaUserAgents[0] // deterministic for now; could rotate
}
// googleUserAgent is an honest User-Agent that identifies this metasearch
// engine by name, version, and project URL.
// Using a spoofed GSA User-Agent violates Google's Terms of Service and
// risks permanent IP blocking.
//
// NOTE(review): nothing in the visible code reassigns this value, so it could
// be declared as a const — confirm no other file writes to it.
var googleUserAgent = "Kafka/0.1 (compatible; +https://github.com/metamorphosis-dev/kafka)"
type GoogleEngine struct {
client *http.Client
@ -70,7 +60,7 @@ func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest)
if err != nil {
return contracts.SearchResponse{}, err
}
httpReq.Header.Set("User-Agent", gsaUA())
httpReq.Header.Set("User-Agent", googleUserAgent)
httpReq.Header.Set("Accept", "*/*")
httpReq.AddCookie(&http.Cookie{Name: "CONSENT", Value: "YES+"})
@ -95,7 +85,7 @@ func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest)
}
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
io.Copy(io.Discard, io.LimitReader(resp.Body, 4096))
return contracts.SearchResponse{}, fmt.Errorf("google error: status %d", resp.StatusCode)
}

View file

@ -124,7 +124,7 @@ func (e *QwantEngine) searchWebAPI(ctx context.Context, req contracts.SearchRequ
}
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
io.Copy(io.Discard, io.LimitReader(resp.Body, 16*1024))
return contracts.SearchResponse{}, fmt.Errorf("qwant upstream error: status %d", resp.StatusCode)
}
@ -253,7 +253,7 @@ func (e *QwantEngine) searchWebLite(ctx context.Context, req contracts.SearchReq
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
io.Copy(io.Discard, io.LimitReader(resp.Body, 16*1024))
return contracts.SearchResponse{}, fmt.Errorf("qwant lite upstream error: status %d", resp.StatusCode)
}

View file

@ -62,7 +62,7 @@ func (e *RedditEngine) Search(ctx context.Context, req contracts.SearchRequest)
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
io.Copy(io.Discard, io.LimitReader(resp.Body, 4096))
return contracts.SearchResponse{}, fmt.Errorf("reddit api error: status %d", resp.StatusCode)
}

View file

@ -134,7 +134,7 @@ func (e *WikipediaEngine) Search(ctx context.Context, req contracts.SearchReques
}, nil
}
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
io.Copy(io.Discard, io.LimitReader(resp.Body, 16*1024))
return contracts.SearchResponse{}, fmt.Errorf("wikipedia upstream error: status %d", resp.StatusCode)
}

View file

@ -77,7 +77,7 @@ func (e *YouTubeEngine) Search(ctx context.Context, req contracts.SearchRequest)
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
io.Copy(io.Discard, io.LimitReader(resp.Body, 4096))
return contracts.SearchResponse{}, fmt.Errorf("youtube api error: status %d", resp.StatusCode)
}