Critical:
- Validate baseURL/sourceURL/upstreamURL at config load time (prevents XML injection, XSS, SSRF via config/env manipulation)
- Use xml.Escape for OpenSearch XML template interpolation

High:
- Add security headers middleware (CSP, X-Frame-Options, HSTS, etc.)
- Sanitize result URLs to reject javascript:/data: schemes
- Sanitize infobox img_src against dangerous URL schemes
- Default CORS to deny-all (was wildcard *)

Medium:
- Rate limiter: X-Forwarded-For only trusted from configured proxies
- Validate engine names against known registry allowlist
- Add 1024-char max query length
- Sanitize upstream error messages (strip raw response bodies)
- Upstream client validates URL scheme (http/https only)

Test updates:
- Update extractIP tests for new trusted proxy behavior
210 lines
7.3 KiB
Go
210 lines
7.3 KiB
Go
// kafka — a privacy-respecting metasearch engine
|
|
// Copyright (C) 2026-present metamorphosis-dev
|
|
//
|
|
// This program is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU Affero General Public License as published by
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU Affero General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU Affero General Public License
|
|
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
package engines
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
|
|
"github.com/metamorphosis-dev/kafka/internal/contracts"
|
|
)
|
|
|
|
// WikipediaEngine is a search engine that answers queries from Wikipedia.
// Every upstream request it makes goes through its client field.
type WikipediaEngine struct {
	// client performs the upstream HTTP requests. Search reports an
	// "engine not initialized" error while this is nil.
	client *http.Client
}
|
|
|
|
// validWikipediaLangs is the allowlist of language codes that may be used as
// the subdomain of wikipedia.org when building the upstream URL.
//
// The language comes from the incoming search request, so it is only ever
// placed into a hostname after it has been found in this set. That keeps a
// crafted value from steering the request to a different host (SSRF).
//
// Keys are lower-case; multi-part codes use hyphens (for example
// "zh-min-nan"). Rows are grouped by initial letter.
//
// NOTE(review): a few keys (e.g. "at", "di", "pn") do not look like codes of
// existing Wikipedia editions — confirm against the current list of
// Wikipedias before relying on them.
var validWikipediaLangs = map[string]struct{}{
	"aa": {}, "ab": {}, "ae": {}, "af": {}, "ak": {}, "am": {}, "an": {}, "ar": {}, "arc": {}, "as": {}, "ast": {}, "at": {}, "av": {}, "ay": {}, "az": {},
	"ba": {}, "be": {}, "bg": {}, "bh": {}, "bi": {}, "bm": {}, "bn": {}, "bo": {}, "br": {}, "bs": {},
	"ca": {}, "ce": {}, "ch": {}, "co": {}, "cr": {}, "cs": {}, "cu": {}, "cv": {}, "cy": {},
	"da": {}, "de": {}, "di": {}, "dv": {}, "dz": {},
	"ee": {}, "el": {}, "en": {}, "eo": {}, "es": {}, "et": {}, "eu": {},
	"fa": {}, "ff": {}, "fi": {}, "fj": {}, "fo": {}, "fr": {}, "fy": {},
	"ga": {}, "gd": {}, "gl": {}, "gn": {}, "gu": {}, "gv": {},
	"ha": {}, "he": {}, "hi": {}, "ho": {}, "hr": {}, "ht": {}, "hu": {}, "hy": {}, "hz": {},
	"ia": {}, "id": {}, "ie": {}, "ig": {}, "ii": {}, "ik": {}, "io": {}, "is": {}, "it": {}, "iu": {},
	"ja": {}, "jv": {},
	"ka": {}, "kg": {}, "ki": {}, "kj": {}, "kk": {}, "kl": {}, "km": {}, "kn": {}, "ko": {}, "kr": {}, "ks": {}, "ku": {}, "kv": {}, "kw": {}, "ky": {},
	"la": {}, "lb": {}, "lg": {}, "li": {}, "lij": {}, "ln": {}, "lo": {}, "lt": {}, "lv": {},
	"mg": {}, "mh": {}, "mi": {}, "mk": {}, "ml": {}, "mn": {}, "mo": {}, "mr": {}, "ms": {}, "mt": {}, "mus": {}, "my": {},
	"na": {}, "nah": {}, "nap": {}, "nd": {}, "nds": {}, "ne": {}, "new": {}, "ng": {}, "nl": {}, "nn": {}, "no": {}, "nov": {}, "nrm": {}, "nv": {}, "ny": {},
	"oc": {}, "oj": {}, "om": {}, "or": {}, "os": {},
	"pa": {}, "pag": {}, "pam": {}, "pap": {}, "pdc": {}, "pl": {}, "pms": {}, "pn": {}, "ps": {}, "pt": {},
	"qu": {},
	"rm": {}, "rmy": {}, "rn": {}, "ro": {}, "roa-rup": {}, "ru": {}, "rw": {},
	"sa": {}, "sah": {}, "sc": {}, "scn": {}, "sco": {}, "sd": {}, "se": {}, "sg": {}, "sh": {}, "si": {}, "simple": {}, "sk": {},
	"sl": {}, "sm": {}, "sn": {}, "so": {}, "sq": {}, "sr": {}, "ss": {}, "st": {}, "su": {}, "sv": {}, "sw": {}, "szl": {},
	"ta": {}, "te": {}, "tg": {}, "th": {}, "ti": {}, "tk": {}, "tl": {}, "tn": {}, "to": {}, "tpi": {}, "tr": {}, "ts": {}, "tt": {}, "tum": {}, "tw": {}, "ty": {},
	"udm": {}, "ug": {}, "uk": {}, "ur": {}, "uz": {},
	"ve": {}, "vec": {}, "vi": {}, "vls": {}, "vo": {},
	"wa": {}, "wo": {},
	"xal": {}, "xh": {},
	"yi": {}, "yo": {},
	"za": {}, "zea": {}, "zh": {}, "zh-classical": {}, "zh-min-nan": {}, "zh-yue": {}, "zu": {},
}
|
|
|
|
// Name returns the fixed identifier of this engine, "wikipedia".
// It does not read the receiver.
func (e *WikipediaEngine) Name() string {
	const engineName = "wikipedia"
	return engineName
}
|
|
|
|
func (e *WikipediaEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
|
|
if e == nil || e.client == nil {
|
|
return contracts.SearchResponse{}, errors.New("wikipedia engine not initialized")
|
|
}
|
|
if strings.TrimSpace(req.Query) == "" {
|
|
return contracts.SearchResponse{Query: req.Query}, nil
|
|
}
|
|
|
|
lang := strings.TrimSpace(req.Language)
|
|
if lang == "" || lang == "auto" {
|
|
lang = "en"
|
|
}
|
|
// Wikipedia subdomains are based on the language code; keep it simple for MVP.
|
|
lang = strings.SplitN(lang, "-", 2)[0]
|
|
lang = strings.ReplaceAll(lang, "_", "-")
|
|
// Validate lang against whitelist to prevent SSRF attacks where an attacker
|
|
// could use a malicious language value to redirect requests to their server.
|
|
if _, ok := validWikipediaLangs[lang]; !ok {
|
|
lang = "en"
|
|
}
|
|
wikiNetloc := fmt.Sprintf("%s.wikipedia.org", lang)
|
|
|
|
endpoint := fmt.Sprintf(
|
|
"https://%s/api/rest_v1/page/summary/%s",
|
|
wikiNetloc,
|
|
url.PathEscape(req.Query),
|
|
)
|
|
|
|
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
|
if err != nil {
|
|
return contracts.SearchResponse{}, err
|
|
}
|
|
// Wikimedia APIs require a descriptive User-Agent.
|
|
httpReq.Header.Set(
|
|
"User-Agent",
|
|
"gosearch-go/0.1 (compatible; +https://github.com/metamorphosis-dev/kafka)",
|
|
)
|
|
// Best-effort: hint content language.
|
|
if req.Language != "" && req.Language != "auto" {
|
|
httpReq.Header.Set("Accept-Language", req.Language)
|
|
}
|
|
|
|
resp, err := e.client.Do(httpReq)
|
|
if err != nil {
|
|
return contracts.SearchResponse{}, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode == http.StatusNotFound {
|
|
return contracts.SearchResponse{
|
|
Query: req.Query,
|
|
NumberOfResults: 0,
|
|
Results: []contracts.MainResult{},
|
|
Answers: []map[string]any{},
|
|
Corrections: []string{},
|
|
Infoboxes: []map[string]any{},
|
|
Suggestions: []string{},
|
|
UnresponsiveEngines: [][2]string{},
|
|
}, nil
|
|
}
|
|
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
|
|
return contracts.SearchResponse{}, fmt.Errorf("wikipedia upstream error: status %d", resp.StatusCode)
|
|
}
|
|
|
|
var api struct {
|
|
Title string `json:"title"`
|
|
Description string `json:"description"`
|
|
Titles struct {
|
|
Display string `json:"display"`
|
|
} `json:"titles"`
|
|
ContentURLs struct {
|
|
Desktop struct {
|
|
Page string `json:"page"`
|
|
} `json:"desktop"`
|
|
} `json:"content_urls"`
|
|
}
|
|
|
|
if err := json.NewDecoder(resp.Body).Decode(&api); err != nil {
|
|
return contracts.SearchResponse{}, err
|
|
}
|
|
|
|
pageURL := api.ContentURLs.Desktop.Page
|
|
if pageURL == "" {
|
|
// API returned a non-standard payload; treat as no result.
|
|
return contracts.SearchResponse{
|
|
Query: req.Query,
|
|
NumberOfResults: 0,
|
|
Results: []contracts.MainResult{},
|
|
Answers: []map[string]any{},
|
|
Corrections: []string{},
|
|
Infoboxes: []map[string]any{},
|
|
Suggestions: []string{},
|
|
UnresponsiveEngines: [][2]string{},
|
|
}, nil
|
|
}
|
|
|
|
title := api.Titles.Display
|
|
if title == "" {
|
|
title = api.Title
|
|
}
|
|
|
|
content := api.Description
|
|
|
|
urlPtr := pageURL
|
|
pub := (*string)(nil)
|
|
|
|
results := []contracts.MainResult{
|
|
{
|
|
Template: "default.html",
|
|
Title: title,
|
|
Content: content,
|
|
URL: &urlPtr,
|
|
Pubdate: pub,
|
|
Engine: "wikipedia",
|
|
Score: 0,
|
|
Category: "general",
|
|
Priority: "",
|
|
Positions: nil,
|
|
Engines: []string{"wikipedia"},
|
|
},
|
|
}
|
|
|
|
return contracts.SearchResponse{
|
|
Query: req.Query,
|
|
NumberOfResults: len(results),
|
|
Results: results,
|
|
Answers: []map[string]any{},
|
|
Corrections: []string{},
|
|
Infoboxes: []map[string]any{},
|
|
Suggestions: []string{},
|
|
UnresponsiveEngines: [][2]string{},
|
|
}, nil
|
|
}
|
|
|