// kafka — a privacy-respecting metasearch engine
// Copyright (C) 2026-present metamorphosis-dev
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

package engines

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"

	"github.com/metamorphosis-dev/kafka/internal/contracts"
)

// WikipediaEngine is a search engine backed by the Wikipedia REST
// page-summary endpoint.
//
// client is the HTTP client used for every upstream request. Search reports
// an error instead of issuing a request when the receiver or client is nil.
type WikipediaEngine struct {
	client *http.Client
}

// validWikipediaLangs contains the set of valid Wikipedia language codes.
// This prevents SSRF attacks where an attacker could use a malicious language
// value to redirect requests to an attacker-controlled domain.
var validWikipediaLangs = map[string]struct{}{ "aa": {}, "ab": {}, "ae": {}, "af": {}, "ak": {}, "am": {}, "an": {}, "ar": {}, "arc": {}, "as": {}, "ast": {}, "at": {}, "av": {}, "ay": {}, "az": {}, "ba": {}, "be": {}, "bg": {}, "bh": {}, "bi": {}, "bm": {}, "bn": {}, "bo": {}, "br": {}, "bs": {}, "ca": {}, "ce": {}, "ch": {}, "co": {}, "cr": {}, "cs": {}, "cu": {}, "cv": {}, "cy": {}, "da": {}, "de": {}, "di": {}, "dv": {}, "dz": {}, "ee": {}, "el": {}, "en": {}, "eo": {}, "es": {}, "et": {}, "eu": {}, "fa": {}, "ff": {}, "fi": {}, "fj": {}, "fo": {}, "fr": {}, "fy": {}, "ga": {}, "gd": {}, "gl": {}, "gn": {}, "gu": {}, "gv": {}, "ha": {}, "he": {}, "hi": {}, "ho": {}, "hr": {}, "ht": {}, "hu": {}, "hy": {}, "hz": {}, "ia": {}, "id": {}, "ie": {}, "ig": {}, "ii": {}, "ik": {}, "io": {}, "is": {}, "it": {}, "iu": {}, "ja": {}, "jv": {}, "ka": {}, "kg": {}, "ki": {}, "kj": {}, "kk": {}, "kl": {}, "km": {}, "kn": {}, "ko": {}, "kr": {}, "ks": {}, "ku": {}, "kv": {}, "kw": {}, "ky": {}, "la": {}, "lb": {}, "lg": {}, "li": {}, "lij": {}, "ln": {}, "lo": {}, "lt": {}, "lv": {}, "mg": {}, "mh": {}, "mi": {}, "mk": {}, "ml": {}, "mn": {}, "mo": {}, "mr": {}, "ms": {}, "mt": {}, "mus": {}, "my": {}, "na": {}, "nah": {}, "nap": {}, "nd": {}, "nds": {}, "ne": {}, "new": {}, "ng": {}, "nl": {}, "nn": {}, "no": {}, "nov": {}, "nrm": {}, "nv": {}, "ny": {}, "oc": {}, "oj": {}, "om": {}, "or": {}, "os": {}, "pa": {}, "pag": {}, "pam": {}, "pap": {}, "pdc": {}, "pl": {}, "pms": {}, "pn": {}, "ps": {}, "pt": {}, "qu": {}, "rm": {}, "rmy": {}, "rn": {}, "ro": {}, "roa-rup": {}, "ru": {}, "rw": {}, "sa": {}, "sah": {}, "sc": {}, "scn": {}, "sco": {}, "sd": {}, "se": {}, "sg": {}, "sh": {}, "si": {}, "simple": {}, "sk": {}, "sl": {}, "sm": {}, "sn": {}, "so": {}, "sq": {}, "sr": {}, "ss": {}, "st": {}, "su": {}, "sv": {}, "sw": {}, "szl": {}, "ta": {}, "te": {}, "tg": {}, "th": {}, "ti": {}, "tk": {}, "tl": {}, "tn": {}, "to": {}, "tpi": {}, "tr": {}, "ts": {}, "tt": {}, "tum": {}, 
"tw": {}, "ty": {}, "udm": {}, "ug": {}, "uk": {}, "ur": {}, "uz": {}, "ve": {}, "vec": {}, "vi": {}, "vls": {}, "vo": {}, "wa": {}, "wo": {}, "xal": {}, "xh": {}, "yi": {}, "yo": {}, "za": {}, "zea": {}, "zh": {}, "zh-classical": {}, "zh-min-nan": {}, "zh-yue": {}, "zu": {}, } func (e *WikipediaEngine) Name() string { return "wikipedia" } func (e *WikipediaEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) { if e == nil || e.client == nil { return contracts.SearchResponse{}, errors.New("wikipedia engine not initialized") } if strings.TrimSpace(req.Query) == "" { return contracts.SearchResponse{Query: req.Query}, nil } lang := strings.TrimSpace(req.Language) if lang == "" || lang == "auto" { lang = "en" } // Wikipedia subdomains are based on the language code; keep it simple for MVP. lang = strings.SplitN(lang, "-", 2)[0] lang = strings.ReplaceAll(lang, "_", "-") // Validate lang against whitelist to prevent SSRF attacks where an attacker // could use a malicious language value to redirect requests to their server. if _, ok := validWikipediaLangs[lang]; !ok { lang = "en" } wikiNetloc := fmt.Sprintf("%s.wikipedia.org", lang) endpoint := fmt.Sprintf( "https://%s/api/rest_v1/page/summary/%s", wikiNetloc, url.PathEscape(req.Query), ) httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) if err != nil { return contracts.SearchResponse{}, err } // Wikimedia APIs require a descriptive User-Agent. httpReq.Header.Set( "User-Agent", "gosearch-go/0.1 (compatible; +https://github.com/metamorphosis-dev/kafka)", ) // Best-effort: hint content language. 
if req.Language != "" && req.Language != "auto" { httpReq.Header.Set("Accept-Language", req.Language) } resp, err := e.client.Do(httpReq) if err != nil { return contracts.SearchResponse{}, err } defer resp.Body.Close() if resp.StatusCode == http.StatusNotFound { return contracts.SearchResponse{ Query: req.Query, NumberOfResults: 0, Results: []contracts.MainResult{}, Answers: []map[string]any{}, Corrections: []string{}, Infoboxes: []map[string]any{}, Suggestions: []string{}, UnresponsiveEngines: [][2]string{}, }, nil } if resp.StatusCode < 200 || resp.StatusCode >= 300 { body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024)) return contracts.SearchResponse{}, fmt.Errorf("wikipedia upstream error: status=%d body=%q", resp.StatusCode, string(body)) } var api struct { Title string `json:"title"` Description string `json:"description"` Titles struct { Display string `json:"display"` } `json:"titles"` ContentURLs struct { Desktop struct { Page string `json:"page"` } `json:"desktop"` } `json:"content_urls"` } if err := json.NewDecoder(resp.Body).Decode(&api); err != nil { return contracts.SearchResponse{}, err } pageURL := api.ContentURLs.Desktop.Page if pageURL == "" { // API returned a non-standard payload; treat as no result. 
return contracts.SearchResponse{ Query: req.Query, NumberOfResults: 0, Results: []contracts.MainResult{}, Answers: []map[string]any{}, Corrections: []string{}, Infoboxes: []map[string]any{}, Suggestions: []string{}, UnresponsiveEngines: [][2]string{}, }, nil } title := api.Titles.Display if title == "" { title = api.Title } content := api.Description urlPtr := pageURL pub := (*string)(nil) results := []contracts.MainResult{ { Template: "default.html", Title: title, Content: content, URL: &urlPtr, Pubdate: pub, Engine: "wikipedia", Score: 0, Category: "general", Priority: "", Positions: nil, Engines: []string{"wikipedia"}, }, } return contracts.SearchResponse{ Query: req.Query, NumberOfResults: len(results), Results: results, Answers: []map[string]any{}, Corrections: []string{}, Infoboxes: []map[string]any{}, Suggestions: []string{}, UnresponsiveEngines: [][2]string{}, }, nil }