Update LICENSE file and add AGPL header to all source files. AGPLv3 ensures that if someone runs Kafka as a network service and modifies it, they must release their source code under the same license.
483 lines
13 KiB
Go
483 lines
13 KiB
Go
// kafka — a privacy-respecting metasearch engine
|
|
// Copyright (C) 2026-present metamorphosis-dev
|
|
//
|
|
// This program is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU Affero General Public License as published by
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU Affero General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU Affero General Public License
|
|
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
package engines
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
|
|
"github.com/metamorphosis-dev/kafka/internal/contracts"
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
// QwantEngine is the `qwant` search adapter.
//
// Depending on category it talks either to the Qwant v3 JSON endpoint
// (https://api.qwant.com/v3/search/web) or to the HTML pages served by
// https://lite.qwant.com/. Qwant's API is not fully documented, so the
// parsing in this file is best-effort.
type QwantEngine struct {
	// client performs the outgoing HTTP requests; Search refuses to run
	// when it is nil.
	client *http.Client
	// category selects the backend: "web" (JSON API) or "web-lite" (HTML
	// fallback). An empty value is treated as "web".
	category string
	// resultsPerPage is the page size requested from the JSON API; values
	// <= 0 fall back to 10.
	resultsPerPage int
}
|
|
|
|
// Name returns the engine identifier, "qwant". It does not read any
// receiver state.
func (e *QwantEngine) Name() string {
	return "qwant"
}
|
|
|
|
func (e *QwantEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
|
|
if e == nil || e.client == nil {
|
|
return contracts.SearchResponse{}, errors.New("qwant engine not initialized")
|
|
}
|
|
|
|
q := strings.TrimSpace(req.Query)
|
|
if q == "" {
|
|
return contracts.SearchResponse{Query: req.Query}, nil
|
|
}
|
|
|
|
// For API parity we use web defaults: count=10, offset=(pageno-1)*count.
|
|
// The engine's config field exists so we can expand to news/images/videos later.
|
|
count := e.resultsPerPage
|
|
if count <= 0 {
|
|
count = 10
|
|
}
|
|
offset := 0
|
|
if req.Pageno > 1 {
|
|
offset = (req.Pageno - 1) * count
|
|
}
|
|
mode := strings.TrimSpace(strings.ToLower(e.category))
|
|
if mode == "" {
|
|
mode = "web"
|
|
}
|
|
|
|
switch mode {
|
|
case "web-lite":
|
|
return e.searchWebLite(ctx, req)
|
|
case "web":
|
|
return e.searchWebAPI(ctx, req, count, offset)
|
|
default:
|
|
// Unknown mode: treat as unresponsive.
|
|
return contracts.SearchResponse{
|
|
Query: req.Query,
|
|
UnresponsiveEngines: [][2]string{
|
|
{e.Name(), "unknown_qwant_mode"},
|
|
},
|
|
Results: []contracts.MainResult{},
|
|
Answers: []map[string]any{},
|
|
Corrections: []string{},
|
|
Infoboxes: []map[string]any{},
|
|
Suggestions: []string{},
|
|
}, nil
|
|
}
|
|
}
|
|
|
|
func (e *QwantEngine) searchWebAPI(ctx context.Context, req contracts.SearchRequest, count, offset int) (contracts.SearchResponse, error) {
|
|
qLocale := qwantLocale(req.Language)
|
|
args := url.Values{}
|
|
args.Set("q", req.Query)
|
|
args.Set("count", fmt.Sprintf("%d", count))
|
|
args.Set("locale", qLocale)
|
|
args.Set("safesearch", fmt.Sprintf("%d", req.Safesearch))
|
|
args.Set("llm", "false")
|
|
args.Set("tgp", "3")
|
|
args.Set("offset", fmt.Sprintf("%d", offset))
|
|
|
|
endpoint := "https://api.qwant.com/v3/search/web?" + args.Encode()
|
|
|
|
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
|
if err != nil {
|
|
return contracts.SearchResponse{}, err
|
|
}
|
|
httpReq.Header.Set("User-Agent", "kafka/0.1 (compatible; +https://git.ashisgreat.xyz/penal-colony/gosearch)")
|
|
|
|
resp, err := e.client.Do(httpReq)
|
|
if err != nil {
|
|
return contracts.SearchResponse{}, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
// Qwant often returns a 403 captcha/JS block for the JSON API.
|
|
if resp.StatusCode == http.StatusForbidden {
|
|
return contracts.SearchResponse{
|
|
Query: req.Query,
|
|
UnresponsiveEngines: [][2]string{
|
|
{e.Name(), "captcha_or_js_block"},
|
|
},
|
|
Results: []contracts.MainResult{},
|
|
Answers: []map[string]any{},
|
|
Corrections: []string{},
|
|
Infoboxes: []map[string]any{},
|
|
Suggestions: []string{},
|
|
}, nil
|
|
}
|
|
|
|
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
|
|
return contracts.SearchResponse{}, fmt.Errorf("qwant upstream error: status=%d body=%q", resp.StatusCode, string(body))
|
|
}
|
|
|
|
body, err := io.ReadAll(io.LimitReader(resp.Body, 2*1024*1024))
|
|
if err != nil {
|
|
return contracts.SearchResponse{}, err
|
|
}
|
|
|
|
var top map[string]any
|
|
if err := json.Unmarshal(body, &top); err != nil {
|
|
return contracts.SearchResponse{}, err
|
|
}
|
|
|
|
status, _ := top["status"].(string)
|
|
if status != "success" {
|
|
return contracts.SearchResponse{
|
|
Query: req.Query,
|
|
UnresponsiveEngines: [][2]string{
|
|
{e.Name(), "api_error"},
|
|
},
|
|
Results: []contracts.MainResult{},
|
|
Answers: []map[string]any{},
|
|
Corrections: []string{},
|
|
Infoboxes: []map[string]any{},
|
|
Suggestions: []string{},
|
|
}, nil
|
|
}
|
|
|
|
data, _ := top["data"].(map[string]any)
|
|
result, _ := data["result"].(map[string]any)
|
|
items, _ := result["items"].(map[string]any)
|
|
mainline := items["mainline"]
|
|
|
|
rows := toSlice(mainline)
|
|
if len(rows) == 0 {
|
|
return contracts.SearchResponse{
|
|
Query: req.Query,
|
|
NumberOfResults: 0,
|
|
Results: []contracts.MainResult{},
|
|
Answers: []map[string]any{},
|
|
Corrections: []string{},
|
|
Infoboxes: []map[string]any{},
|
|
Suggestions: []string{},
|
|
UnresponsiveEngines: [][2]string{},
|
|
}, nil
|
|
}
|
|
|
|
results := make([]contracts.MainResult, 0, len(rows))
|
|
for _, row := range rows {
|
|
rowMap, ok := row.(map[string]any)
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
rowType, _ := rowMap["type"].(string)
|
|
if rowType == "" {
|
|
rowType = "web"
|
|
}
|
|
if rowType != "web" {
|
|
continue
|
|
}
|
|
if rowType == "ads" {
|
|
continue
|
|
}
|
|
|
|
rowItems := toSlice(rowMap["items"])
|
|
for _, it := range rowItems {
|
|
itemMap, ok := it.(map[string]any)
|
|
if !ok {
|
|
continue
|
|
}
|
|
title := toString(itemMap["title"])
|
|
resURL := toString(itemMap["url"])
|
|
desc := toString(itemMap["desc"])
|
|
if resURL == "" {
|
|
continue
|
|
}
|
|
urlPtr := resURL
|
|
results = append(results, contracts.MainResult{
|
|
Template: "default.html",
|
|
Title: title,
|
|
Content: desc,
|
|
URL: &urlPtr,
|
|
Engine: e.Name(),
|
|
Score: 0,
|
|
Category: "general",
|
|
Engines: []string{e.Name()},
|
|
})
|
|
}
|
|
}
|
|
|
|
return contracts.SearchResponse{
|
|
Query: req.Query,
|
|
NumberOfResults: len(results),
|
|
Results: results,
|
|
Answers: []map[string]any{},
|
|
Corrections: []string{},
|
|
Infoboxes: []map[string]any{},
|
|
Suggestions: []string{},
|
|
UnresponsiveEngines: [][2]string{},
|
|
}, nil
|
|
}
|
|
|
|
func (e *QwantEngine) searchWebLite(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
|
|
qLocale := qwantLocale(req.Language)
|
|
langBase := strings.SplitN(qLocale, "_", 2)[0]
|
|
|
|
args := url.Values{}
|
|
args.Set("q", req.Query)
|
|
args.Set("locale", strings.ToLower(qLocale))
|
|
args.Set("l", langBase)
|
|
args.Set("s", fmt.Sprintf("%d", req.Safesearch))
|
|
args.Set("p", fmt.Sprintf("%d", req.Pageno))
|
|
|
|
endpoint := "https://lite.qwant.com/?" + args.Encode()
|
|
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
|
if err != nil {
|
|
return contracts.SearchResponse{}, err
|
|
}
|
|
httpReq.Header.Set("User-Agent", "kafka/0.1 (compatible; +https://git.ashisgreat.xyz/penal-colony/gosearch)")
|
|
|
|
resp, err := e.client.Do(httpReq)
|
|
if err != nil {
|
|
return contracts.SearchResponse{}, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
|
|
return contracts.SearchResponse{}, fmt.Errorf("qwant lite upstream error: status=%d body=%q", resp.StatusCode, string(body))
|
|
}
|
|
|
|
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
|
if err != nil {
|
|
return contracts.SearchResponse{}, err
|
|
}
|
|
|
|
results := make([]contracts.MainResult, 0)
|
|
seen := map[string]bool{}
|
|
|
|
// Pattern 1: legacy/known qwant-lite structure.
|
|
doc.Find("section article").Each(func(_ int, item *goquery.Selection) {
|
|
// ignore randomly interspersed advertising adds
|
|
if item.Find("span.tooltip").Length() > 0 {
|
|
return
|
|
}
|
|
|
|
// Selector: "./span[contains(@class, 'url partner')]"
|
|
urlText := strings.TrimSpace(item.Find("span.url.partner").First().Text())
|
|
if urlText == "" {
|
|
// fallback: any span with class containing both 'url' and 'partner'
|
|
urlText = strings.TrimSpace(item.Find("span[class*='url'][class*='partner']").First().Text())
|
|
}
|
|
title := strings.TrimSpace(item.Find("h2 a").First().Text())
|
|
content := strings.TrimSpace(item.Find("p").First().Text())
|
|
|
|
if urlText == "" {
|
|
return
|
|
}
|
|
if seen[urlText] {
|
|
return
|
|
}
|
|
seen[urlText] = true
|
|
u := urlText
|
|
results = append(results, contracts.MainResult{
|
|
Template: "default.html",
|
|
Title: title,
|
|
Content: content,
|
|
URL: &u,
|
|
Engine: e.Name(),
|
|
Score: 0,
|
|
Category: "general",
|
|
Engines: []string{e.Name()},
|
|
})
|
|
})
|
|
|
|
// Pattern 2: broader fallback for updated lite markup:
|
|
// any article/list item/div block containing an external anchor.
|
|
// We keep this conservative by requiring non-empty title + URL.
|
|
doc.Find("article, li, div").Each(func(_ int, item *goquery.Selection) {
|
|
if len(results) >= 20 {
|
|
return
|
|
}
|
|
// Skip ad-like blocks in fallback pass too.
|
|
if item.Find("span.tooltip").Length() > 0 {
|
|
return
|
|
}
|
|
|
|
// Skip obvious nav/footer blocks.
|
|
classAttr, _ := item.Attr("class")
|
|
classLower := strings.ToLower(classAttr)
|
|
if strings.Contains(classLower, "nav") || strings.Contains(classLower, "footer") {
|
|
return
|
|
}
|
|
|
|
a := item.Find("a[href]").First()
|
|
if a.Length() == 0 {
|
|
return
|
|
}
|
|
href, ok := a.Attr("href")
|
|
if !ok {
|
|
return
|
|
}
|
|
href = strings.TrimSpace(href)
|
|
if href == "" {
|
|
return
|
|
}
|
|
|
|
// Ignore in-page and relative links.
|
|
if strings.HasPrefix(href, "/") || strings.HasPrefix(href, "#") {
|
|
return
|
|
}
|
|
if !strings.HasPrefix(href, "http://") && !strings.HasPrefix(href, "https://") {
|
|
return
|
|
}
|
|
// Skip known sponsored partner links surfaced in lite pages.
|
|
if isKnownSponsoredURL(href) {
|
|
return
|
|
}
|
|
if isQwantInternalURL(href) {
|
|
// Ignore qwant nav/house links.
|
|
return
|
|
}
|
|
|
|
title := strings.TrimSpace(a.Text())
|
|
if title == "" {
|
|
return
|
|
}
|
|
if isLikelyNavTitle(title) {
|
|
return
|
|
}
|
|
|
|
if seen[href] {
|
|
return
|
|
}
|
|
seen[href] = true
|
|
|
|
// Best-effort snippet extraction from nearby paragraph/span text.
|
|
content := strings.TrimSpace(item.Find("p").First().Text())
|
|
if content == "" {
|
|
content = strings.TrimSpace(item.Find("span").First().Text())
|
|
}
|
|
// If there is no snippet, still keep clearly external result links.
|
|
// Qwant-lite frequently omits rich snippets for some entries.
|
|
|
|
u := href
|
|
results = append(results, contracts.MainResult{
|
|
Template: "default.html",
|
|
Title: title,
|
|
Content: content,
|
|
URL: &u,
|
|
Engine: e.Name(),
|
|
Score: 0,
|
|
Category: "general",
|
|
Engines: []string{e.Name()},
|
|
})
|
|
})
|
|
|
|
return contracts.SearchResponse{
|
|
Query: req.Query,
|
|
NumberOfResults: len(results),
|
|
Results: results,
|
|
Answers: []map[string]any{},
|
|
Corrections: []string{},
|
|
Infoboxes: []map[string]any{},
|
|
Suggestions: []string{},
|
|
UnresponsiveEngines: [][2]string{},
|
|
}, nil
|
|
}
|
|
|
|
// qwantLocale converts a language tag such as "fr", "de-CH" or "en_gb" into
// the `xx_YY` form (e.g. "en_US") that is sent to Qwant as the locale.
//
// Rules:
//   - an empty tag or "auto" (any case) yields "en_US";
//   - "-" is treated like "_", and the tag is split at the first separator;
//   - the language part is trimmed and lower-cased, defaulting to "en" when
//     empty (e.g. "_GB");
//   - the country part is trimmed and upper-cased, defaulting to "US" when
//     missing or empty.
func qwantLocale(lang string) string {
	lang = strings.TrimSpace(lang)
	if lang == "" || strings.EqualFold(lang, "auto") {
		return "en_US"
	}

	base, country, _ := strings.Cut(strings.ReplaceAll(lang, "-", "_"), "_")

	// Trim both halves so inputs like "en - gb" do not leak spaces into
	// the locale.
	base = strings.ToLower(strings.TrimSpace(base))
	if base == "" {
		base = "en"
	}
	country = strings.ToUpper(strings.TrimSpace(country))
	if country == "" {
		country = "US"
	}

	return base + "_" + country
}
|
|
|
|
// toSlice normalizes a decoded JSON value to a list.
//
// A []any is returned unchanged, a map[string]any is wrapped in a
// one-element slice (for payloads where a list position holds a single
// object), and any other value — including nil — yields nil.
func toSlice(v any) []any {
	if list, ok := v.([]any); ok {
		return list
	}
	if obj, ok := v.(map[string]any); ok {
		return []any{obj}
	}
	return nil
}
|
|
|
|
// toString extracts text from a decoded JSON value: a string is returned
// as-is, a json.Number is rendered via its String method, and every other
// type (nil included) becomes "".
func toString(v any) string {
	if s, ok := v.(string); ok {
		return s
	}
	if num, ok := v.(json.Number); ok {
		return num.String()
	}
	return ""
}
|
|
|
|
// isQwantInternalURL reports whether raw points at qwant.com or one of its
// subdomains. The host comparison is case-insensitive. URLs that fail to
// parse, or that carry no host, are reported as not internal.
func isQwantInternalURL(raw string) bool {
	parsed, err := url.Parse(raw)
	if err != nil {
		return false
	}

	switch host := strings.ToLower(parsed.Hostname()); host {
	case "":
		return false
	case "qwant.com", "www.qwant.com", "about.qwant.com":
		return true
	default:
		return strings.HasSuffix(host, ".qwant.com")
	}
}
|
|
|
|
// isLikelyNavTitle reports whether an anchor's text matches one of the known
// Qwant navigation or promotional labels. The title is lower-cased and
// trimmed first; the storage promo is matched by prefix, the other labels
// exactly.
func isLikelyNavTitle(title string) bool {
	normalized := strings.TrimSpace(strings.ToLower(title))

	if strings.HasPrefix(normalized, "get 20gb of free storage") {
		return true
	}

	navLabels := []string{
		"qwant search",
		"search",
		"privacy",
		"discover the service",
		"better web",
		"discover",
	}
	for _, label := range navLabels {
		if normalized == label {
			return true
		}
	}
	return false
}
|
|
|
|
// isKnownSponsoredURL reports whether raw is a known sponsored/partner link:
// either its host is shdw.me (with or without "www."), or the URL contains
// "qwant-tool" anywhere. Both checks ignore case. A URL that fails to parse
// is reported as not sponsored, without the substring check being applied.
func isKnownSponsoredURL(raw string) bool {
	parsed, err := url.Parse(raw)
	if err != nil {
		return false
	}

	if host := strings.ToLower(parsed.Hostname()); host == "shdw.me" || host == "www.shdw.me" {
		return true
	}
	return strings.Contains(strings.ToLower(raw), "qwant-tool")
}
|
|
|