kafka/internal/engines/qwant.go
Franz Kafka 5b942a5fd6
Some checks failed
Build and Push Docker Image / build-and-push (push) Failing after 7s
Mirror to GitHub / mirror (push) Failing after 3s
Tests / test (push) Successful in 25s
refactor: clean up verbose and redundant comments
Trim or remove comments that:
- State the obvious (function names already convey purpose)
- Repeat what the code clearly shows
- Are excessively long without adding value

Keep comments that explain *why*, not *what*.
2026-03-22 11:10:50 +00:00

467 lines
12 KiB
Go

// kafka — a privacy-respecting metasearch engine
// Copyright (C) 2026-present metamorphosis-dev
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package engines
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"github.com/metamorphosis-dev/kafka/internal/contracts"
"github.com/PuerkitoBio/goquery"
)
// QwantEngine implements the Qwant v3 API (web and web-lite modes).
//
// Search refuses to run on a nil engine or an engine without an HTTP client.
type QwantEngine struct {
// client performs every outgoing HTTP request; it must be non-nil.
client *http.Client
// category selects the backend. It is trimmed and lower-cased before use,
// and an empty value is treated as "web".
category string // "web" (JSON API) or "web-lite" (HTML fallback)
// resultsPerPage is the page size used for the JSON API's count/offset
// paging; values <= 0 fall back to 10. The web-lite path does not use it.
resultsPerPage int
}
// Name returns the identifier this engine attaches to its results and to its
// unresponsive-engine reports.
func (e *QwantEngine) Name() string {
	const engineName = "qwant"
	return engineName
}
// Search runs req against Qwant using the backend selected by the engine's
// category: "web" (or an empty category) goes to the JSON API, "web-lite" to
// the HTML scraper.
//
// It returns an error when the engine or its HTTP client is nil. A blank query
// short-circuits with a response that carries only the original query, and an
// unrecognised category is reported through UnresponsiveEngines with a nil
// error.
func (e *QwantEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
	if e == nil || e.client == nil {
		return contracts.SearchResponse{}, errors.New("qwant engine not initialized")
	}
	if strings.TrimSpace(req.Query) == "" {
		return contracts.SearchResponse{Query: req.Query}, nil
	}

	switch strings.TrimSpace(strings.ToLower(e.category)) {
	case "web-lite":
		// The lite site pages by page number, so no count/offset is needed.
		return e.searchWebLite(ctx, req)
	case "", "web":
		// The JSON API pages by offset: page N starts at (N-1)*perPage.
		perPage := e.resultsPerPage
		if perPage <= 0 {
			perPage = 10
		}
		skip := 0
		if req.Pageno > 1 {
			skip = (req.Pageno - 1) * perPage
		}
		return e.searchWebAPI(ctx, req, perPage, skip)
	}

	// Unknown mode: treat as unresponsive.
	return contracts.SearchResponse{
		Query: req.Query,
		UnresponsiveEngines: [][2]string{
			{e.Name(), "unknown_qwant_mode"},
		},
		Results:     []contracts.MainResult{},
		Answers:     []map[string]any{},
		Corrections: []string{},
		Infoboxes:   []map[string]any{},
		Suggestions: []string{},
	}, nil
}
// searchWebAPI queries the Qwant v3 JSON endpoint and converts the "web" rows
// of the response's mainline section into results.
//
// count and offset are forwarded as the API's paging parameters.
//
// Failure handling:
//   - request construction, transport, body-read and JSON-decode failures are
//     returned as errors wrapped with context (errors.Is/As still see the cause);
//   - a non-2xx status other than 403 is returned as an error that includes up
//     to 16 KiB of the response body;
//   - a 403, or a payload whose "status" is not "success", yields an empty
//     response that lists this engine in UnresponsiveEngines with a nil error.
func (e *QwantEngine) searchWebAPI(ctx context.Context, req contracts.SearchRequest, count, offset int) (contracts.SearchResponse, error) {
	// emptyResponse builds a response without results. A non-empty reason marks
	// this engine as unresponsive. Slices are non-nil empty values, as before.
	emptyResponse := func(reason string) contracts.SearchResponse {
		unresponsive := [][2]string{}
		if reason != "" {
			unresponsive = append(unresponsive, [2]string{e.Name(), reason})
		}
		return contracts.SearchResponse{
			Query:               req.Query,
			NumberOfResults:     0,
			Results:             []contracts.MainResult{},
			Answers:             []map[string]any{},
			Corrections:         []string{},
			Infoboxes:           []map[string]any{},
			Suggestions:         []string{},
			UnresponsiveEngines: unresponsive,
		}
	}

	args := url.Values{}
	args.Set("q", req.Query)
	args.Set("count", fmt.Sprintf("%d", count))
	args.Set("locale", qwantLocale(req.Language))
	args.Set("safesearch", fmt.Sprintf("%d", req.Safesearch))
	args.Set("llm", "false")
	args.Set("tgp", "3")
	args.Set("offset", fmt.Sprintf("%d", offset))
	endpoint := "https://api.qwant.com/v3/search/web?" + args.Encode()

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("qwant: building request: %w", err)
	}
	httpReq.Header.Set("User-Agent", "kafka/0.1 (compatible; +https://git.ashisgreat.xyz/penal-colony/gosearch)")

	resp, err := e.client.Do(httpReq)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("qwant: sending request: %w", err)
	}
	defer resp.Body.Close()

	// Qwant often returns a 403 captcha/JS block for the JSON API.
	if resp.StatusCode == http.StatusForbidden {
		return emptyResponse("captcha_or_js_block"), nil
	}
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Best effort: the snippet only enriches the error message, so a
		// failed read is deliberately ignored.
		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
		return contracts.SearchResponse{}, fmt.Errorf("qwant upstream error: status=%d body=%q", resp.StatusCode, string(snippet))
	}

	// Cap the payload so a misbehaving upstream cannot exhaust memory.
	body, err := io.ReadAll(io.LimitReader(resp.Body, 2*1024*1024))
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("qwant: reading response body: %w", err)
	}

	var top map[string]any
	if err := json.Unmarshal(body, &top); err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("qwant: decoding response: %w", err)
	}
	if status, _ := top["status"].(string); status != "success" {
		return emptyResponse("api_error"), nil
	}

	// Walk data.result.items.mainline. Failed assertions leave nil maps, and
	// indexing a nil map is safe, so a missing level simply yields no rows.
	data, _ := top["data"].(map[string]any)
	result, _ := data["result"].(map[string]any)
	items, _ := result["items"].(map[string]any)
	rows := toSlice(items["mainline"])
	if len(rows) == 0 {
		return emptyResponse(""), nil
	}

	results := make([]contracts.MainResult, 0, len(rows))
	for _, row := range rows {
		rowMap, ok := row.(map[string]any)
		if !ok {
			continue
		}
		// A missing or non-string type counts as "web"; every other row type
		// (ads included) is skipped.
		if rowType, _ := rowMap["type"].(string); rowType != "" && rowType != "web" {
			continue
		}
		for _, it := range toSlice(rowMap["items"]) {
			itemMap, ok := it.(map[string]any)
			if !ok {
				continue
			}
			// link is declared inside the loop body, so each result gets a
			// pointer to its own string.
			link := toString(itemMap["url"])
			if link == "" {
				continue
			}
			results = append(results, contracts.MainResult{
				Template: "default.html",
				Title:    toString(itemMap["title"]),
				Content:  toString(itemMap["desc"]),
				URL:      &link,
				Engine:   e.Name(),
				Score:    0,
				Category: "general",
				Engines:  []string{e.Name()},
			})
		}
	}

	out := emptyResponse("")
	out.NumberOfResults = len(results)
	out.Results = results
	return out, nil
}
// searchWebLite scrapes results from the HTML page served by lite.qwant.com.
//
// Results are gathered in two passes over the parsed document:
//  1. every "section article" element, taking the URL from the text of its
//     span.url.partner element, the title from "h2 a" and the content from
//     the first <p>;
//  2. a looser sweep over article/li/div elements that accepts the first
//     absolute http(s) link of each element (see qwantLiteAnchorResult). The
//     sweep stops as soon as 20 results have been collected.
//
// URLs are de-duplicated across both passes. Transport failures, non-2xx
// statuses and HTML parse failures are returned as errors; at most 2 MiB of
// the response body is parsed.
func (e *QwantEngine) searchWebLite(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
	const (
		maxBodyBytes = 2 * 1024 * 1024 // cap on parsed HTML, matching the JSON API path
		maxResults   = 20              // stop condition for the second pass
	)

	qLocale := qwantLocale(req.Language)
	langBase := strings.SplitN(qLocale, "_", 2)[0]

	args := url.Values{}
	args.Set("q", req.Query)
	args.Set("locale", strings.ToLower(qLocale))
	args.Set("l", langBase)
	args.Set("s", fmt.Sprintf("%d", req.Safesearch))
	args.Set("p", fmt.Sprintf("%d", req.Pageno))
	endpoint := "https://lite.qwant.com/?" + args.Encode()

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("qwant lite: building request: %w", err)
	}
	httpReq.Header.Set("User-Agent", "kafka/0.1 (compatible; +https://git.ashisgreat.xyz/penal-colony/gosearch)")

	resp, err := e.client.Do(httpReq)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("qwant lite: sending request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Best effort: the snippet only enriches the error message, so a
		// failed read is deliberately ignored.
		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
		return contracts.SearchResponse{}, fmt.Errorf("qwant lite upstream error: status=%d body=%q", resp.StatusCode, string(snippet))
	}

	// Bound the amount of HTML handed to the parser so an oversized response
	// cannot exhaust memory.
	doc, err := goquery.NewDocumentFromReader(io.LimitReader(resp.Body, maxBodyBytes))
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("qwant lite: parsing HTML: %w", err)
	}

	results := make([]contracts.MainResult, 0)
	seen := map[string]bool{}

	// add appends one result. link is a parameter, so every result points at
	// its own string.
	add := func(link, title, content string) {
		results = append(results, contracts.MainResult{
			Template: "default.html",
			Title:    title,
			Content:  content,
			URL:      &link,
			Engine:   e.Name(),
			Score:    0,
			Category: "general",
			Engines:  []string{e.Name()},
		})
	}

	// Pass 1: structured result articles.
	doc.Find("section article").Each(func(_ int, item *goquery.Selection) {
		// NOTE(review): span.tooltip presumably marks interspersed ads —
		// confirm against the current lite markup.
		if item.Find("span.tooltip").Length() > 0 {
			return
		}
		link := strings.TrimSpace(item.Find("span.url.partner").First().Text())
		if link == "" {
			// Fallback: any span whose class contains both 'url' and 'partner'.
			link = strings.TrimSpace(item.Find("span[class*='url'][class*='partner']").First().Text())
		}
		if link == "" || seen[link] {
			return
		}
		seen[link] = true
		title := strings.TrimSpace(item.Find("h2 a").First().Text())
		content := strings.TrimSpace(item.Find("p").First().Text())
		add(link, title, content)
	})

	// Pass 2: generic containers. Returning false ends the iteration early
	// once the cap is reached instead of visiting every remaining element.
	doc.Find("article, li, div").EachWithBreak(func(_ int, item *goquery.Selection) bool {
		if len(results) >= maxResults {
			return false
		}
		href, title, content, ok := qwantLiteAnchorResult(item)
		if !ok || seen[href] {
			return true
		}
		seen[href] = true
		add(href, title, content)
		return true
	})

	return contracts.SearchResponse{
		Query:               req.Query,
		NumberOfResults:     len(results),
		Results:             results,
		Answers:             []map[string]any{},
		Corrections:         []string{},
		Infoboxes:           []map[string]any{},
		Suggestions:         []string{},
		UnresponsiveEngines: [][2]string{},
	}, nil
}

// qwantLiteAnchorResult extracts a candidate result from a generic container
// element of the lite page. ok is false when the element should be skipped:
// it contains a span.tooltip, its class mentions "nav" or "footer", it has no
// absolute http(s) link, the link is sponsored or Qwant-internal, or the link
// text is empty or looks like a navigation label.
func qwantLiteAnchorResult(item *goquery.Selection) (href, title, content string, ok bool) {
	if item.Find("span.tooltip").Length() > 0 {
		return "", "", "", false
	}
	class, _ := item.Attr("class")
	class = strings.ToLower(class)
	if strings.Contains(class, "nav") || strings.Contains(class, "footer") {
		return "", "", "", false
	}

	anchor := item.Find("a[href]").First()
	rawHref, found := anchor.Attr("href")
	if !found {
		return "", "", "", false
	}
	href = strings.TrimSpace(rawHref)
	// Requiring an http(s) scheme also rejects empty, relative ("/...") and
	// in-page ("#...") links.
	if !strings.HasPrefix(href, "http://") && !strings.HasPrefix(href, "https://") {
		return "", "", "", false
	}
	// Skip known sponsored partner links and Qwant's own nav/house links.
	if isKnownSponsoredURL(href) || isQwantInternalURL(href) {
		return "", "", "", false
	}

	title = strings.TrimSpace(anchor.Text())
	if title == "" || isLikelyNavTitle(title) {
		return "", "", "", false
	}

	content = strings.TrimSpace(item.Find("p").First().Text())
	if content == "" {
		content = strings.TrimSpace(item.Find("span").First().Text())
	}
	return href, title, content, true
}
// qwantLocale converts a language tag such as "fr-FR" or "de" into the
// language_COUNTRY form Qwant expects (e.g. "fr_FR", "de_US").
//
// An empty tag or "auto" maps to "en_US". The part before the first separator
// is lower-cased; the remainder is upper-cased and defaults to "US" when
// absent or blank.
func qwantLocale(lang string) string {
	trimmed := strings.TrimSpace(lang)
	switch trimmed {
	case "", "auto":
		return "en_US"
	}

	// Normalise "-" to "_" first so both separator styles split the same way.
	language, region, _ := strings.Cut(strings.ReplaceAll(trimmed, "-", "_"), "_")
	region = strings.ToUpper(strings.TrimSpace(region))
	if region == "" {
		region = "US"
	}
	return strings.ToLower(language) + "_" + region
}
// toSlice normalises a decoded JSON value into a slice: a []any is returned
// as is, a single object (map[string]any) is wrapped in a one-element slice,
// and anything else yields nil.
func toSlice(v any) []any {
	if list, ok := v.([]any); ok {
		return list
	}
	// Handle the case where the value is a single object rather than a list.
	if obj, ok := v.(map[string]any); ok {
		return []any{obj}
	}
	return nil
}
// toString returns v as a string when it is a string or a json.Number, and
// the empty string for every other type (including nil).
func toString(v any) string {
	if s, ok := v.(string); ok {
		return s
	}
	if n, ok := v.(json.Number); ok {
		return n.String()
	}
	return ""
}
// isQwantInternalURL reports whether raw points at qwant.com or one of its
// subdomains. Unparsable URLs and URLs without a host are not considered
// internal.
func isQwantInternalURL(raw string) bool {
	parsed, err := url.Parse(raw)
	if err != nil {
		return false
	}
	switch host := strings.ToLower(parsed.Hostname()); host {
	case "":
		return false
	case "qwant.com":
		return true
	default:
		// The suffix test covers www., about. and every other subdomain while
		// rejecting look-alikes such as "notqwant.com".
		return strings.HasSuffix(host, ".qwant.com")
	}
}
// isLikelyNavTitle reports whether a link title matches one of the known
// navigation or promotional labels of the Qwant lite page. Matching ignores
// case and surrounding whitespace.
func isLikelyNavTitle(title string) bool {
	normalized := strings.TrimSpace(strings.ToLower(title))

	// The storage promotion is matched by prefix; the rest must match exactly.
	if strings.HasPrefix(normalized, "get 20gb of free storage") {
		return true
	}
	navLabels := [...]string{
		"qwant search",
		"search",
		"privacy",
		"discover the service",
		"better web",
		"discover",
	}
	for _, label := range navLabels {
		if normalized == label {
			return true
		}
	}
	return false
}
// isKnownSponsoredURL reports whether raw is a known sponsored link: its host
// is shdw.me (with or without "www.") or the URL contains "qwant-tool" in any
// letter case. URLs that fail to parse are never reported as sponsored.
func isKnownSponsoredURL(raw string) bool {
	parsed, err := url.Parse(raw)
	if err != nil {
		return false
	}
	host := strings.ToLower(parsed.Hostname())
	if host == "shdw.me" || host == "www.shdw.me" {
		return true
	}
	return strings.Contains(strings.ToLower(raw), "qwant-tool")
}