feat: build Go-based SearXNG-compatible search service
Implement an API-first Go rewrite with local engine adapters, upstream fallback, and Nix-based tooling so searches can run without matching the original UI while preserving response compatibility. Made-with: Cursor
This commit is contained in:
parent
7783367c71
commit
dc44837219
32 changed files with 3330 additions and 0 deletions
191
internal/engines/arxiv.go
Normal file
191
internal/engines/arxiv.go
Normal file
|
|
@ -0,0 +1,191 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/xml"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
)
|
||||
|
||||
const (
	// arxivSearchPrefix scopes the query to all arXiv fields ("all:<query>").
	arxivSearchPrefix = "all"
	// arxivMaxResults is the fixed page size requested from the arXiv API.
	arxivMaxResults = 10
)

// ArxivEngine adapts the arXiv Atom API (export.arxiv.org) to the Engine
// interface.
type ArxivEngine struct {
	client *http.Client // HTTP transport; must be non-nil or Search errors out
}

// Name returns the stable engine identifier.
func (e *ArxivEngine) Name() string { return "arxiv" }
|
||||
|
||||
func (e *ArxivEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
|
||||
if e == nil || e.client == nil {
|
||||
return contracts.SearchResponse{}, errors.New("arxiv engine not initialized")
|
||||
}
|
||||
q := strings.TrimSpace(req.Query)
|
||||
if q == "" {
|
||||
return contracts.SearchResponse{Query: req.Query}, nil
|
||||
}
|
||||
|
||||
start := (req.Pageno - 1) * arxivMaxResults
|
||||
if start < 0 {
|
||||
start = 0
|
||||
}
|
||||
|
||||
args := url.Values{}
|
||||
args.Set("search_query", fmt.Sprintf("%s:%s", arxivSearchPrefix, q))
|
||||
args.Set("start", fmt.Sprintf("%d", start))
|
||||
args.Set("max_results", fmt.Sprintf("%d", arxivMaxResults))
|
||||
|
||||
endpoint := "https://export.arxiv.org/api/query?" + args.Encode()
|
||||
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
|
||||
resp, err := e.client.Do(httpReq)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
|
||||
return contracts.SearchResponse{}, fmt.Errorf("arxiv upstream error: status=%d body=%q", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
raw, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
|
||||
results, err := parseArxivAtom(raw)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
NumberOfResults: len(results),
|
||||
Results: results,
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: []string{},
|
||||
UnresponsiveEngines: [][2]string{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// arxivEntry holds the subset of an Atom <entry> that we surface as a result.
type arxivEntry struct {
	Title     string // <title>, whitespace-trimmed
	ID        string // <id>; the abs URL, doubles as the result link
	Summary   string // <summary>; the abstract text
	Published string // <published>; RFC3339 timestamp as emitted by arXiv
}
|
||||
|
||||
// parseArxivAtom walks an Atom feed token-by-token and extracts the entry
// fields we care about (title, id, summary, published). Entries missing a
// title or id are dropped. Element matching is case-insensitive and ignores
// namespace prefixes (only Name.Local is inspected).
func parseArxivAtom(xmlBytes []byte) ([]contracts.MainResult, error) {
	dec := xml.NewDecoder(bytes.NewReader(xmlBytes))

	var entries []arxivEntry
	// cur is non-nil only while inside an <entry>; this prevents the feed's
	// own top-level <title>/<id> elements from being captured as entry data.
	var cur *arxivEntry

	for {
		tok, err := dec.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, err
		}

		switch t := tok.(type) {
		case xml.StartElement:
			switch strings.ToLower(t.Name.Local) {
			case "entry":
				cur = &arxivEntry{}
			case "title":
				if cur != nil {
					var v string
					// Decode errors are deliberately ignored: a malformed
					// field leaves the value empty (best-effort parsing).
					if err := dec.DecodeElement(&v, &t); err == nil {
						cur.Title = strings.TrimSpace(v)
					}
				}
			case "id":
				if cur != nil {
					var v string
					if err := dec.DecodeElement(&v, &t); err == nil {
						cur.ID = strings.TrimSpace(v)
					}
				}
			case "summary":
				if cur != nil {
					var v string
					if err := dec.DecodeElement(&v, &t); err == nil {
						cur.Summary = strings.TrimSpace(v)
					}
				}
			case "published":
				if cur != nil {
					var v string
					if err := dec.DecodeElement(&v, &t); err == nil {
						cur.Published = strings.TrimSpace(v)
					}
				}
			}
		case xml.EndElement:
			if strings.ToLower(t.Name.Local) == "entry" && cur != nil {
				// Keep only entries with the minimum viable fields.
				if cur.Title != "" && cur.ID != "" {
					entries = append(entries, *cur)
				}
				cur = nil
			}
		}
	}

	// Map the collected entries into the shared result shape.
	out := make([]contracts.MainResult, 0, len(entries))
	for _, e := range entries {
		urlPtr := e.ID
		content := e.Summary
		pubdate := parseArxivPublished(e.Published)

		out = append(out, contracts.MainResult{
			Template:  "default.html",
			Title:     e.Title,
			Content:   content,
			URL:       &urlPtr,
			Pubdate:   pubdate,
			Engine:    "arxiv",
			Category:  "science",
			Score:     0,
			Positions: nil,
			Engines:   []string{"arxiv"},
		})
	}
	return out, nil
}
|
||||
|
||||
// parseArxivPublished converts arXiv's RFC3339 <published> timestamp (e.g.
// "2024-06-03T00:00:00Z") into the SearXNG-compatible string format.
// It returns nil for empty or unparseable input.
func parseArxivPublished(s string) *string {
	trimmed := strings.TrimSpace(s)
	if trimmed == "" {
		return nil
	}

	parsed, err := time.Parse(time.RFC3339, trimmed)
	if err != nil {
		return nil
	}

	out := parsed.Format("2006-01-02 15:04:05-0700")
	return &out
}
|
||||
|
||||
66
internal/engines/arxiv_test.go
Normal file
66
internal/engines/arxiv_test.go
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
)
|
||||
|
||||
// TestArxivEngine_Search stubs the HTTP transport and verifies that the
// engine (a) targets the expected arXiv endpoint, (b) prefixes the query
// with the "all:" scope, and (c) maps an Atom entry into the shared result
// shape (title, content, url, pubdate).
func TestArxivEngine_Search(t *testing.T) {
	transport := roundTripperFunc(func(r *http.Request) (*http.Response, error) {
		// Reject anything that is not the exact upstream call we expect.
		if r.Method != http.MethodGet {
			return httpResponse(http.StatusMethodNotAllowed, "", ""), nil
		}
		if r.URL.Host != "export.arxiv.org" || r.URL.Path != "/api/query" {
			return httpResponse(http.StatusNotFound, "", ""), nil
		}

		// The engine must scope the user query with the "all:" prefix.
		q := r.URL.Query().Get("search_query")
		if q != "all:quantum" {
			return httpResponse(http.StatusBadRequest, "", ""), nil
		}

		atom := `<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<entry>
<title>Quantum Test</title>
<id>http://arxiv.org/abs/1234.5678</id>
<summary>Abstract here</summary>
<published>2024-06-03T00:00:00Z</published>
</entry>
</feed>`
		return httpResponse(http.StatusOK, atom, "application/atom+xml"), nil
	})

	client := &http.Client{Transport: transport}
	engine := &ArxivEngine{client: client}

	resp, err := engine.Search(context.Background(), contracts.SearchRequest{
		Query:  "quantum",
		Pageno: 1,
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(resp.Results) != 1 {
		t.Fatalf("expected 1 result, got %d", len(resp.Results))
	}

	r := resp.Results[0]
	if r.Title != "Quantum Test" {
		t.Fatalf("unexpected title: %q", r.Title)
	}
	if r.Content != "Abstract here" {
		t.Fatalf("unexpected content: %q", r.Content)
	}
	if r.URL == nil || !strings.Contains(*r.URL, "1234.5678") {
		t.Fatalf("unexpected url: %v", r.URL)
	}
	// Pubdate is re-formatted from RFC3339; only the date portion is pinned
	// so the test is insensitive to the exact time/zone formatting.
	if r.Pubdate == nil || !strings.Contains(*r.Pubdate, "2024-06-03") {
		t.Fatalf("expected pubdate around 2024-06-03, got %v", r.Pubdate)
	}
}
|
||||
|
||||
195
internal/engines/braveapi.go
Normal file
195
internal/engines/braveapi.go
Normal file
|
|
@ -0,0 +1,195 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
)
|
||||
|
||||
// BraveEngine implements the SearXNG `braveapi` engine (Brave Web Search API).
//
// Config / gating:
//   - BRAVE_API_KEY: required to call Brave
//   - BRAVE_ACCESS_TOKEN (optional): if set, the request must include a token
//     that matches the env var (via Authorization Bearer, X-Search-Token,
//     X-Brave-Access-Token, or form field `token`).
type BraveEngine struct {
	client          *http.Client // HTTP transport; must be non-nil
	apiKey          string       // sent upstream as X-Subscription-Token
	accessGateToken string       // optional inbound gating token; empty disables gating
	resultsPerPage  int          // page size requested from Brave
}

// Name returns the stable engine identifier.
func (e *BraveEngine) Name() string { return "braveapi" }
|
||||
|
||||
func (e *BraveEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
|
||||
if e == nil || e.client == nil {
|
||||
return contracts.SearchResponse{}, errors.New("brave engine not initialized")
|
||||
}
|
||||
|
||||
// Gate / config checks should not be treated as fatal errors; SearXNG
|
||||
// treats misconfigured engines as unresponsive.
|
||||
if strings.TrimSpace(e.apiKey) == "" {
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
NumberOfResults: 0,
|
||||
Results: []contracts.MainResult{},
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: []string{},
|
||||
UnresponsiveEngines: [][2]string{{e.Name(), "missing_api_key"}},
|
||||
}, nil
|
||||
}
|
||||
|
||||
if gate := strings.TrimSpace(e.accessGateToken); gate != "" {
|
||||
if strings.TrimSpace(req.AccessToken) == "" || req.AccessToken != gate {
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
NumberOfResults: 0,
|
||||
Results: []contracts.MainResult{},
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: []string{},
|
||||
UnresponsiveEngines: [][2]string{{e.Name(), "unauthorized"}},
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
|
||||
q := strings.TrimSpace(req.Query)
|
||||
if q == "" {
|
||||
return contracts.SearchResponse{Query: req.Query}, nil
|
||||
}
|
||||
|
||||
offset := 0
|
||||
if req.Pageno > 1 {
|
||||
offset = (req.Pageno - 1) * e.resultsPerPage
|
||||
}
|
||||
|
||||
args := url.Values{}
|
||||
args.Set("q", q)
|
||||
args.Set("count", fmt.Sprintf("%d", e.resultsPerPage))
|
||||
args.Set("offset", fmt.Sprintf("%d", offset))
|
||||
|
||||
if req.TimeRange != nil {
|
||||
switch *req.TimeRange {
|
||||
case "day":
|
||||
args.Set("time_range", "past_day")
|
||||
case "week":
|
||||
args.Set("time_range", "past_week")
|
||||
case "month":
|
||||
args.Set("time_range", "past_month")
|
||||
case "year":
|
||||
args.Set("time_range", "past_year")
|
||||
}
|
||||
}
|
||||
|
||||
// SearXNG's python checks `if params["safesearch"]:` which treats any
|
||||
// non-zero (moderate/strict) as strict.
|
||||
if req.Safesearch > 0 {
|
||||
args.Set("safesearch", "strict")
|
||||
}
|
||||
|
||||
endpoint := "https://api.search.brave.com/res/v1/web/search?" + args.Encode()
|
||||
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
httpReq.Header.Set("X-Subscription-Token", e.apiKey)
|
||||
|
||||
resp, err := e.client.Do(httpReq)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
|
||||
return contracts.SearchResponse{}, fmt.Errorf("brave upstream error: status=%d body=%q", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
var api struct {
|
||||
Web struct {
|
||||
Results []struct {
|
||||
URL string `json:"url"`
|
||||
Title string `json:"title"`
|
||||
Description string `json:"description"`
|
||||
Age string `json:"age"`
|
||||
Thumbnail struct {
|
||||
Src string `json:"src"`
|
||||
} `json:"thumbnail"`
|
||||
} `json:"results"`
|
||||
} `json:"web"`
|
||||
}
|
||||
|
||||
if err := json.NewDecoder(resp.Body).Decode(&api); err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
|
||||
results := make([]contracts.MainResult, 0, len(api.Web.Results))
|
||||
for _, r := range api.Web.Results {
|
||||
urlPtr := strings.TrimSpace(r.URL)
|
||||
if urlPtr == "" {
|
||||
continue
|
||||
}
|
||||
pub := parseBraveAge(r.Age)
|
||||
|
||||
results = append(results, contracts.MainResult{
|
||||
Template: "default.html",
|
||||
Title: r.Title,
|
||||
Content: r.Description,
|
||||
URL: &urlPtr,
|
||||
Pubdate: pub,
|
||||
Engine: e.Name(),
|
||||
Score: 0,
|
||||
Category: "general",
|
||||
Priority: "",
|
||||
Positions: nil,
|
||||
Engines: []string{e.Name()},
|
||||
})
|
||||
}
|
||||
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
NumberOfResults: len(results),
|
||||
Results: results,
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: []string{},
|
||||
UnresponsiveEngines: [][2]string{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// parseBraveAge converts Brave's `age` field into the SearXNG-compatible
// timestamp string. Brave sometimes returns RFC3339-like timestamps here;
// plain dates ("2006-01-02") are also accepted. Unparseable or empty input
// yields nil.
//
// Fix: the previous layout list contained "2006-01-02T15:04:05Z07:00",
// which is exactly time.RFC3339 and therefore a redundant duplicate; it has
// been removed with no behavior change.
func parseBraveAge(ageRaw string) *string {
	ageRaw = strings.TrimSpace(ageRaw)
	if ageRaw == "" {
		return nil
	}

	layouts := []string{
		time.RFC3339Nano, // also accepts plain RFC3339 input
		time.RFC3339,
		"2006-01-02",
	}
	for _, layout := range layouts {
		if t, err := time.Parse(layout, ageRaw); err == nil {
			s := t.Format("2006-01-02 15:04:05-0700")
			return &s
		}
	}
	return nil
}
|
||||
|
||||
92
internal/engines/braveapi_test.go
Normal file
92
internal/engines/braveapi_test.go
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"testing"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
)
|
||||
|
||||
// TestBraveEngine_GatingAndHeader verifies two behaviors with a stubbed
// transport: (1) a request with the wrong access token never reaches
// upstream and surfaces an unresponsive-engine entry; (2) a correctly
// gated request hits Brave with the X-Subscription-Token header and the
// JSON payload is mapped into results.
func TestBraveEngine_GatingAndHeader(t *testing.T) {
	wantToken := "letmein"
	wantAPIKey := "api-key"

	transport := roundTripperFunc(func(r *http.Request) (*http.Response, error) {
		// Any upstream call must carry the subscription key and target the
		// documented Brave endpoint.
		if r.Header.Get("X-Subscription-Token") != wantAPIKey {
			t.Fatalf("missing/incorrect X-Subscription-Token header: got %q", r.Header.Get("X-Subscription-Token"))
		}
		if r.URL.Host != "api.search.brave.com" {
			t.Fatalf("unexpected host: %s", r.URL.Host)
		}
		if r.URL.Path != "/res/v1/web/search" {
			t.Fatalf("unexpected path: %s", r.URL.Path)
		}
		// basic query assertions
		q := r.URL.Query().Get("q")
		if q != "hugo" {
			t.Fatalf("unexpected q: %q", q)
		}

		body := `{
"web": {
"results": [
{"url":"https://example.com/a","title":"A","description":"B","age":"2024-06-03T00:00:00Z","thumbnail":{"src":"x"}}
]
}
}`
		return httpResponse(http.StatusOK, body, "application/json"), nil
	})

	client := &http.Client{Transport: transport}
	engine := &BraveEngine{
		client:          client,
		apiKey:          wantAPIKey,
		accessGateToken: wantToken,
		resultsPerPage:  20,
	}

	// Wrong token => no upstream call / unresponsive engine.
	{
		resp, err := engine.Search(context.Background(), contracts.SearchRequest{
			Query:       "hugo",
			Pageno:      1,
			Safesearch:  0,
			Language:    "en",
			AccessToken: "wrong",
		})
		if err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
		if len(resp.Results) != 0 {
			t.Fatalf("expected no results on unauthorized, got %d", len(resp.Results))
		}
		if len(resp.UnresponsiveEngines) != 1 {
			t.Fatalf("expected 1 unresponsive engine entry, got %v", resp.UnresponsiveEngines)
		}
	}

	// Correct token => upstream call.
	{
		resp, err := engine.Search(context.Background(), contracts.SearchRequest{
			Query:       "hugo",
			Pageno:      1,
			Safesearch:  0,
			Language:    "en",
			AccessToken: wantToken,
		})
		if err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
		if len(resp.Results) != 1 {
			t.Fatalf("expected 1 result, got %d", len(resp.Results))
		}
		if resp.Results[0].Title != "A" {
			t.Fatalf("unexpected title: %q", resp.Results[0].Title)
		}
		if resp.Results[0].URL == nil || *resp.Results[0].URL != "https://example.com/a" {
			t.Fatalf("unexpected url: %v", resp.Results[0].URL)
		}
	}
}
|
||||
|
||||
144
internal/engines/crossref.go
Normal file
144
internal/engines/crossref.go
Normal file
|
|
@ -0,0 +1,144 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
)
|
||||
|
||||
// CrossrefEngine adapts the Crossref REST API (api.crossref.org/works) to
// the Engine interface.
type CrossrefEngine struct {
	client *http.Client // HTTP transport; must be non-nil or Search errors out
}

// Name returns the stable engine identifier.
func (e *CrossrefEngine) Name() string { return "crossref" }
|
||||
|
||||
func (e *CrossrefEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
|
||||
if e == nil || e.client == nil {
|
||||
return contracts.SearchResponse{}, errors.New("crossref engine not initialized")
|
||||
}
|
||||
q := strings.TrimSpace(req.Query)
|
||||
if q == "" {
|
||||
return contracts.SearchResponse{Query: req.Query}, nil
|
||||
}
|
||||
|
||||
offset := 20 * (req.Pageno - 1)
|
||||
args := url.Values{}
|
||||
args.Set("query", q)
|
||||
args.Set("offset", fmt.Sprintf("%d", offset))
|
||||
|
||||
endpoint := "https://api.crossref.org/works?" + args.Encode()
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
|
||||
resp, err := e.client.Do(httpReq)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
|
||||
return contracts.SearchResponse{}, fmt.Errorf("crossref upstream error: status=%d body=%q", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
var api struct {
|
||||
Message struct {
|
||||
Items []crossrefItem `json:"items"`
|
||||
} `json:"message"`
|
||||
}
|
||||
|
||||
if err := json.NewDecoder(resp.Body).Decode(&api); err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
|
||||
results := make([]contracts.MainResult, 0, len(api.Message.Items))
|
||||
for _, item := range api.Message.Items {
|
||||
title := ""
|
||||
if len(item.Title) > 0 {
|
||||
title = strings.TrimSpace(item.Title[0])
|
||||
}
|
||||
|
||||
content := strings.TrimSpace(item.Abstract)
|
||||
|
||||
urlStr := strings.TrimSpace(item.URL)
|
||||
if urlStr == "" {
|
||||
urlStr = strings.TrimSpace(item.DOI)
|
||||
}
|
||||
|
||||
pub := parseCrossrefDateParts(item.Published.DateParts)
|
||||
|
||||
urlPtr := urlStr
|
||||
results = append(results, contracts.MainResult{
|
||||
Template: "default.html",
|
||||
Title: title,
|
||||
Content: content,
|
||||
URL: &urlPtr,
|
||||
Pubdate: pub,
|
||||
Engine: "crossref",
|
||||
Score: 0,
|
||||
Category: "science",
|
||||
Priority: "",
|
||||
Positions: nil,
|
||||
Engines: []string{"crossref"},
|
||||
})
|
||||
}
|
||||
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
NumberOfResults: len(results),
|
||||
Results: results,
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: []string{},
|
||||
UnresponsiveEngines: [][2]string{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// crossrefItem is the subset of a Crossref work record consumed here.
type crossrefItem struct {
	Type      string            `json:"type"`
	Title     []string          `json:"title"` // Crossref titles are arrays; the first element is used
	URL       string            `json:"URL"`
	DOI       string            `json:"DOI"`
	Abstract  string            `json:"abstract"`
	Page      string            `json:"page"`
	Publisher string            `json:"publisher"`
	Subject   []string          `json:"subject"`
	Published crossrefPublished `json:"published"`
}

// crossrefPublished carries Crossref's partial-date representation:
// date-parts is [[year, month?, day?]].
type crossrefPublished struct {
	DateParts [][]int `json:"date-parts"`
}
|
||||
|
||||
// parseCrossrefDateParts converts Crossref's partial date ([[year, month?,
// day?]]) into the SearXNG-compatible timestamp string. Missing month/day
// default to 1; the time is midnight UTC. Returns nil when no usable date
// parts are present.
//
// Fix: the previous format layout "2006-01-02 00:00:00+0000" relied on the
// "00:00:00+0000" portion being emitted as literal text (it matches no
// reference-time component), which only worked because the value is always
// midnight UTC. The proper layout below produces identical output while
// using Go's reference time correctly.
func parseCrossrefDateParts(parts [][]int) *string {
	if len(parts) == 0 || len(parts[0]) == 0 {
		return nil
	}

	dp := parts[0]
	year := dp[0]
	month := 1
	day := 1
	if len(dp) >= 2 {
		month = dp[1]
	}
	if len(dp) >= 3 {
		day = dp[2]
	}

	// time.Date normalizes out-of-range month/day values.
	t := time.Date(year, time.Month(month), day, 0, 0, 0, 0, time.UTC)
	formatted := t.Format("2006-01-02 15:04:05-0700")
	return &formatted
}
|
||||
|
||||
71
internal/engines/crossref_test.go
Normal file
71
internal/engines/crossref_test.go
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"testing"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
)
|
||||
|
||||
// TestCrossrefEngine_Search stubs the HTTP transport and verifies that the
// engine targets the Crossref works endpoint with the user query and maps a
// work record (first title, abstract, date-parts pubdate) into the shared
// result shape.
func TestCrossrefEngine_Search(t *testing.T) {
	transport := roundTripperFunc(func(r *http.Request) (*http.Response, error) {
		// Reject anything that is not the exact upstream call we expect.
		if r.Method != http.MethodGet {
			return httpResponse(http.StatusMethodNotAllowed, "", ""), nil
		}
		if r.URL.Host != "api.crossref.org" || r.URL.Path != "/works" {
			return httpResponse(http.StatusNotFound, "", ""), nil
		}
		q := r.URL.Query().Get("query")
		if q != "hugo" {
			return httpResponse(http.StatusBadRequest, "", ""), nil
		}

		body := `{
"message": {
"items": [
{
"type": "journal-article",
"title": ["Paper B"],
"URL": "https://example.com/paperb",
"abstract": "Abstract B",
"DOI": "10.1234/b",
"published": {
"date-parts": [[2020, 5, 1]]
}
}
]
}
}`
		return httpResponse(http.StatusOK, body, "application/json"), nil
	})

	client := &http.Client{Transport: transport}
	engine := &CrossrefEngine{client: client}

	resp, err := engine.Search(context.Background(), contracts.SearchRequest{
		Query:  "hugo",
		Pageno: 1,
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(resp.Results) != 1 {
		t.Fatalf("expected 1 result, got %d", len(resp.Results))
	}

	r := resp.Results[0]
	if r.Title != "Paper B" {
		t.Fatalf("expected title Paper B, got %q", r.Title)
	}
	if r.Content != "Abstract B" {
		t.Fatalf("expected content, got %q", r.Content)
	}
	// The exact pubdate formatting is covered elsewhere; only presence is
	// asserted here.
	if r.Pubdate == nil || *r.Pubdate == "" {
		t.Fatalf("expected pubdate, got nil/empty")
	}
	if r.Engine != "crossref" {
		t.Fatalf("expected engine crossref, got %q", r.Engine)
	}
}
|
||||
|
||||
17
internal/engines/engine.go
Normal file
17
internal/engines/engine.go
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
)
|
||||
|
||||
// Engine is a Go-native implementation of a SearXNG engine.
//
// Implementations should return a SearchResponse containing only the results
// for that engine subset; the caller will merge multiple engine responses.
type Engine interface {
	// Name returns the stable engine identifier (e.g. "arxiv", "braveapi").
	Name() string
	// Search executes one query against the engine and adapts the upstream
	// payload into the shared response contract. Implementations should
	// honor ctx for cancellation of the underlying HTTP calls.
	Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error)
}
|
||||
|
||||
33
internal/engines/factory.go
Normal file
33
internal/engines/factory.go
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"os"
|
||||
"time"
|
||||
)
|
||||
|
||||
// NewDefaultPortedEngines returns the starter set of Go-native engines.
|
||||
// The service can swap/extend this registry later as more engines are ported.
|
||||
func NewDefaultPortedEngines(client *http.Client) map[string]Engine {
|
||||
if client == nil {
|
||||
client = &http.Client{Timeout: 10 * time.Second}
|
||||
}
|
||||
|
||||
return map[string]Engine{
|
||||
"wikipedia": &WikipediaEngine{client: client},
|
||||
"arxiv": &ArxivEngine{client: client},
|
||||
"crossref": &CrossrefEngine{client: client},
|
||||
"braveapi": &BraveEngine{
|
||||
client: client,
|
||||
apiKey: os.Getenv("BRAVE_API_KEY"),
|
||||
accessGateToken: os.Getenv("BRAVE_ACCESS_TOKEN"),
|
||||
resultsPerPage: 20,
|
||||
},
|
||||
"qwant": &QwantEngine{
|
||||
client: client,
|
||||
category: "web-lite",
|
||||
resultsPerPage: 10,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
26
internal/engines/http_mock_test.go
Normal file
26
internal/engines/http_mock_test.go
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type roundTripperFunc func(*http.Request) (*http.Response, error)
|
||||
|
||||
func (f roundTripperFunc) RoundTrip(r *http.Request) (*http.Response, error) {
|
||||
return f(r)
|
||||
}
|
||||
|
||||
func httpResponse(status int, body string, contentType string) *http.Response {
|
||||
h := make(http.Header)
|
||||
if contentType != "" {
|
||||
h.Set("Content-Type", contentType)
|
||||
}
|
||||
return &http.Response{
|
||||
StatusCode: status,
|
||||
Header: h,
|
||||
Body: io.NopCloser(strings.NewReader(body)),
|
||||
}
|
||||
}
|
||||
|
||||
148
internal/engines/planner.go
Normal file
148
internal/engines/planner.go
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
)
|
||||
|
||||
// defaultPortedEngines is the fallback set of Go-native engines used when
// LOCAL_PORTED_ENGINES is not configured.
var defaultPortedEngines = []string{"wikipedia", "arxiv", "crossref", "braveapi", "qwant"}

// Planner decides which requested engines run locally (Go-native ports)
// versus which are forwarded to upstream SearXNG.
type Planner struct {
	PortedSet  map[string]bool // membership set of locally ported engine names
	PortedList []string        // configured order of ported engines
}
|
||||
|
||||
func NewPlannerFromEnv() *Planner {
|
||||
raw := strings.TrimSpace(os.Getenv("LOCAL_PORTED_ENGINES"))
|
||||
if raw == "" {
|
||||
return NewPlanner(defaultPortedEngines)
|
||||
}
|
||||
parts := splitCSV(raw)
|
||||
if len(parts) == 0 {
|
||||
return NewPlanner(defaultPortedEngines)
|
||||
}
|
||||
return NewPlanner(parts)
|
||||
}
|
||||
|
||||
func NewPlanner(portedEngines []string) *Planner {
|
||||
set := make(map[string]bool, len(portedEngines))
|
||||
out := make([]string, 0, len(portedEngines))
|
||||
for _, e := range portedEngines {
|
||||
e = strings.TrimSpace(strings.ToLower(e))
|
||||
if e == "" {
|
||||
continue
|
||||
}
|
||||
if set[e] {
|
||||
continue
|
||||
}
|
||||
set[e] = true
|
||||
out = append(out, e)
|
||||
}
|
||||
return &Planner{
|
||||
PortedSet: set,
|
||||
PortedList: out,
|
||||
}
|
||||
}
|
||||
|
||||
// Plan returns:
|
||||
// - localEngines: engines that are configured as ported for this service
|
||||
// - upstreamEngines: engines that should be executed by upstream SearXNG
|
||||
// - requestedEngines: the (possibly inferred) requested engines list
|
||||
//
|
||||
// If the request provides an explicit `engines` parameter, we use it.
|
||||
// Otherwise we infer a small subset from `categories` for the starter set.
|
||||
func (p *Planner) Plan(req contracts.SearchRequest) (localEngines, upstreamEngines, requestedEngines []string) {
|
||||
if p == nil {
|
||||
p = NewPlannerFromEnv()
|
||||
}
|
||||
|
||||
requestedEngines = nil
|
||||
if len(req.Engines) > 0 {
|
||||
requestedEngines = normalizeList(req.Engines)
|
||||
} else {
|
||||
requestedEngines = inferFromCategories(req.Categories)
|
||||
}
|
||||
|
||||
localEngines = make([]string, 0, len(requestedEngines))
|
||||
upstreamEngines = make([]string, 0, len(requestedEngines))
|
||||
for _, e := range requestedEngines {
|
||||
if p.PortedSet[e] {
|
||||
localEngines = append(localEngines, e)
|
||||
} else {
|
||||
upstreamEngines = append(upstreamEngines, e)
|
||||
}
|
||||
}
|
||||
|
||||
return localEngines, upstreamEngines, requestedEngines
|
||||
}
|
||||
|
||||
func inferFromCategories(categories []string) []string {
|
||||
// Minimal mapping for the initial porting subset.
|
||||
// This mirrors the idea of selecting from SearXNG categories without
|
||||
// embedding the whole engine registry.
|
||||
set := map[string]bool{}
|
||||
for _, c := range categories {
|
||||
switch strings.TrimSpace(strings.ToLower(c)) {
|
||||
case "general":
|
||||
set["wikipedia"] = true
|
||||
set["braveapi"] = true
|
||||
set["qwant"] = true
|
||||
case "science", "scientific publications":
|
||||
set["arxiv"] = true
|
||||
set["crossref"] = true
|
||||
}
|
||||
}
|
||||
|
||||
out := make([]string, 0, len(set))
|
||||
for e := range set {
|
||||
out = append(out, e)
|
||||
}
|
||||
// stable order
|
||||
order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "arxiv": 3, "crossref": 4}
|
||||
sortByOrder(out, order)
|
||||
return out
|
||||
}
|
||||
|
||||
// sortByOrder sorts list in place by the rank each element has in order
// (missing keys rank as 0). Insertion sort keeps this dependency-free and
// stable; the input is always tiny.
func sortByOrder(list []string, order map[string]int) {
	for i := 1; i < len(list); i++ {
		cur := list[i]
		j := i - 1
		for j >= 0 && order[list[j]] > order[cur] {
			list[j+1] = list[j]
			j--
		}
		list[j+1] = cur
	}
}
|
||||
|
||||
// normalizeList lowercases and trims each name, dropping blanks and
// duplicates while preserving first-seen order.
func normalizeList(in []string) []string {
	seen := make(map[string]bool, len(in))
	out := make([]string, 0, len(in))
	for _, raw := range in {
		name := strings.ToLower(strings.TrimSpace(raw))
		if name == "" || seen[name] {
			continue
		}
		seen[name] = true
		out = append(out, name)
	}
	return out
}
|
||||
|
||||
// splitCSV splits a comma-separated string into trimmed, non-empty fields.
// An empty input returns nil.
func splitCSV(s string) []string {
	if s == "" {
		return nil
	}
	parts := strings.Split(s, ",")
	out := make([]string, 0, len(parts))
	for _, field := range parts {
		if field = strings.TrimSpace(field); field != "" {
			out = append(out, field)
		}
	}
	return out
}
|
||||
|
||||
467
internal/engines/qwant.go
Normal file
467
internal/engines/qwant.go
Normal file
|
|
@ -0,0 +1,467 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
// QwantEngine implements a SearXNG-like `qwant` (web) adapter using
// Qwant v3 endpoint: https://api.qwant.com/v3/search/web.
//
// Qwant's API is not fully documented; this mirrors SearXNG's parsing logic
// for the `web` category from `.agent/searxng/searx/engines/qwant.py`.
type QwantEngine struct {
	client         *http.Client // HTTP transport; must be non-nil
	category       string       // "web" (JSON API) or "web-lite" (HTML fallback)
	resultsPerPage int          // page size; values <= 0 fall back to 10 in Search
}

// Name returns the stable engine identifier.
func (e *QwantEngine) Name() string { return "qwant" }
|
||||
|
||||
func (e *QwantEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
|
||||
if e == nil || e.client == nil {
|
||||
return contracts.SearchResponse{}, errors.New("qwant engine not initialized")
|
||||
}
|
||||
|
||||
q := strings.TrimSpace(req.Query)
|
||||
if q == "" {
|
||||
return contracts.SearchResponse{Query: req.Query}, nil
|
||||
}
|
||||
|
||||
// For API parity we use SearXNG web defaults: count=10, offset=(pageno-1)*count.
|
||||
// The engine's config field exists so we can expand to news/images/videos later.
|
||||
count := e.resultsPerPage
|
||||
if count <= 0 {
|
||||
count = 10
|
||||
}
|
||||
offset := 0
|
||||
if req.Pageno > 1 {
|
||||
offset = (req.Pageno - 1) * count
|
||||
}
|
||||
mode := strings.TrimSpace(strings.ToLower(e.category))
|
||||
if mode == "" {
|
||||
mode = "web"
|
||||
}
|
||||
|
||||
switch mode {
|
||||
case "web-lite":
|
||||
return e.searchWebLite(ctx, req)
|
||||
case "web":
|
||||
return e.searchWebAPI(ctx, req, count, offset)
|
||||
default:
|
||||
// Unknown mode: treat as unresponsive.
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
UnresponsiveEngines: [][2]string{
|
||||
{e.Name(), "unknown_qwant_mode"},
|
||||
},
|
||||
Results: []contracts.MainResult{},
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: []string{},
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
|
||||
func (e *QwantEngine) searchWebAPI(ctx context.Context, req contracts.SearchRequest, count, offset int) (contracts.SearchResponse, error) {
|
||||
qLocale := qwantLocale(req.Language)
|
||||
args := url.Values{}
|
||||
args.Set("q", req.Query)
|
||||
args.Set("count", fmt.Sprintf("%d", count))
|
||||
args.Set("locale", qLocale)
|
||||
args.Set("safesearch", fmt.Sprintf("%d", req.Safesearch))
|
||||
args.Set("llm", "false")
|
||||
args.Set("tgp", "3")
|
||||
args.Set("offset", fmt.Sprintf("%d", offset))
|
||||
|
||||
endpoint := "https://api.qwant.com/v3/search/web?" + args.Encode()
|
||||
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
httpReq.Header.Set("User-Agent", "gosearch-go/0.1 (+https://github.com/ashie/gosearch)")
|
||||
|
||||
resp, err := e.client.Do(httpReq)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Qwant often returns a 403 captcha/JS block for the JSON API.
|
||||
if resp.StatusCode == http.StatusForbidden {
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
UnresponsiveEngines: [][2]string{
|
||||
{e.Name(), "captcha_or_js_block"},
|
||||
},
|
||||
Results: []contracts.MainResult{},
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: []string{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
|
||||
return contracts.SearchResponse{}, fmt.Errorf("qwant upstream error: status=%d body=%q", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, 2*1024*1024))
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
|
||||
var top map[string]any
|
||||
if err := json.Unmarshal(body, &top); err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
|
||||
status, _ := top["status"].(string)
|
||||
if status != "success" {
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
UnresponsiveEngines: [][2]string{
|
||||
{e.Name(), "api_error"},
|
||||
},
|
||||
Results: []contracts.MainResult{},
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: []string{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
data, _ := top["data"].(map[string]any)
|
||||
result, _ := data["result"].(map[string]any)
|
||||
items, _ := result["items"].(map[string]any)
|
||||
mainline := items["mainline"]
|
||||
|
||||
rows := toSlice(mainline)
|
||||
if len(rows) == 0 {
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
NumberOfResults: 0,
|
||||
Results: []contracts.MainResult{},
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: []string{},
|
||||
UnresponsiveEngines: [][2]string{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
results := make([]contracts.MainResult, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
rowMap, ok := row.(map[string]any)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
rowType, _ := rowMap["type"].(string)
|
||||
if rowType == "" {
|
||||
rowType = "web"
|
||||
}
|
||||
if rowType != "web" {
|
||||
continue
|
||||
}
|
||||
if rowType == "ads" {
|
||||
continue
|
||||
}
|
||||
|
||||
rowItems := toSlice(rowMap["items"])
|
||||
for _, it := range rowItems {
|
||||
itemMap, ok := it.(map[string]any)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
title := toString(itemMap["title"])
|
||||
resURL := toString(itemMap["url"])
|
||||
desc := toString(itemMap["desc"])
|
||||
if resURL == "" {
|
||||
continue
|
||||
}
|
||||
urlPtr := resURL
|
||||
results = append(results, contracts.MainResult{
|
||||
Template: "default.html",
|
||||
Title: title,
|
||||
Content: desc,
|
||||
URL: &urlPtr,
|
||||
Engine: e.Name(),
|
||||
Score: 0,
|
||||
Category: "general",
|
||||
Engines: []string{e.Name()},
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
NumberOfResults: len(results),
|
||||
Results: results,
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: []string{},
|
||||
UnresponsiveEngines: [][2]string{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// searchWebLite scrapes the HTML results page at lite.qwant.com as a
// fallback to the JSON API.
//
// It runs two extraction passes over the document:
//  1. the known legacy qwant-lite layout (`section article` blocks with a
//     "url partner" span), mirroring SearXNG's XPath-based parser;
//  2. a conservative fallback over generic article/li/div blocks that
//     requires a non-empty title plus an absolute external, non-Qwant,
//     non-sponsored link, capped at 20 results.
//
// Transport errors and non-2xx statuses are returned as errors (with a
// truncated body excerpt); an unparseable/empty page simply yields zero
// results.
func (e *QwantEngine) searchWebLite(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
	qLocale := qwantLocale(req.Language)
	// Bare language code for the "l" param (e.g. "en" from "en_US").
	langBase := strings.SplitN(qLocale, "_", 2)[0]

	args := url.Values{}
	args.Set("q", req.Query)
	args.Set("locale", strings.ToLower(qLocale))
	args.Set("l", langBase)
	args.Set("s", fmt.Sprintf("%d", req.Safesearch))
	args.Set("p", fmt.Sprintf("%d", req.Pageno))

	endpoint := "https://lite.qwant.com/?" + args.Encode()
	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return contracts.SearchResponse{}, err
	}
	httpReq.Header.Set("User-Agent", "gosearch-go/0.1 (+https://github.com/ashie/gosearch)")

	resp, err := e.client.Do(httpReq)
	if err != nil {
		return contracts.SearchResponse{}, err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Include a bounded body excerpt for diagnostics.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
		return contracts.SearchResponse{}, fmt.Errorf("qwant lite upstream error: status=%d body=%q", resp.StatusCode, string(body))
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return contracts.SearchResponse{}, err
	}

	results := make([]contracts.MainResult, 0)
	// seen deduplicates by URL text across both extraction passes.
	seen := map[string]bool{}

	// Pattern 1: legacy/known qwant-lite structure.
	doc.Find("section article").Each(func(_ int, item *goquery.Selection) {
		// ignore randomly interspersed advertising adds
		if item.Find("span.tooltip").Length() > 0 {
			return
		}

		// In SearXNG: "./span[contains(@class, 'url partner')]"
		urlText := strings.TrimSpace(item.Find("span.url.partner").First().Text())
		if urlText == "" {
			// fallback: any span with class containing both 'url' and 'partner'
			urlText = strings.TrimSpace(item.Find("span[class*='url'][class*='partner']").First().Text())
		}
		title := strings.TrimSpace(item.Find("h2 a").First().Text())
		content := strings.TrimSpace(item.Find("p").First().Text())

		if urlText == "" {
			return
		}
		if seen[urlText] {
			return
		}
		seen[urlText] = true
		// Fresh variable so the stored pointer does not alias loop state.
		u := urlText
		results = append(results, contracts.MainResult{
			Template: "default.html",
			Title:    title,
			Content:  content,
			URL:      &u,
			Engine:   e.Name(),
			Score:    0,
			Category: "general",
			Engines:  []string{e.Name()},
		})
	})

	// Pattern 2: broader fallback for updated lite markup:
	// any article/list item/div block containing an external anchor.
	// We keep this conservative by requiring non-empty title + URL.
	doc.Find("article, li, div").Each(func(_ int, item *goquery.Selection) {
		if len(results) >= 20 {
			return
		}
		// Skip ad-like blocks in fallback pass too.
		if item.Find("span.tooltip").Length() > 0 {
			return
		}

		// Skip obvious nav/footer blocks.
		classAttr, _ := item.Attr("class")
		classLower := strings.ToLower(classAttr)
		if strings.Contains(classLower, "nav") || strings.Contains(classLower, "footer") {
			return
		}

		a := item.Find("a[href]").First()
		if a.Length() == 0 {
			return
		}
		href, ok := a.Attr("href")
		if !ok {
			return
		}
		href = strings.TrimSpace(href)
		if href == "" {
			return
		}

		// Ignore in-page and relative links.
		if strings.HasPrefix(href, "/") || strings.HasPrefix(href, "#") {
			return
		}
		if !strings.HasPrefix(href, "http://") && !strings.HasPrefix(href, "https://") {
			return
		}
		// Skip known sponsored partner links surfaced in lite pages.
		if isKnownSponsoredURL(href) {
			return
		}
		if isQwantInternalURL(href) {
			// Ignore qwant nav/house links.
			return
		}

		title := strings.TrimSpace(a.Text())
		if title == "" {
			return
		}
		if isLikelyNavTitle(title) {
			return
		}

		if seen[href] {
			return
		}
		seen[href] = true

		// Best-effort snippet extraction from nearby paragraph/span text.
		content := strings.TrimSpace(item.Find("p").First().Text())
		if content == "" {
			content = strings.TrimSpace(item.Find("span").First().Text())
		}
		// If there is no snippet, still keep clearly external result links.
		// Qwant-lite frequently omits rich snippets for some entries.

		u := href
		results = append(results, contracts.MainResult{
			Template: "default.html",
			Title:    title,
			Content:  content,
			URL:      &u,
			Engine:   e.Name(),
			Score:    0,
			Category: "general",
			Engines:  []string{e.Name()},
		})
	})

	return contracts.SearchResponse{
		Query:               req.Query,
		NumberOfResults:     len(results),
		Results:             results,
		Answers:             []map[string]any{},
		Corrections:         []string{},
		Infoboxes:           []map[string]any{},
		Suggestions:         []string{},
		UnresponsiveEngines: [][2]string{},
	}, nil
}
|
||||
|
||||
// qwantLocale converts a request language tag into the locale form Qwant
// expects (e.g. "en_US"): lowercase language, underscore, uppercase region.
// Empty or "auto" falls back to "en_US"; a missing region defaults to "US".
func qwantLocale(lang string) string {
	lang = strings.TrimSpace(lang)
	if lang == "" || lang == "auto" {
		return "en_US"
	}
	normalized := strings.ReplaceAll(lang, "-", "_")
	base, region, found := strings.Cut(normalized, "_")
	region = strings.TrimSpace(region)
	if !found || region == "" {
		region = "US"
	}
	// Qwant expects locales like en_US.
	return strings.ToLower(base) + "_" + strings.ToUpper(region)
}
|
||||
|
||||
// toSlice coerces a decoded JSON value into a []any. A slice is returned
// as-is; a single object is wrapped in a one-element slice so callers can
// always range over the result; anything else yields nil.
func toSlice(v any) []any {
	if arr, ok := v.([]any); ok {
		return arr
	}
	if obj, ok := v.(map[string]any); ok {
		// Handle case where the value might be a single object.
		return []any{obj}
	}
	return nil
}
|
||||
|
||||
// toString extracts a string from a decoded JSON value. Plain strings are
// returned as-is and json.Number values are rendered via String(); every
// other type (including nil) yields "".
func toString(v any) string {
	if s, ok := v.(string); ok {
		return s
	}
	if n, ok := v.(json.Number); ok {
		return n.String()
	}
	return ""
}
|
||||
|
||||
// isQwantInternalURL reports whether raw points at Qwant's own site (house
// navigation links such as www.qwant.com or about.qwant.com) rather than an
// external search result. Unparseable or host-less URLs are treated as
// external (false).
func isQwantInternalURL(raw string) bool {
	u, err := url.Parse(raw)
	if err != nil {
		return false
	}
	host := strings.ToLower(u.Hostname())
	if host == "" {
		return false
	}
	// The suffix check already covers every subdomain (www., about., ...),
	// so only the bare apex needs its own case; the previous explicit
	// www.qwant.com / about.qwant.com comparisons were redundant.
	return host == "qwant.com" || strings.HasSuffix(host, ".qwant.com")
}
|
||||
|
||||
// isLikelyNavTitle reports whether an anchor's text looks like Qwant site
// chrome (navigation/marketing) rather than a search result title. The
// comparison is case-insensitive and ignores surrounding whitespace.
func isLikelyNavTitle(title string) bool {
	normalized := strings.TrimSpace(strings.ToLower(title))
	switch normalized {
	case "qwant search", "search", "privacy", "discover the service", "better web", "discover":
		return true
	default:
		// Marketing banner for Qwant's storage offer.
		return strings.HasPrefix(normalized, "get 20gb of free storage")
	}
}
|
||||
|
||||
// isKnownSponsoredURL reports whether raw is a known sponsored/partner link
// surfaced on Qwant lite pages: either a shdw.me redirect host or any URL
// containing "qwant-tool". Unparseable URLs are treated as not sponsored.
func isKnownSponsoredURL(raw string) bool {
	parsed, err := url.Parse(raw)
	if err != nil {
		return false
	}
	switch strings.ToLower(parsed.Hostname()) {
	case "shdw.me", "www.shdw.me":
		return true
	}
	return strings.Contains(strings.ToLower(raw), "qwant-tool")
}
|
||||
|
||||
89
internal/engines/qwant_lite_test.go
Normal file
89
internal/engines/qwant_lite_test.go
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"testing"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
)
|
||||
|
||||
// TestQwantEngine_WebLite exercises the "web-lite" HTML-scraping mode with a
// stubbed transport: it asserts the outgoing request shape (host, path, and
// query params) and that ad articles (marked by span.tooltip) are filtered
// while the organic result is parsed into title/content/URL.
func TestQwantEngine_WebLite(t *testing.T) {
	transport := roundTripperFunc(func(r *http.Request) (*http.Response, error) {
		if r.Method != http.MethodGet {
			return httpResponse(http.StatusMethodNotAllowed, "", ""), nil
		}
		if r.URL.Host != "lite.qwant.com" {
			return httpResponse(http.StatusNotFound, "", ""), nil
		}
		if r.URL.Path != "/" {
			// goquery request URL parsing should normalize to "/"
			t.Fatalf("unexpected path: %s", r.URL.Path)
		}

		// Verify the query parameters searchWebLite is expected to send.
		q := r.URL.Query().Get("q")
		if q != "hugo" {
			t.Fatalf("unexpected q: %q", q)
		}
		if r.URL.Query().Get("locale") != "en_us" {
			t.Fatalf("unexpected locale: %q", r.URL.Query().Get("locale"))
		}
		if r.URL.Query().Get("l") != "en" {
			t.Fatalf("unexpected l: %q", r.URL.Query().Get("l"))
		}
		if r.URL.Query().Get("s") != "0" {
			t.Fatalf("unexpected s: %q", r.URL.Query().Get("s"))
		}
		if r.URL.Query().Get("p") != "1" {
			t.Fatalf("unexpected p: %q", r.URL.Query().Get("p"))
		}

		// Fixture: one organic article and one ad article (span.tooltip).
		body := `
<!doctype html>
<html>
<body>
<section>
<article>
<span class="url partner">https://example.com/q</span>
<h2><a href="https://example.com/q">Qwant Title</a></h2>
<p>Qwant description</p>
</article>
<article>
<span class="tooltip">ad</span>
<span class="url partner">https://example.com/ad</span>
<h2><a href="https://example.com/ad">Ad Title</a></h2>
<p>Ad description</p>
</article>
</section>
</body>
</html>`

		return httpResponse(http.StatusOK, body, "text/html"), nil
	})

	client := &http.Client{Transport: transport}
	engine := &QwantEngine{client: client, category: "web-lite", resultsPerPage: 10}

	resp, err := engine.Search(context.Background(), contracts.SearchRequest{
		Query:      "hugo",
		Pageno:     1,
		Safesearch: 0,
		Language:   "en",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(resp.Results) != 1 {
		t.Fatalf("expected 1 result (non-ad), got %d", len(resp.Results))
	}
	if resp.Results[0].Title != "Qwant Title" {
		t.Fatalf("unexpected title: %q", resp.Results[0].Title)
	}
	if resp.Results[0].Content != "Qwant description" {
		t.Fatalf("unexpected content: %q", resp.Results[0].Content)
	}
	if resp.Results[0].URL == nil || *resp.Results[0].URL != "https://example.com/q" {
		t.Fatalf("unexpected url: %v", resp.Results[0].URL)
	}
}
|
||||
|
||||
94
internal/engines/qwant_test.go
Normal file
94
internal/engines/qwant_test.go
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"testing"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
)
|
||||
|
||||
// TestQwantEngine_Web exercises the JSON-API ("web") mode with a stubbed
// transport: it asserts the outgoing request (host, path, and all query
// params) and that a single mainline "web" item is mapped into a
// MainResult with the expected title/content/URL/engine fields.
func TestQwantEngine_Web(t *testing.T) {
	transport := roundTripperFunc(func(r *http.Request) (*http.Response, error) {
		if r.Method != http.MethodGet {
			return httpResponse(http.StatusMethodNotAllowed, "", ""), nil
		}
		if r.URL.Host != "api.qwant.com" {
			return httpResponse(http.StatusNotFound, "", ""), nil
		}
		if r.URL.Path != "/v3/search/web" {
			t.Fatalf("unexpected path: %s", r.URL.Path)
		}

		// Verify the query parameters searchWebAPI is expected to send.
		q := r.URL.Query().Get("q")
		if q != "hugo" {
			t.Fatalf("unexpected q: %q", q)
		}
		if r.URL.Query().Get("count") != "10" {
			t.Fatalf("unexpected count: %q", r.URL.Query().Get("count"))
		}
		if r.URL.Query().Get("locale") != "en_US" {
			t.Fatalf("unexpected locale: %q", r.URL.Query().Get("locale"))
		}
		if r.URL.Query().Get("safesearch") != "0" {
			t.Fatalf("unexpected safesearch: %q", r.URL.Query().Get("safesearch"))
		}
		if r.URL.Query().Get("llm") != "false" {
			t.Fatalf("unexpected llm: %q", r.URL.Query().Get("llm"))
		}
		if r.URL.Query().Get("tgp") != "3" {
			t.Fatalf("unexpected tgp: %q", r.URL.Query().Get("tgp"))
		}
		if r.URL.Query().Get("offset") != "0" {
			t.Fatalf("unexpected offset: %q", r.URL.Query().Get("offset"))
		}

		// Minimal successful v3 payload with one mainline web item.
		body := `{
"status": "success",
"data": {
"result": {
"items": {
"mainline": [
{
"type": "web",
"items": [
{ "title": "Qwant Title", "url": "https://example.com/q", "desc": "Qwant description" }
]
}
]
}
}
}
}`
		return httpResponse(http.StatusOK, body, "application/json"), nil
	})

	client := &http.Client{Transport: transport}
	engine := &QwantEngine{client: client, category: "web", resultsPerPage: 10}

	resp, err := engine.Search(context.Background(), contracts.SearchRequest{
		Query:      "hugo",
		Pageno:     1,
		Safesearch: 0,
		Language:   "en",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(resp.Results) != 1 {
		t.Fatalf("expected 1 result, got %d", len(resp.Results))
	}
	if resp.Results[0].Title != "Qwant Title" {
		t.Fatalf("unexpected title: %q", resp.Results[0].Title)
	}
	if resp.Results[0].Content != "Qwant description" {
		t.Fatalf("unexpected content: %q", resp.Results[0].Content)
	}
	if resp.Results[0].URL == nil || *resp.Results[0].URL != "https://example.com/q" {
		t.Fatalf("unexpected url: %v", resp.Results[0].URL)
	}
	if resp.Results[0].Engine != "qwant" {
		t.Fatalf("unexpected engine: %q", resp.Results[0].Engine)
	}
}
|
||||
|
||||
151
internal/engines/wikipedia.go
Normal file
151
internal/engines/wikipedia.go
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
)
|
||||
|
||||
// WikipediaEngine adapts Wikipedia's REST "page summary" endpoint as a
// search engine: one query maps to at most one summary result.
type WikipediaEngine struct {
	// client performs the upstream HTTP call; Search returns an
	// initialization error when it is nil.
	client *http.Client
}

// Name returns the engine identifier used in result/engine reporting.
func (e *WikipediaEngine) Name() string { return "wikipedia" }
|
||||
|
||||
func (e *WikipediaEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
|
||||
if e == nil || e.client == nil {
|
||||
return contracts.SearchResponse{}, errors.New("wikipedia engine not initialized")
|
||||
}
|
||||
if strings.TrimSpace(req.Query) == "" {
|
||||
return contracts.SearchResponse{Query: req.Query}, nil
|
||||
}
|
||||
|
||||
lang := strings.TrimSpace(req.Language)
|
||||
if lang == "" || lang == "auto" {
|
||||
lang = "en"
|
||||
}
|
||||
// Wikipedia subdomains are based on the language code; keep it simple for MVP.
|
||||
lang = strings.SplitN(lang, "-", 2)[0]
|
||||
lang = strings.ReplaceAll(lang, "_", "-")
|
||||
wikiNetloc := fmt.Sprintf("%s.wikipedia.org", lang)
|
||||
|
||||
endpoint := fmt.Sprintf(
|
||||
"https://%s/api/rest_v1/page/summary/%s",
|
||||
wikiNetloc,
|
||||
url.PathEscape(req.Query),
|
||||
)
|
||||
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
// Wikimedia APIs require a descriptive User-Agent.
|
||||
httpReq.Header.Set(
|
||||
"User-Agent",
|
||||
"gosearch-go/0.1 (compatible; +https://github.com/ashie/gosearch)",
|
||||
)
|
||||
// Best-effort: hint content language.
|
||||
if req.Language != "" && req.Language != "auto" {
|
||||
httpReq.Header.Set("Accept-Language", req.Language)
|
||||
}
|
||||
|
||||
resp, err := e.client.Do(httpReq)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode == http.StatusNotFound {
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
NumberOfResults: 0,
|
||||
Results: []contracts.MainResult{},
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: []string{},
|
||||
UnresponsiveEngines: [][2]string{},
|
||||
}, nil
|
||||
}
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
|
||||
return contracts.SearchResponse{}, fmt.Errorf("wikipedia upstream error: status=%d body=%q", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
var api struct {
|
||||
Title string `json:"title"`
|
||||
Description string `json:"description"`
|
||||
Titles struct {
|
||||
Display string `json:"display"`
|
||||
} `json:"titles"`
|
||||
ContentURLs struct {
|
||||
Desktop struct {
|
||||
Page string `json:"page"`
|
||||
} `json:"desktop"`
|
||||
} `json:"content_urls"`
|
||||
}
|
||||
|
||||
if err := json.NewDecoder(resp.Body).Decode(&api); err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
|
||||
pageURL := api.ContentURLs.Desktop.Page
|
||||
if pageURL == "" {
|
||||
// API returned a non-standard payload; treat as no result.
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
NumberOfResults: 0,
|
||||
Results: []contracts.MainResult{},
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: []string{},
|
||||
UnresponsiveEngines: [][2]string{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
title := api.Titles.Display
|
||||
if title == "" {
|
||||
title = api.Title
|
||||
}
|
||||
|
||||
content := api.Description
|
||||
|
||||
urlPtr := pageURL
|
||||
pub := (*string)(nil)
|
||||
|
||||
results := []contracts.MainResult{
|
||||
{
|
||||
Template: "default.html",
|
||||
Title: title,
|
||||
Content: content,
|
||||
URL: &urlPtr,
|
||||
Pubdate: pub,
|
||||
Engine: "wikipedia",
|
||||
Score: 0,
|
||||
Category: "general",
|
||||
Priority: "",
|
||||
Positions: nil,
|
||||
Engines: []string{"wikipedia"},
|
||||
},
|
||||
}
|
||||
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
NumberOfResults: len(results),
|
||||
Results: results,
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: []string{},
|
||||
UnresponsiveEngines: [][2]string{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
61
internal/engines/wikipedia_test.go
Normal file
61
internal/engines/wikipedia_test.go
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"testing"
|
||||
|
||||
"github.com/ashie/gosearch/internal/contracts"
|
||||
)
|
||||
|
||||
// TestWikipediaEngine_Search exercises the summary-API adapter with a
// stubbed transport: it asserts the request targets
// en.wikipedia.org/api/rest_v1/page/summary/<query> and that the summary
// payload is mapped into a single result with title/content/URL.
func TestWikipediaEngine_Search(t *testing.T) {
	transport := roundTripperFunc(func(r *http.Request) (*http.Response, error) {
		if r.Method != http.MethodGet {
			return httpResponse(http.StatusMethodNotAllowed, "", ""), nil
		}
		if r.URL.Host != "en.wikipedia.org" {
			return httpResponse(http.StatusNotFound, "", ""), nil
		}

		if r.URL.Path != "/api/rest_v1/page/summary/Taxi" {
			return httpResponse(http.StatusNotFound, "", ""), nil
		}

		// Minimal REST summary payload with the fields the engine reads.
		body := `{
"title": "Taxi",
"description": "A car",
"titles": { "display": "Taxi" },
"content_urls": { "desktop": { "page": "https://en.wikipedia.org/wiki/Taxi" } }
}`
		return httpResponse(http.StatusOK, body, "application/json"), nil
	})

	client := &http.Client{Transport: transport}
	engine := &WikipediaEngine{client: client}

	resp, err := engine.Search(context.Background(), contracts.SearchRequest{
		Query:    "Taxi",
		Pageno:   1,
		Language: "en",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(resp.Results) != 1 {
		t.Fatalf("expected 1 result, got %d", len(resp.Results))
	}
	r := resp.Results[0]
	if r.Title != "Taxi" {
		t.Fatalf("expected title Taxi, got %q", r.Title)
	}
	if r.Content != "A car" {
		t.Fatalf("expected content, got %q", r.Content)
	}
	if r.URL == nil || *r.URL == "" {
		t.Fatalf("expected url, got nil/empty")
	}
	if *r.URL != "https://en.wikipedia.org/wiki/Taxi" {
		t.Fatalf("unexpected url: %q", *r.URL)
	}
}
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue