From dc44837219bf52078f14206759a36014b1da7029 Mon Sep 17 00:00:00 2001 From: Franz Kafka Date: Fri, 20 Mar 2026 20:34:08 +0100 Subject: [PATCH] feat: build Go-based SearXNG-compatible search service Implement an API-first Go rewrite with local engine adapters, upstream fallback, and Nix-based tooling so searches can run without matching the original UI while preserving response compatibility. Made-with: Cursor --- README.md | 76 ++++ cmd/searxng-go/main.go | 43 +++ flake.lock | 27 ++ flake.nix | 28 ++ go.mod | 9 + go.sum | 71 ++++ internal/contracts/main_result.go | 193 ++++++++++ internal/contracts/types.go | 50 +++ internal/engines/arxiv.go | 191 ++++++++++ internal/engines/arxiv_test.go | 66 ++++ internal/engines/braveapi.go | 195 +++++++++++ internal/engines/braveapi_test.go | 92 +++++ internal/engines/crossref.go | 144 ++++++++ internal/engines/crossref_test.go | 71 ++++ internal/engines/engine.go | 17 + internal/engines/factory.go | 33 ++ internal/engines/http_mock_test.go | 26 ++ internal/engines/planner.go | 148 ++++++++ internal/engines/qwant.go | 467 +++++++++++++++++++++++++ internal/engines/qwant_lite_test.go | 89 +++++ internal/engines/qwant_test.go | 94 +++++ internal/engines/wikipedia.go | 151 ++++++++ internal/engines/wikipedia_test.go | 61 ++++ internal/httpapi/handlers.go | 41 +++ internal/search/merge.go | 121 +++++++ internal/search/merge_test.go | 80 +++++ internal/search/request_params.go | 206 +++++++++++ internal/search/request_params_test.go | 74 ++++ internal/search/response.go | 223 ++++++++++++ internal/search/service.go | 111 ++++++ internal/search/types.go | 20 ++ internal/upstream/client.go | 112 ++++++ 32 files changed, 3330 insertions(+) create mode 100644 cmd/searxng-go/main.go create mode 100644 flake.lock create mode 100644 flake.nix create mode 100644 go.mod create mode 100644 go.sum create mode 100644 internal/contracts/main_result.go create mode 100644 internal/contracts/types.go create mode 100644 internal/engines/arxiv.go 
create mode 100644 internal/engines/arxiv_test.go create mode 100644 internal/engines/braveapi.go create mode 100644 internal/engines/braveapi_test.go create mode 100644 internal/engines/crossref.go create mode 100644 internal/engines/crossref_test.go create mode 100644 internal/engines/engine.go create mode 100644 internal/engines/factory.go create mode 100644 internal/engines/http_mock_test.go create mode 100644 internal/engines/planner.go create mode 100644 internal/engines/qwant.go create mode 100644 internal/engines/qwant_lite_test.go create mode 100644 internal/engines/qwant_test.go create mode 100644 internal/engines/wikipedia.go create mode 100644 internal/engines/wikipedia_test.go create mode 100644 internal/httpapi/handlers.go create mode 100644 internal/search/merge.go create mode 100644 internal/search/merge_test.go create mode 100644 internal/search/request_params.go create mode 100644 internal/search/request_params_test.go create mode 100644 internal/search/response.go create mode 100644 internal/search/service.go create mode 100644 internal/search/types.go create mode 100644 internal/upstream/client.go diff --git a/README.md b/README.md index e69de29..c9e421c 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,76 @@ +## gosearch (SearXNG rewrite in Go) + +This repository contains a standalone Go HTTP service that implements a SearXNG-compatible +API-first `/search` endpoint and proxies unported engines to an upstream SearXNG instance. 
+ +### Endpoints + +- `GET /healthz` -> `OK` +- `GET|POST /search` + - Required form/body parameter: `q` + - Optional: `format` (`json` | `csv` | `rss`; default: `json`) + +### Supported `format=...` + +- `json`: SearXNG-style JSON response (`query`, `number_of_results`, `results`, `answers`, `corrections`, `infoboxes`, `suggestions`, `unresponsive_engines`) +- `csv`: CSV with header `title,url,content,host,engine,score,type` +- `rss`: RSS 2.0 feed based on the `opensearch_response_rss.xml` template fields + +### Request parameters + +The server accepts SearXNG form parameters (both `GET` query string and `POST` form-encoded): + +- `q` (required): search query +- `format` (optional): `json`/`csv`/`rss` +- `pageno` (optional, default `1`): positive integer +- `safesearch` (optional, default `0`): integer `0..2` +- `time_range` (optional): `day|week|month|year` (or omitted/`None`) +- `timeout_limit` (optional): float, seconds (or omitted/`None`) +- `language` (optional, default `auto`): `auto` or a BCP-47-ish language code +- `engines` (optional): comma-separated engine names (e.g. `wikipedia,arxiv`) +- `categories` / `category_<name>` (optional): used for selecting the initial ported subset +- `engine_data-<engine>-<key>=<value>` (optional): per-engine custom parameters + +### Environment variables + +- `PORT` (optional, default `8080`) +- `UPSTREAM_SEARXNG_URL` (optional for now, but required if you expect unported engines) + - When set, unported engines are proxied to `${UPSTREAM_SEARXNG_URL}/search` with `format=json`. +- `LOCAL_PORTED_ENGINES` (optional, default `wikipedia,arxiv,crossref,braveapi,qwant`) + - Controls which engine names are executed locally (Go-native adapters). +- `HTTP_TIMEOUT` (optional, default `10s`) + - Timeout for both local engine API calls and upstream proxy calls. 
+ +- Brave Search API: + - `BRAVE_API_KEY` (optional): enables the `braveapi` engine when set + - `BRAVE_ACCESS_TOKEN` (optional): if set, requests must include a token + (header `Authorization: Bearer <token>`, `X-Search-Token`, `X-Brave-Access-Token`, or form field `token`) + +### Ported vs proxied strategy + +1. The service plans which engines should run locally vs upstream using `LOCAL_PORTED_ENGINES`. +2. It executes local ported engines using Go-native adapters: + - `wikipedia`, `arxiv`, `crossref`, `braveapi`, `qwant` +3. Any remaining requested engines are proxied to upstream SearXNG (`format=json`). +4. Responses are merged: + - `results` are de-duplicated by `engine|title|url` + - `suggestions`/`corrections` are treated as sets + - other arrays are concatenated + +### Running with Nix + +This repo uses `flake.nix` to provide the Go toolchain. + +```bash +nix develop +go test ./... +go run ./cmd/searxng-go +``` + +Example: + +```bash +export UPSTREAM_SEARXNG_URL="http://127.0.0.1:8888" +export PORT="8080" +nix develop -c go run ./cmd/searxng-go +``` + diff --git a/cmd/searxng-go/main.go b/cmd/searxng-go/main.go new file mode 100644 index 0000000..7797ee6 --- /dev/null +++ b/cmd/searxng-go/main.go @@ -0,0 +1,43 @@ +package main + +import ( + "log" + "net/http" + "os" + "time" + + "github.com/ashie/gosearch/internal/httpapi" + "github.com/ashie/gosearch/internal/search" +) + +func main() { + port := os.Getenv("PORT") + if port == "" { + port = "8080" + } + + upstreamURL := os.Getenv("UPSTREAM_SEARXNG_URL") + + timeout := 10 * time.Second + if v := os.Getenv("HTTP_TIMEOUT"); v != "" { + if d, err := time.ParseDuration(v); err == nil { + timeout = d + } + } + + svc := search.NewService(search.ServiceConfig{ + UpstreamURL: upstreamURL, + HTTPTimeout: timeout, + }) + + h := httpapi.NewHandler(svc) + + mux := http.NewServeMux() + mux.HandleFunc("/healthz", h.Healthz) + mux.HandleFunc("/search", h.Search) + + addr := ":" + port + log.Printf("searxng-go listening on %s", addr) + 
log.Fatal(http.ListenAndServe(addr, mux)) +} + diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..d98f5a2 --- /dev/null +++ b/flake.lock @@ -0,0 +1,27 @@ +{ + "nodes": { + "nixpkgs": { + "locked": { + "lastModified": 1773628058, + "narHash": "sha256-hpXH0z3K9xv0fHaje136KY872VT2T5uwxtezlAskQgY=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "f8573b9c935cfaa162dd62cc9e75ae2db86f85df", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..fe2a141 --- /dev/null +++ b/flake.nix @@ -0,0 +1,28 @@ +{ + description = "Gosearch - SearXNG rewrite in Go"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable"; + }; + + outputs = { self, nixpkgs }: + let + systems = [ "x86_64-linux" "aarch64-linux" ]; + forAllSystems = f: nixpkgs.lib.genAttrs systems (system: f system); + in { + devShells = forAllSystems (system: + let + pkgs = import nixpkgs { inherit system; }; + go = pkgs.go_1_24; + in + { + default = pkgs.mkShell { + buildInputs = [ + go + pkgs.curl + ]; + }; + }); + }; +} + diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..288c9ef --- /dev/null +++ b/go.mod @@ -0,0 +1,9 @@ +module github.com/ashie/gosearch + +go 1.25.0 + +require ( + github.com/PuerkitoBio/goquery v1.12.0 // indirect + github.com/andybalholm/cascadia v1.3.3 // indirect + golang.org/x/net v0.52.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..bf43075 --- /dev/null +++ b/go.sum @@ -0,0 +1,71 @@ +github.com/PuerkitoBio/goquery v1.12.0 h1:pAcL4g3WRXekcB9AU/y1mbKez2dbY2AajVhtkO8RIBo= +github.com/PuerkitoBio/goquery v1.12.0/go.mod h1:802ej+gV2y7bbIhOIoPY5sT183ZW0YFofScC4q/hIpQ= +github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= 
+github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= 
+golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= +golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= 
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod 
h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/internal/contracts/main_result.go b/internal/contracts/main_result.go new file mode 100644 index 0000000..48005f8 --- /dev/null +++ b/internal/contracts/main_result.go @@ -0,0 +1,193 @@ +package contracts + +import ( + "bytes" + "encoding/json" +) + +// MainResult represents one element of SearXNG's `results` array. +// +// SearXNG returns many additional keys beyond what templates use. To keep the +// contract stable for proxying/merging, we preserve all unknown keys in +// `raw` and re-emit them via MarshalJSON. +type MainResult struct { + raw map[string]any + + // Common fields used by SearXNG templates (RSS uses: title, url, content, pubdate). + Template string `json:"template"` + Title string `json:"title"` + Content string `json:"content"` + URL *string `json:"url"` + Pubdate *string `json:"pubdate"` + + Engine string `json:"engine"` + Score float64 `json:"score"` + Category string `json:"category"` + Priority string `json:"priority"` + + Positions []int `json:"positions"` + Engines []string `json:"engines"` + + // These fields exist in SearXNG's MainResult base; keep them so downstream + // callers can generate richer output later. + OpenGroup bool `json:"open_group"` + CloseGroup bool `json:"close_group"` + + // parsed_url in SearXNG is emitted as a tuple; we preserve it as-is. + ParsedURL any `json:"parsed_url"` +} + +func (mr *MainResult) UnmarshalJSON(data []byte) error { + // Preserve the full object. + dec := json.NewDecoder(bytes.NewReader(data)) + dec.UseNumber() + + var m map[string]any + if err := dec.Decode(&m); err != nil { + return err + } + + mr.raw = m + + // Fill the typed/common fields (best-effort; don't fail if types differ). 
+ mr.Template = stringOrEmpty(m["template"]) + mr.Title = stringOrEmpty(m["title"]) + mr.Content = stringOrEmpty(m["content"]) + mr.Engine = stringOrEmpty(m["engine"]) + mr.Category = stringOrEmpty(m["category"]) + mr.Priority = stringOrEmpty(m["priority"]) + + if s, ok := stringOrNullable(m["url"]); ok { + mr.URL = &s + } + if s, ok := stringOrNullable(m["pubdate"]); ok { + mr.Pubdate = &s + } + + mr.Score = floatOrZero(m["score"]) + + if v, ok := sliceOfStrings(m["engines"]); ok { + mr.Engines = v + } + if v, ok := sliceOfInts(m["positions"]); ok { + mr.Positions = v + } + + if v, ok := boolOrFalse(m["open_group"]); ok { + mr.OpenGroup = v + } + if v, ok := boolOrFalse(m["close_group"]); ok { + mr.CloseGroup = v + } + + mr.ParsedURL = m["parsed_url"] + + return nil +} + +func (mr MainResult) MarshalJSON() ([]byte, error) { + // If we came from upstream JSON, preserve all keys exactly. + if mr.raw != nil { + return json.Marshal(mr.raw) + } + + // Otherwise, marshal the known fields. + m := map[string]any{ + "template": mr.Template, + "title": mr.Title, + "content": mr.Content, + "url": mr.URL, + "pubdate": mr.Pubdate, + "engine": mr.Engine, + "score": mr.Score, + "category": mr.Category, + "priority": mr.Priority, + "positions": mr.Positions, + "engines": mr.Engines, + "open_group": mr.OpenGroup, + "close_group": mr.CloseGroup, + "parsed_url": mr.ParsedURL, + } + return json.Marshal(m) +} + +func stringOrEmpty(v any) string { + s, _ := v.(string) + return s +} + +func stringOrNullable(v any) (string, bool) { + if v == nil { + return "", false + } + s, ok := v.(string) + return s, ok +} + +func floatOrZero(v any) float64 { + switch t := v.(type) { + case float64: + return t + case float32: + return float64(t) + case int: + return float64(t) + case int64: + return float64(t) + case json.Number: + f, _ := t.Float64() + return f + default: + return 0 + } +} + +func boolOrFalse(v any) (bool, bool) { + b, ok := v.(bool) + if !ok { + return false, false + } + return b, 
true +} + +func sliceOfStrings(v any) ([]string, bool) { + raw, ok := v.([]any) + if !ok { + return nil, false + } + out := make([]string, 0, len(raw)) + for _, item := range raw { + s, ok := item.(string) + if !ok { + return nil, false + } + out = append(out, s) + } + return out, true +} + +func sliceOfInts(v any) ([]int, bool) { + raw, ok := v.([]any) + if !ok { + return nil, false + } + out := make([]int, 0, len(raw)) + for _, item := range raw { + switch t := item.(type) { + case float64: + out = append(out, int(t)) + case int: + out = append(out, t) + case json.Number: + i64, err := t.Int64() + if err != nil { + return nil, false + } + out = append(out, int(i64)) + default: + return nil, false + } + } + return out, true +} + diff --git a/internal/contracts/types.go b/internal/contracts/types.go new file mode 100644 index 0000000..a68f77a --- /dev/null +++ b/internal/contracts/types.go @@ -0,0 +1,50 @@ +package contracts + +// OutputFormat matches SearXNG's `/search?format=...` values. +type OutputFormat string + +const ( + FormatHTML OutputFormat = "html" // accepted for compatibility (not yet implemented) + FormatJSON OutputFormat = "json" + FormatCSV OutputFormat = "csv" + FormatRSS OutputFormat = "rss" +) + +type SearchRequest struct { + // Format is what the client requested via `format=...`. + Format OutputFormat + + Query string + + Pageno int + Safesearch int + TimeRange *string + + TimeoutLimit *float64 + Language string + + // Engines and categories are used for deciding which engines run locally vs are proxied. + // For now, engines can be supplied directly via the `engines` form parameter. + Engines []string + Categories []string + + // EngineData matches SearXNG's `engine_data--=` parameters. + EngineData map[string]map[string]string + + // AccessToken is an optional request token used to gate paid/limited engines. + // It is not part of the upstream JSON schema; it only influences local engines. 
+ AccessToken string +} + +// SearchResponse matches the JSON schema returned by SearXNG's `webutils.get_json_response()`. +type SearchResponse struct { + Query string `json:"query"` + NumberOfResults int `json:"number_of_results"` + Results []MainResult `json:"results"` + Answers []map[string]any `json:"answers"` + Corrections []string `json:"corrections"` + Infoboxes []map[string]any `json:"infoboxes"` + Suggestions []string `json:"suggestions"` + UnresponsiveEngines [][2]string `json:"unresponsive_engines"` +} + diff --git a/internal/engines/arxiv.go b/internal/engines/arxiv.go new file mode 100644 index 0000000..0c58d68 --- /dev/null +++ b/internal/engines/arxiv.go @@ -0,0 +1,191 @@ +package engines + +import ( + "bytes" + "context" + "encoding/xml" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "time" + + "github.com/ashie/gosearch/internal/contracts" +) + +const ( + arxivSearchPrefix = "all" + arxivMaxResults = 10 +) + +type ArxivEngine struct { + client *http.Client +} + +func (e *ArxivEngine) Name() string { return "arxiv" } + +func (e *ArxivEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) { + if e == nil || e.client == nil { + return contracts.SearchResponse{}, errors.New("arxiv engine not initialized") + } + q := strings.TrimSpace(req.Query) + if q == "" { + return contracts.SearchResponse{Query: req.Query}, nil + } + + start := (req.Pageno - 1) * arxivMaxResults + if start < 0 { + start = 0 + } + + args := url.Values{} + args.Set("search_query", fmt.Sprintf("%s:%s", arxivSearchPrefix, q)) + args.Set("start", fmt.Sprintf("%d", start)) + args.Set("max_results", fmt.Sprintf("%d", arxivMaxResults)) + + endpoint := "https://export.arxiv.org/api/query?" 
+ args.Encode() + + httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) + if err != nil { + return contracts.SearchResponse{}, err + } + + resp, err := e.client.Do(httpReq) + if err != nil { + return contracts.SearchResponse{}, err + } + defer resp.Body.Close() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024)) + return contracts.SearchResponse{}, fmt.Errorf("arxiv upstream error: status=%d body=%q", resp.StatusCode, string(body)) + } + + raw, err := io.ReadAll(resp.Body) + if err != nil { + return contracts.SearchResponse{}, err + } + + results, err := parseArxivAtom(raw) + if err != nil { + return contracts.SearchResponse{}, err + } + + return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: len(results), + Results: results, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + UnresponsiveEngines: [][2]string{}, + }, nil +} + +type arxivEntry struct { + Title string + ID string + Summary string + Published string +} + +func parseArxivAtom(xmlBytes []byte) ([]contracts.MainResult, error) { + dec := xml.NewDecoder(bytes.NewReader(xmlBytes)) + + var entries []arxivEntry + var cur *arxivEntry + + for { + tok, err := dec.Token() + if err == io.EOF { + break + } + if err != nil { + return nil, err + } + + switch t := tok.(type) { + case xml.StartElement: + switch strings.ToLower(t.Name.Local) { + case "entry": + cur = &arxivEntry{} + case "title": + if cur != nil { + var v string + if err := dec.DecodeElement(&v, &t); err == nil { + cur.Title = strings.TrimSpace(v) + } + } + case "id": + if cur != nil { + var v string + if err := dec.DecodeElement(&v, &t); err == nil { + cur.ID = strings.TrimSpace(v) + } + } + case "summary": + if cur != nil { + var v string + if err := dec.DecodeElement(&v, &t); err == nil { + cur.Summary = strings.TrimSpace(v) + } + } + case "published": + if cur != nil { + var 
v string + if err := dec.DecodeElement(&v, &t); err == nil { + cur.Published = strings.TrimSpace(v) + } + } + } + case xml.EndElement: + if strings.ToLower(t.Name.Local) == "entry" && cur != nil { + if cur.Title != "" && cur.ID != "" { + entries = append(entries, *cur) + } + cur = nil + } + } + } + + out := make([]contracts.MainResult, 0, len(entries)) + for _, e := range entries { + urlPtr := e.ID + content := e.Summary + pubdate := parseArxivPublished(e.Published) + + out = append(out, contracts.MainResult{ + Template: "default.html", + Title: e.Title, + Content: content, + URL: &urlPtr, + Pubdate: pubdate, + Engine: "arxiv", + Category: "science", + Score: 0, + Positions: nil, + Engines: []string{"arxiv"}, + }) + } + return out, nil +} + +func parseArxivPublished(s string) *string { + s = strings.TrimSpace(s) + if s == "" { + return nil + } + + // ArXiv uses RFC3339 like "2024-06-03T00:00:00Z". + t, err := time.Parse(time.RFC3339, s) + if err != nil { + return nil + } + + formatted := t.Format("2006-01-02 15:04:05-0700") + return &formatted +} + diff --git a/internal/engines/arxiv_test.go b/internal/engines/arxiv_test.go new file mode 100644 index 0000000..276769d --- /dev/null +++ b/internal/engines/arxiv_test.go @@ -0,0 +1,66 @@ +package engines + +import ( + "context" + "net/http" + "strings" + "testing" + + "github.com/ashie/gosearch/internal/contracts" +) + +func TestArxivEngine_Search(t *testing.T) { + transport := roundTripperFunc(func(r *http.Request) (*http.Response, error) { + if r.Method != http.MethodGet { + return httpResponse(http.StatusMethodNotAllowed, "", ""), nil + } + if r.URL.Host != "export.arxiv.org" || r.URL.Path != "/api/query" { + return httpResponse(http.StatusNotFound, "", ""), nil + } + + q := r.URL.Query().Get("search_query") + if q != "all:quantum" { + return httpResponse(http.StatusBadRequest, "", ""), nil + } + + atom := ` + + + Quantum Test + http://arxiv.org/abs/1234.5678 + Abstract here + 2024-06-03T00:00:00Z + +` + return 
httpResponse(http.StatusOK, atom, "application/atom+xml"), nil + }) + + client := &http.Client{Transport: transport} + engine := &ArxivEngine{client: client} + + resp, err := engine.Search(context.Background(), contracts.SearchRequest{ + Query: "quantum", + Pageno: 1, + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(resp.Results) != 1 { + t.Fatalf("expected 1 result, got %d", len(resp.Results)) + } + + r := resp.Results[0] + if r.Title != "Quantum Test" { + t.Fatalf("unexpected title: %q", r.Title) + } + if r.Content != "Abstract here" { + t.Fatalf("unexpected content: %q", r.Content) + } + if r.URL == nil || !strings.Contains(*r.URL, "1234.5678") { + t.Fatalf("unexpected url: %v", r.URL) + } + if r.Pubdate == nil || !strings.Contains(*r.Pubdate, "2024-06-03") { + t.Fatalf("expected pubdate around 2024-06-03, got %v", r.Pubdate) + } +} + diff --git a/internal/engines/braveapi.go b/internal/engines/braveapi.go new file mode 100644 index 0000000..9b94722 --- /dev/null +++ b/internal/engines/braveapi.go @@ -0,0 +1,195 @@ +package engines + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "time" + + "github.com/ashie/gosearch/internal/contracts" +) + +// BraveEngine implements the SearXNG `braveapi` engine (Brave Web Search API). +// +// Config / gating: +// - BRAVE_API_KEY: required to call Brave +// - BRAVE_ACCESS_TOKEN (optional): if set, the request must include a token +// that matches the env var (via Authorization Bearer, X-Search-Token, +// X-Brave-Access-Token, or form field `token`). 
+type BraveEngine struct { + client *http.Client + apiKey string + accessGateToken string + resultsPerPage int +} + +func (e *BraveEngine) Name() string { return "braveapi" } + +func (e *BraveEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) { + if e == nil || e.client == nil { + return contracts.SearchResponse{}, errors.New("brave engine not initialized") + } + + // Gate / config checks should not be treated as fatal errors; SearXNG + // treats misconfigured engines as unresponsive. + if strings.TrimSpace(e.apiKey) == "" { + return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: 0, + Results: []contracts.MainResult{}, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + UnresponsiveEngines: [][2]string{{e.Name(), "missing_api_key"}}, + }, nil + } + + if gate := strings.TrimSpace(e.accessGateToken); gate != "" { + if strings.TrimSpace(req.AccessToken) == "" || req.AccessToken != gate { + return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: 0, + Results: []contracts.MainResult{}, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + UnresponsiveEngines: [][2]string{{e.Name(), "unauthorized"}}, + }, nil + } + } + + q := strings.TrimSpace(req.Query) + if q == "" { + return contracts.SearchResponse{Query: req.Query}, nil + } + + offset := 0 + if req.Pageno > 1 { + offset = (req.Pageno - 1) * e.resultsPerPage + } + + args := url.Values{} + args.Set("q", q) + args.Set("count", fmt.Sprintf("%d", e.resultsPerPage)) + args.Set("offset", fmt.Sprintf("%d", offset)) + + if req.TimeRange != nil { + switch *req.TimeRange { + case "day": + args.Set("time_range", "past_day") + case "week": + args.Set("time_range", "past_week") + case "month": + args.Set("time_range", "past_month") + case "year": + args.Set("time_range", "past_year") + } + } + + // SearXNG's python 
checks `if params["safesearch"]:` which treats any + // non-zero (moderate/strict) as strict. + if req.Safesearch > 0 { + args.Set("safesearch", "strict") + } + + endpoint := "https://api.search.brave.com/res/v1/web/search?" + args.Encode() + + httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) + if err != nil { + return contracts.SearchResponse{}, err + } + httpReq.Header.Set("X-Subscription-Token", e.apiKey) + + resp, err := e.client.Do(httpReq) + if err != nil { + return contracts.SearchResponse{}, err + } + defer resp.Body.Close() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024)) + return contracts.SearchResponse{}, fmt.Errorf("brave upstream error: status=%d body=%q", resp.StatusCode, string(body)) + } + + var api struct { + Web struct { + Results []struct { + URL string `json:"url"` + Title string `json:"title"` + Description string `json:"description"` + Age string `json:"age"` + Thumbnail struct { + Src string `json:"src"` + } `json:"thumbnail"` + } `json:"results"` + } `json:"web"` + } + + if err := json.NewDecoder(resp.Body).Decode(&api); err != nil { + return contracts.SearchResponse{}, err + } + + results := make([]contracts.MainResult, 0, len(api.Web.Results)) + for _, r := range api.Web.Results { + urlPtr := strings.TrimSpace(r.URL) + if urlPtr == "" { + continue + } + pub := parseBraveAge(r.Age) + + results = append(results, contracts.MainResult{ + Template: "default.html", + Title: r.Title, + Content: r.Description, + URL: &urlPtr, + Pubdate: pub, + Engine: e.Name(), + Score: 0, + Category: "general", + Priority: "", + Positions: nil, + Engines: []string{e.Name()}, + }) + } + + return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: len(results), + Results: results, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + UnresponsiveEngines: [][2]string{}, + }, nil +} + +func 
// parseBraveAge converts Brave's `age` field into the SearXNG pubdate
// string format ("2006-01-02 15:04:05-0700"). Brave sometimes returns
// RFC3339-like timestamps here; anything unparseable (including relative
// strings such as "2 days ago") yields nil.
func parseBraveAge(ageRaw string) *string {
	ageRaw = strings.TrimSpace(ageRaw)
	if ageRaw == "" {
		return nil
	}

	for _, layout := range []string{
		time.RFC3339Nano,
		time.RFC3339,
		"2006-01-02T15:04:05Z07:00",
		"2006-01-02",
	} {
		t, err := time.Parse(layout, ageRaw)
		if err != nil {
			continue
		}
		formatted := t.Format("2006-01-02 15:04:05-0700")
		return &formatted
	}
	return nil
}
+ { + resp, err := engine.Search(context.Background(), contracts.SearchRequest{ + Query: "hugo", + Pageno: 1, + Safesearch: 0, + Language: "en", + AccessToken: "wrong", + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(resp.Results) != 0 { + t.Fatalf("expected no results on unauthorized, got %d", len(resp.Results)) + } + if len(resp.UnresponsiveEngines) != 1 { + t.Fatalf("expected 1 unresponsive engine entry, got %v", resp.UnresponsiveEngines) + } + } + + // Correct token => upstream call. + { + resp, err := engine.Search(context.Background(), contracts.SearchRequest{ + Query: "hugo", + Pageno: 1, + Safesearch: 0, + Language: "en", + AccessToken: wantToken, + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(resp.Results) != 1 { + t.Fatalf("expected 1 result, got %d", len(resp.Results)) + } + if resp.Results[0].Title != "A" { + t.Fatalf("unexpected title: %q", resp.Results[0].Title) + } + if resp.Results[0].URL == nil || *resp.Results[0].URL != "https://example.com/a" { + t.Fatalf("unexpected url: %v", resp.Results[0].URL) + } + } +} + diff --git a/internal/engines/crossref.go b/internal/engines/crossref.go new file mode 100644 index 0000000..f52f0cb --- /dev/null +++ b/internal/engines/crossref.go @@ -0,0 +1,144 @@ +package engines + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "time" + + "github.com/ashie/gosearch/internal/contracts" +) + +type CrossrefEngine struct { + client *http.Client +} + +func (e *CrossrefEngine) Name() string { return "crossref" } + +func (e *CrossrefEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) { + if e == nil || e.client == nil { + return contracts.SearchResponse{}, errors.New("crossref engine not initialized") + } + q := strings.TrimSpace(req.Query) + if q == "" { + return contracts.SearchResponse{Query: req.Query}, nil + } + + offset := 20 * (req.Pageno - 1) + args := 
url.Values{} + args.Set("query", q) + args.Set("offset", fmt.Sprintf("%d", offset)) + + endpoint := "https://api.crossref.org/works?" + args.Encode() + httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) + if err != nil { + return contracts.SearchResponse{}, err + } + + resp, err := e.client.Do(httpReq) + if err != nil { + return contracts.SearchResponse{}, err + } + defer resp.Body.Close() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024)) + return contracts.SearchResponse{}, fmt.Errorf("crossref upstream error: status=%d body=%q", resp.StatusCode, string(body)) + } + + var api struct { + Message struct { + Items []crossrefItem `json:"items"` + } `json:"message"` + } + + if err := json.NewDecoder(resp.Body).Decode(&api); err != nil { + return contracts.SearchResponse{}, err + } + + results := make([]contracts.MainResult, 0, len(api.Message.Items)) + for _, item := range api.Message.Items { + title := "" + if len(item.Title) > 0 { + title = strings.TrimSpace(item.Title[0]) + } + + content := strings.TrimSpace(item.Abstract) + + urlStr := strings.TrimSpace(item.URL) + if urlStr == "" { + urlStr = strings.TrimSpace(item.DOI) + } + + pub := parseCrossrefDateParts(item.Published.DateParts) + + urlPtr := urlStr + results = append(results, contracts.MainResult{ + Template: "default.html", + Title: title, + Content: content, + URL: &urlPtr, + Pubdate: pub, + Engine: "crossref", + Score: 0, + Category: "science", + Priority: "", + Positions: nil, + Engines: []string{"crossref"}, + }) + } + + return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: len(results), + Results: results, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + UnresponsiveEngines: [][2]string{}, + }, nil +} + +type crossrefItem struct { + Type string `json:"type"` + Title []string `json:"title"` + URL string `json:"URL"` + DOI 
string `json:"DOI"` + Abstract string `json:"abstract"` + Page string `json:"page"` + Publisher string `json:"publisher"` + Subject []string `json:"subject"` + Published crossrefPublished `json:"published"` +} + +type crossrefPublished struct { + DateParts [][]int `json:"date-parts"` +} + +func parseCrossrefDateParts(parts [][]int) *string { + if len(parts) == 0 || len(parts[0]) == 0 { + return nil + } + + dp := parts[0] + year := dp[0] + month := 1 + day := 1 + if len(dp) >= 2 { + month = dp[1] + } + if len(dp) >= 3 { + day = dp[2] + } + + t := time.Date(year, time.Month(month), day, 0, 0, 0, 0, time.UTC) + formatted := t.Format("2006-01-02 00:00:00+0000") + return &formatted +} + diff --git a/internal/engines/crossref_test.go b/internal/engines/crossref_test.go new file mode 100644 index 0000000..9d6c6b1 --- /dev/null +++ b/internal/engines/crossref_test.go @@ -0,0 +1,71 @@ +package engines + +import ( + "context" + "net/http" + "testing" + + "github.com/ashie/gosearch/internal/contracts" +) + +func TestCrossrefEngine_Search(t *testing.T) { + transport := roundTripperFunc(func(r *http.Request) (*http.Response, error) { + if r.Method != http.MethodGet { + return httpResponse(http.StatusMethodNotAllowed, "", ""), nil + } + if r.URL.Host != "api.crossref.org" || r.URL.Path != "/works" { + return httpResponse(http.StatusNotFound, "", ""), nil + } + q := r.URL.Query().Get("query") + if q != "hugo" { + return httpResponse(http.StatusBadRequest, "", ""), nil + } + + body := `{ + "message": { + "items": [ + { + "type": "journal-article", + "title": ["Paper B"], + "URL": "https://example.com/paperb", + "abstract": "Abstract B", + "DOI": "10.1234/b", + "published": { + "date-parts": [[2020, 5, 1]] + } + } + ] + } +}` + return httpResponse(http.StatusOK, body, "application/json"), nil + }) + + client := &http.Client{Transport: transport} + engine := &CrossrefEngine{client: client} + + resp, err := engine.Search(context.Background(), contracts.SearchRequest{ + Query: "hugo", 
// Engine is a Go-native implementation of a SearXNG engine.
//
// Implementations should return a SearchResponse containing only the results
// for that engine subset; the caller will merge multiple engine responses.
//
// Name returns the stable engine identifier used in planner/registry maps
// (e.g. "wikipedia", "qwant"). Search executes one query; implementations in
// this package report soft failures (missing configuration, captcha blocks)
// via the response's UnresponsiveEngines field with a nil error, and reserve
// the error return for transport/decoding failures.
type Engine interface {
	Name() string
	Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error)
}
// NewDefaultPortedEngines returns the starter set of Go-native engines.
// The service can swap/extend this registry later as more engines are ported.
//
// A nil client is replaced with a 10-second-timeout default so no engine can
// hang indefinitely. Configuration is read from the environment:
//   - BRAVE_API_KEY: Brave Search API subscription token; when empty the
//     brave engine reports itself unresponsive instead of erroring.
//   - BRAVE_ACCESS_TOKEN: optional gate token callers must present to use
//     the (metered) Brave engine.
func NewDefaultPortedEngines(client *http.Client) map[string]Engine {
	if client == nil {
		client = &http.Client{Timeout: 10 * time.Second}
	}

	return map[string]Engine{
		"wikipedia": &WikipediaEngine{client: client},
		"arxiv":     &ArxivEngine{client: client},
		"crossref":  &CrossrefEngine{client: client},
		"braveapi": &BraveEngine{
			client:          client,
			apiKey:          os.Getenv("BRAVE_API_KEY"),
			accessGateToken: os.Getenv("BRAVE_ACCESS_TOKEN"),
			resultsPerPage:  20,
		},
		// qwant defaults to the HTML "web-lite" mode; the JSON API path
		// frequently answers 403 to non-browser clients (see qwant.go).
		"qwant": &QwantEngine{
			client:         client,
			category:       "web-lite",
			resultsPerPage: 10,
		},
	}
}
NewPlanner(defaultPortedEngines) + } + return NewPlanner(parts) +} + +func NewPlanner(portedEngines []string) *Planner { + set := make(map[string]bool, len(portedEngines)) + out := make([]string, 0, len(portedEngines)) + for _, e := range portedEngines { + e = strings.TrimSpace(strings.ToLower(e)) + if e == "" { + continue + } + if set[e] { + continue + } + set[e] = true + out = append(out, e) + } + return &Planner{ + PortedSet: set, + PortedList: out, + } +} + +// Plan returns: +// - localEngines: engines that are configured as ported for this service +// - upstreamEngines: engines that should be executed by upstream SearXNG +// - requestedEngines: the (possibly inferred) requested engines list +// +// If the request provides an explicit `engines` parameter, we use it. +// Otherwise we infer a small subset from `categories` for the starter set. +func (p *Planner) Plan(req contracts.SearchRequest) (localEngines, upstreamEngines, requestedEngines []string) { + if p == nil { + p = NewPlannerFromEnv() + } + + requestedEngines = nil + if len(req.Engines) > 0 { + requestedEngines = normalizeList(req.Engines) + } else { + requestedEngines = inferFromCategories(req.Categories) + } + + localEngines = make([]string, 0, len(requestedEngines)) + upstreamEngines = make([]string, 0, len(requestedEngines)) + for _, e := range requestedEngines { + if p.PortedSet[e] { + localEngines = append(localEngines, e) + } else { + upstreamEngines = append(upstreamEngines, e) + } + } + + return localEngines, upstreamEngines, requestedEngines +} + +func inferFromCategories(categories []string) []string { + // Minimal mapping for the initial porting subset. + // This mirrors the idea of selecting from SearXNG categories without + // embedding the whole engine registry. 
// inferFromCategories maps SearXNG-style category names onto the starter
// set of ported engines. Only "general" and the science categories are
// recognized; other categories contribute nothing.
func inferFromCategories(categories []string) []string {
	// Minimal mapping for the initial porting subset. This mirrors the idea
	// of selecting from SearXNG categories without embedding the whole
	// engine registry.
	picked := map[string]bool{}
	for _, raw := range categories {
		switch strings.TrimSpace(strings.ToLower(raw)) {
		case "general":
			picked["wikipedia"] = true
			picked["braveapi"] = true
			picked["qwant"] = true
		case "science", "scientific publications":
			picked["arxiv"] = true
			picked["crossref"] = true
		}
	}

	engines := make([]string, 0, len(picked))
	for name := range picked {
		engines = append(engines, name)
	}
	// Deterministic, preferred ordering for the starter set.
	rank := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "arxiv": 3, "crossref": 4}
	sortByOrder(engines, rank)
	return engines
}

// sortByOrder stably sorts list in place by the rank assigned in order.
// The lists involved are tiny, so an insertion sort is sufficient.
func sortByOrder(list []string, order map[string]int) {
	for i := 1; i < len(list); i++ {
		cur := list[i]
		j := i - 1
		for j >= 0 && order[list[j]] > order[cur] {
			list[j+1] = list[j]
			j--
		}
		list[j+1] = cur
	}
}

// normalizeList lowercases, trims, and deduplicates engine names, keeping
// first-seen order and dropping empty entries.
func normalizeList(in []string) []string {
	seen := make(map[string]bool, len(in))
	out := make([]string, 0, len(in))
	for _, raw := range in {
		name := strings.TrimSpace(strings.ToLower(raw))
		if name == "" || seen[name] {
			continue
		}
		seen[name] = true
		out = append(out, name)
	}
	return out
}

// splitCSV splits a comma-separated string into trimmed, non-empty parts.
// An empty input yields nil.
func splitCSV(s string) []string {
	if s == "" {
		return nil
	}
	parts := strings.Split(s, ",")
	out := make([]string, 0, len(parts))
	for _, part := range parts {
		if trimmed := strings.TrimSpace(part); trimmed != "" {
			out = append(out, trimmed)
		}
	}
	return out
}
+// +// Qwant's API is not fully documented; this mirrors SearXNG's parsing logic +// for the `web` category from `.agent/searxng/searx/engines/qwant.py`. +type QwantEngine struct { + client *http.Client + category string // "web" (JSON API) or "web-lite" (HTML fallback) + resultsPerPage int +} + +func (e *QwantEngine) Name() string { return "qwant" } + +func (e *QwantEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) { + if e == nil || e.client == nil { + return contracts.SearchResponse{}, errors.New("qwant engine not initialized") + } + + q := strings.TrimSpace(req.Query) + if q == "" { + return contracts.SearchResponse{Query: req.Query}, nil + } + + // For API parity we use SearXNG web defaults: count=10, offset=(pageno-1)*count. + // The engine's config field exists so we can expand to news/images/videos later. + count := e.resultsPerPage + if count <= 0 { + count = 10 + } + offset := 0 + if req.Pageno > 1 { + offset = (req.Pageno - 1) * count + } + mode := strings.TrimSpace(strings.ToLower(e.category)) + if mode == "" { + mode = "web" + } + + switch mode { + case "web-lite": + return e.searchWebLite(ctx, req) + case "web": + return e.searchWebAPI(ctx, req, count, offset) + default: + // Unknown mode: treat as unresponsive. 
+ return contracts.SearchResponse{ + Query: req.Query, + UnresponsiveEngines: [][2]string{ + {e.Name(), "unknown_qwant_mode"}, + }, + Results: []contracts.MainResult{}, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + }, nil + } +} + +func (e *QwantEngine) searchWebAPI(ctx context.Context, req contracts.SearchRequest, count, offset int) (contracts.SearchResponse, error) { + qLocale := qwantLocale(req.Language) + args := url.Values{} + args.Set("q", req.Query) + args.Set("count", fmt.Sprintf("%d", count)) + args.Set("locale", qLocale) + args.Set("safesearch", fmt.Sprintf("%d", req.Safesearch)) + args.Set("llm", "false") + args.Set("tgp", "3") + args.Set("offset", fmt.Sprintf("%d", offset)) + + endpoint := "https://api.qwant.com/v3/search/web?" + args.Encode() + + httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) + if err != nil { + return contracts.SearchResponse{}, err + } + httpReq.Header.Set("User-Agent", "gosearch-go/0.1 (+https://github.com/ashie/gosearch)") + + resp, err := e.client.Do(httpReq) + if err != nil { + return contracts.SearchResponse{}, err + } + defer resp.Body.Close() + + // Qwant often returns a 403 captcha/JS block for the JSON API. 
+ if resp.StatusCode == http.StatusForbidden { + return contracts.SearchResponse{ + Query: req.Query, + UnresponsiveEngines: [][2]string{ + {e.Name(), "captcha_or_js_block"}, + }, + Results: []contracts.MainResult{}, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + }, nil + } + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024)) + return contracts.SearchResponse{}, fmt.Errorf("qwant upstream error: status=%d body=%q", resp.StatusCode, string(body)) + } + + body, err := io.ReadAll(io.LimitReader(resp.Body, 2*1024*1024)) + if err != nil { + return contracts.SearchResponse{}, err + } + + var top map[string]any + if err := json.Unmarshal(body, &top); err != nil { + return contracts.SearchResponse{}, err + } + + status, _ := top["status"].(string) + if status != "success" { + return contracts.SearchResponse{ + Query: req.Query, + UnresponsiveEngines: [][2]string{ + {e.Name(), "api_error"}, + }, + Results: []contracts.MainResult{}, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + }, nil + } + + data, _ := top["data"].(map[string]any) + result, _ := data["result"].(map[string]any) + items, _ := result["items"].(map[string]any) + mainline := items["mainline"] + + rows := toSlice(mainline) + if len(rows) == 0 { + return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: 0, + Results: []contracts.MainResult{}, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + UnresponsiveEngines: [][2]string{}, + }, nil + } + + results := make([]contracts.MainResult, 0, len(rows)) + for _, row := range rows { + rowMap, ok := row.(map[string]any) + if !ok { + continue + } + + rowType, _ := rowMap["type"].(string) + if rowType == "" { + rowType = "web" + } + if rowType != "web" { + continue + } + if rowType == 
"ads" { + continue + } + + rowItems := toSlice(rowMap["items"]) + for _, it := range rowItems { + itemMap, ok := it.(map[string]any) + if !ok { + continue + } + title := toString(itemMap["title"]) + resURL := toString(itemMap["url"]) + desc := toString(itemMap["desc"]) + if resURL == "" { + continue + } + urlPtr := resURL + results = append(results, contracts.MainResult{ + Template: "default.html", + Title: title, + Content: desc, + URL: &urlPtr, + Engine: e.Name(), + Score: 0, + Category: "general", + Engines: []string{e.Name()}, + }) + } + } + + return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: len(results), + Results: results, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + UnresponsiveEngines: [][2]string{}, + }, nil +} + +func (e *QwantEngine) searchWebLite(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) { + qLocale := qwantLocale(req.Language) + langBase := strings.SplitN(qLocale, "_", 2)[0] + + args := url.Values{} + args.Set("q", req.Query) + args.Set("locale", strings.ToLower(qLocale)) + args.Set("l", langBase) + args.Set("s", fmt.Sprintf("%d", req.Safesearch)) + args.Set("p", fmt.Sprintf("%d", req.Pageno)) + + endpoint := "https://lite.qwant.com/?" 
+ args.Encode() + httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) + if err != nil { + return contracts.SearchResponse{}, err + } + httpReq.Header.Set("User-Agent", "gosearch-go/0.1 (+https://github.com/ashie/gosearch)") + + resp, err := e.client.Do(httpReq) + if err != nil { + return contracts.SearchResponse{}, err + } + defer resp.Body.Close() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024)) + return contracts.SearchResponse{}, fmt.Errorf("qwant lite upstream error: status=%d body=%q", resp.StatusCode, string(body)) + } + + doc, err := goquery.NewDocumentFromReader(resp.Body) + if err != nil { + return contracts.SearchResponse{}, err + } + + results := make([]contracts.MainResult, 0) + seen := map[string]bool{} + + // Pattern 1: legacy/known qwant-lite structure. + doc.Find("section article").Each(func(_ int, item *goquery.Selection) { + // ignore randomly interspersed advertising adds + if item.Find("span.tooltip").Length() > 0 { + return + } + + // In SearXNG: "./span[contains(@class, 'url partner')]" + urlText := strings.TrimSpace(item.Find("span.url.partner").First().Text()) + if urlText == "" { + // fallback: any span with class containing both 'url' and 'partner' + urlText = strings.TrimSpace(item.Find("span[class*='url'][class*='partner']").First().Text()) + } + title := strings.TrimSpace(item.Find("h2 a").First().Text()) + content := strings.TrimSpace(item.Find("p").First().Text()) + + if urlText == "" { + return + } + if seen[urlText] { + return + } + seen[urlText] = true + u := urlText + results = append(results, contracts.MainResult{ + Template: "default.html", + Title: title, + Content: content, + URL: &u, + Engine: e.Name(), + Score: 0, + Category: "general", + Engines: []string{e.Name()}, + }) + }) + + // Pattern 2: broader fallback for updated lite markup: + // any article/list item/div block containing an external anchor. 
+ // We keep this conservative by requiring non-empty title + URL. + doc.Find("article, li, div").Each(func(_ int, item *goquery.Selection) { + if len(results) >= 20 { + return + } + // Skip ad-like blocks in fallback pass too. + if item.Find("span.tooltip").Length() > 0 { + return + } + + // Skip obvious nav/footer blocks. + classAttr, _ := item.Attr("class") + classLower := strings.ToLower(classAttr) + if strings.Contains(classLower, "nav") || strings.Contains(classLower, "footer") { + return + } + + a := item.Find("a[href]").First() + if a.Length() == 0 { + return + } + href, ok := a.Attr("href") + if !ok { + return + } + href = strings.TrimSpace(href) + if href == "" { + return + } + + // Ignore in-page and relative links. + if strings.HasPrefix(href, "/") || strings.HasPrefix(href, "#") { + return + } + if !strings.HasPrefix(href, "http://") && !strings.HasPrefix(href, "https://") { + return + } + // Skip known sponsored partner links surfaced in lite pages. + if isKnownSponsoredURL(href) { + return + } + if isQwantInternalURL(href) { + // Ignore qwant nav/house links. + return + } + + title := strings.TrimSpace(a.Text()) + if title == "" { + return + } + if isLikelyNavTitle(title) { + return + } + + if seen[href] { + return + } + seen[href] = true + + // Best-effort snippet extraction from nearby paragraph/span text. + content := strings.TrimSpace(item.Find("p").First().Text()) + if content == "" { + content = strings.TrimSpace(item.Find("span").First().Text()) + } + // If there is no snippet, still keep clearly external result links. + // Qwant-lite frequently omits rich snippets for some entries. 
+ + u := href + results = append(results, contracts.MainResult{ + Template: "default.html", + Title: title, + Content: content, + URL: &u, + Engine: e.Name(), + Score: 0, + Category: "general", + Engines: []string{e.Name()}, + }) + }) + + return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: len(results), + Results: results, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + UnresponsiveEngines: [][2]string{}, + }, nil +} + +func qwantLocale(lang string) string { + lang = strings.TrimSpace(lang) + if lang == "" || lang == "auto" { + return "en_US" + } + lang = strings.ReplaceAll(lang, "-", "_") + parts := strings.SplitN(lang, "_", 2) + base := strings.ToLower(parts[0]) + country := "US" + if len(parts) == 2 && strings.TrimSpace(parts[1]) != "" { + country = strings.ToUpper(strings.TrimSpace(parts[1])) + } + // Qwant expects locales like en_US. + return base + "_" + country +} + +func toSlice(v any) []any { + switch t := v.(type) { + case []any: + return t + default: + // Handle case where mainline might be a single object. 
// toSlice normalizes a decoded JSON value into a []any: slices pass
// through, a single object is wrapped in a one-element slice (Qwant's
// mainline is sometimes a single object rather than a list), and anything
// else yields nil. The map case is folded into the type switch; the old
// default-branch assertion was equivalent but harder to read.
func toSlice(v any) []any {
	switch t := v.(type) {
	case []any:
		return t
	case map[string]any:
		return []any{t}
	default:
		return nil
	}
}

// toString extracts a string from a decoded JSON value; json.Number values
// render as their literal representation, anything else yields "".
//
// NOTE(review): plain json.Unmarshal decodes numbers as float64, not
// json.Number, so the Number branch only fires with a UseNumber() decoder —
// confirm whether any caller relies on it.
func toString(v any) string {
	switch t := v.(type) {
	case string:
		return t
	case json.Number:
		return t.String()
	default:
		return ""
	}
}

// isQwantInternalURL reports whether raw points at qwant.com itself (any
// subdomain included), so house/navigation links can be filtered out of
// scraped lite pages. The ".qwant.com" suffix check covers every subdomain;
// the previous explicit www./about. comparisons were redundant.
func isQwantInternalURL(raw string) bool {
	u, err := url.Parse(raw)
	if err != nil {
		return false
	}
	host := strings.ToLower(u.Hostname())
	if host == "" {
		return false
	}
	return host == "qwant.com" || strings.HasSuffix(host, ".qwant.com")
}

// isLikelyNavTitle reports whether an anchor's text looks like Qwant
// navigation/marketing copy rather than an organic result title.
func isLikelyNavTitle(title string) bool {
	t := strings.TrimSpace(strings.ToLower(title))
	switch t {
	case "qwant search", "search", "privacy", "discover the service", "better web", "discover":
		return true
	}
	if strings.HasPrefix(t, "get 20gb of free storage") {
		return true
	}
	return false
}

// isKnownSponsoredURL reports whether raw is a known sponsored/partner link
// observed on qwant-lite pages (shdw.me hosts and qwant-tool promo URLs).
func isKnownSponsoredURL(raw string) bool {
	u, err := url.Parse(raw)
	if err != nil {
		return false
	}
	host := strings.ToLower(u.Hostname())
	switch host {
	case "shdw.me", "www.shdw.me":
		return true
	}
	if strings.Contains(strings.ToLower(raw), "qwant-tool") {
		return true
	}
	return false
}
q != "hugo" { + t.Fatalf("unexpected q: %q", q) + } + if r.URL.Query().Get("locale") != "en_us" { + t.Fatalf("unexpected locale: %q", r.URL.Query().Get("locale")) + } + if r.URL.Query().Get("l") != "en" { + t.Fatalf("unexpected l: %q", r.URL.Query().Get("l")) + } + if r.URL.Query().Get("s") != "0" { + t.Fatalf("unexpected s: %q", r.URL.Query().Get("s")) + } + if r.URL.Query().Get("p") != "1" { + t.Fatalf("unexpected p: %q", r.URL.Query().Get("p")) + } + + body := ` + + + +
+
+ https://example.com/q +

Qwant Title

+

Qwant description

+
+
+ ad + https://example.com/ad +

Ad Title

+

Ad description

+
+
+ +` + + return httpResponse(http.StatusOK, body, "text/html"), nil + }) + + client := &http.Client{Transport: transport} + engine := &QwantEngine{client: client, category: "web-lite", resultsPerPage: 10} + + resp, err := engine.Search(context.Background(), contracts.SearchRequest{ + Query: "hugo", + Pageno: 1, + Safesearch: 0, + Language: "en", + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(resp.Results) != 1 { + t.Fatalf("expected 1 result (non-ad), got %d", len(resp.Results)) + } + if resp.Results[0].Title != "Qwant Title" { + t.Fatalf("unexpected title: %q", resp.Results[0].Title) + } + if resp.Results[0].Content != "Qwant description" { + t.Fatalf("unexpected content: %q", resp.Results[0].Content) + } + if resp.Results[0].URL == nil || *resp.Results[0].URL != "https://example.com/q" { + t.Fatalf("unexpected url: %v", resp.Results[0].URL) + } +} + diff --git a/internal/engines/qwant_test.go b/internal/engines/qwant_test.go new file mode 100644 index 0000000..b19d184 --- /dev/null +++ b/internal/engines/qwant_test.go @@ -0,0 +1,94 @@ +package engines + +import ( + "context" + "net/http" + "testing" + + "github.com/ashie/gosearch/internal/contracts" +) + +func TestQwantEngine_Web(t *testing.T) { + transport := roundTripperFunc(func(r *http.Request) (*http.Response, error) { + if r.Method != http.MethodGet { + return httpResponse(http.StatusMethodNotAllowed, "", ""), nil + } + if r.URL.Host != "api.qwant.com" { + return httpResponse(http.StatusNotFound, "", ""), nil + } + if r.URL.Path != "/v3/search/web" { + t.Fatalf("unexpected path: %s", r.URL.Path) + } + + q := r.URL.Query().Get("q") + if q != "hugo" { + t.Fatalf("unexpected q: %q", q) + } + if r.URL.Query().Get("count") != "10" { + t.Fatalf("unexpected count: %q", r.URL.Query().Get("count")) + } + if r.URL.Query().Get("locale") != "en_US" { + t.Fatalf("unexpected locale: %q", r.URL.Query().Get("locale")) + } + if r.URL.Query().Get("safesearch") != "0" { + t.Fatalf("unexpected 
// TestQwantEngine_Web exercises the JSON-API ("web") mode end to end with a
// stubbed transport: it asserts the exact parameter set searchWebAPI builds
// for api.qwant.com/v3/search/web and the mapping of one mainline "web"
// item onto a MainResult.
func TestQwantEngine_Web(t *testing.T) {
	transport := roundTripperFunc(func(r *http.Request) (*http.Response, error) {
		if r.Method != http.MethodGet {
			return httpResponse(http.StatusMethodNotAllowed, "", ""), nil
		}
		if r.URL.Host != "api.qwant.com" {
			return httpResponse(http.StatusNotFound, "", ""), nil
		}
		if r.URL.Path != "/v3/search/web" {
			t.Fatalf("unexpected path: %s", r.URL.Path)
		}

		// Pin every query parameter the engine is expected to send.
		q := r.URL.Query().Get("q")
		if q != "hugo" {
			t.Fatalf("unexpected q: %q", q)
		}
		if r.URL.Query().Get("count") != "10" {
			t.Fatalf("unexpected count: %q", r.URL.Query().Get("count"))
		}
		if r.URL.Query().Get("locale") != "en_US" {
			t.Fatalf("unexpected locale: %q", r.URL.Query().Get("locale"))
		}
		if r.URL.Query().Get("safesearch") != "0" {
			t.Fatalf("unexpected safesearch: %q", r.URL.Query().Get("safesearch"))
		}
		if r.URL.Query().Get("llm") != "false" {
			t.Fatalf("unexpected llm: %q", r.URL.Query().Get("llm"))
		}
		if r.URL.Query().Get("tgp") != "3" {
			t.Fatalf("unexpected tgp: %q", r.URL.Query().Get("tgp"))
		}
		if r.URL.Query().Get("offset") != "0" {
			t.Fatalf("unexpected offset: %q", r.URL.Query().Get("offset"))
		}

		// Minimal successful v3 payload: one mainline row of type "web"
		// with a single organic item.
		body := `{
	"status": "success",
	"data": {
		"result": {
			"items": {
				"mainline": [
					{
						"type": "web",
						"items": [
							{ "title": "Qwant Title", "url": "https://example.com/q", "desc": "Qwant description" }
						]
					}
				]
			}
		}
	}
}`
		return httpResponse(http.StatusOK, body, "application/json"), nil
	})

	client := &http.Client{Transport: transport}
	engine := &QwantEngine{client: client, category: "web", resultsPerPage: 10}

	resp, err := engine.Search(context.Background(), contracts.SearchRequest{
		Query:      "hugo",
		Pageno:     1,
		Safesearch: 0,
		Language:   "en",
	})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(resp.Results) != 1 {
		t.Fatalf("expected 1 result, got %d", len(resp.Results))
	}
	if resp.Results[0].Title != "Qwant Title" {
		t.Fatalf("unexpected title: %q", resp.Results[0].Title)
	}
	if resp.Results[0].Content != "Qwant description" {
		t.Fatalf("unexpected content: %q", resp.Results[0].Content)
	}
	if resp.Results[0].URL == nil || *resp.Results[0].URL != "https://example.com/q" {
		t.Fatalf("unexpected url: %v", resp.Results[0].URL)
	}
	if resp.Results[0].Engine != "qwant" {
		t.Fatalf("unexpected engine: %q", resp.Results[0].Engine)
	}
}
client *http.Client +} + +func (e *WikipediaEngine) Name() string { return "wikipedia" } + +func (e *WikipediaEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) { + if e == nil || e.client == nil { + return contracts.SearchResponse{}, errors.New("wikipedia engine not initialized") + } + if strings.TrimSpace(req.Query) == "" { + return contracts.SearchResponse{Query: req.Query}, nil + } + + lang := strings.TrimSpace(req.Language) + if lang == "" || lang == "auto" { + lang = "en" + } + // Wikipedia subdomains are based on the language code; keep it simple for MVP. + lang = strings.SplitN(lang, "-", 2)[0] + lang = strings.ReplaceAll(lang, "_", "-") + wikiNetloc := fmt.Sprintf("%s.wikipedia.org", lang) + + endpoint := fmt.Sprintf( + "https://%s/api/rest_v1/page/summary/%s", + wikiNetloc, + url.PathEscape(req.Query), + ) + + httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) + if err != nil { + return contracts.SearchResponse{}, err + } + // Wikimedia APIs require a descriptive User-Agent. + httpReq.Header.Set( + "User-Agent", + "gosearch-go/0.1 (compatible; +https://github.com/ashie/gosearch)", + ) + // Best-effort: hint content language. 
+ if req.Language != "" && req.Language != "auto" { + httpReq.Header.Set("Accept-Language", req.Language) + } + + resp, err := e.client.Do(httpReq) + if err != nil { + return contracts.SearchResponse{}, err + } + defer resp.Body.Close() + + if resp.StatusCode == http.StatusNotFound { + return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: 0, + Results: []contracts.MainResult{}, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + UnresponsiveEngines: [][2]string{}, + }, nil + } + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024)) + return contracts.SearchResponse{}, fmt.Errorf("wikipedia upstream error: status=%d body=%q", resp.StatusCode, string(body)) + } + + var api struct { + Title string `json:"title"` + Description string `json:"description"` + Titles struct { + Display string `json:"display"` + } `json:"titles"` + ContentURLs struct { + Desktop struct { + Page string `json:"page"` + } `json:"desktop"` + } `json:"content_urls"` + } + + if err := json.NewDecoder(resp.Body).Decode(&api); err != nil { + return contracts.SearchResponse{}, err + } + + pageURL := api.ContentURLs.Desktop.Page + if pageURL == "" { + // API returned a non-standard payload; treat as no result. 
+ return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: 0, + Results: []contracts.MainResult{}, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + UnresponsiveEngines: [][2]string{}, + }, nil + } + + title := api.Titles.Display + if title == "" { + title = api.Title + } + + content := api.Description + + urlPtr := pageURL + pub := (*string)(nil) + + results := []contracts.MainResult{ + { + Template: "default.html", + Title: title, + Content: content, + URL: &urlPtr, + Pubdate: pub, + Engine: "wikipedia", + Score: 0, + Category: "general", + Priority: "", + Positions: nil, + Engines: []string{"wikipedia"}, + }, + } + + return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: len(results), + Results: results, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + UnresponsiveEngines: [][2]string{}, + }, nil +} + diff --git a/internal/engines/wikipedia_test.go b/internal/engines/wikipedia_test.go new file mode 100644 index 0000000..b2dc230 --- /dev/null +++ b/internal/engines/wikipedia_test.go @@ -0,0 +1,61 @@ +package engines + +import ( + "context" + "net/http" + "testing" + + "github.com/ashie/gosearch/internal/contracts" +) + +func TestWikipediaEngine_Search(t *testing.T) { + transport := roundTripperFunc(func(r *http.Request) (*http.Response, error) { + if r.Method != http.MethodGet { + return httpResponse(http.StatusMethodNotAllowed, "", ""), nil + } + if r.URL.Host != "en.wikipedia.org" { + return httpResponse(http.StatusNotFound, "", ""), nil + } + + if r.URL.Path != "/api/rest_v1/page/summary/Taxi" { + return httpResponse(http.StatusNotFound, "", ""), nil + } + + body := `{ + "title": "Taxi", + "description": "A car", + "titles": { "display": "Taxi" }, + "content_urls": { "desktop": { "page": "https://en.wikipedia.org/wiki/Taxi" } } +}` + return httpResponse(http.StatusOK, body, 
"application/json"), nil + }) + + client := &http.Client{Transport: transport} + engine := &WikipediaEngine{client: client} + + resp, err := engine.Search(context.Background(), contracts.SearchRequest{ + Query: "Taxi", + Pageno: 1, + Language: "en", + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(resp.Results) != 1 { + t.Fatalf("expected 1 result, got %d", len(resp.Results)) + } + r := resp.Results[0] + if r.Title != "Taxi" { + t.Fatalf("expected title Taxi, got %q", r.Title) + } + if r.Content != "A car" { + t.Fatalf("expected content, got %q", r.Content) + } + if r.URL == nil || *r.URL == "" { + t.Fatalf("expected url, got nil/empty") + } + if *r.URL != "https://en.wikipedia.org/wiki/Taxi" { + t.Fatalf("unexpected url: %q", *r.URL) + } +} + diff --git a/internal/httpapi/handlers.go b/internal/httpapi/handlers.go new file mode 100644 index 0000000..0b31e9b --- /dev/null +++ b/internal/httpapi/handlers.go @@ -0,0 +1,41 @@ +package httpapi + +import ( + "net/http" + + "github.com/ashie/gosearch/internal/search" +) + +type Handler struct { + searchSvc *search.Service +} + +func NewHandler(searchSvc *search.Service) *Handler { + return &Handler{searchSvc: searchSvc} +} + +func (h *Handler) Healthz(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain; charset=utf-8") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("OK")) +} + +func (h *Handler) Search(w http.ResponseWriter, r *http.Request) { + req, err := search.ParseSearchRequest(r) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + resp, err := h.searchSvc.Search(r.Context(), req) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + if err := search.WriteSearchResponse(w, req.Format, resp); err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } +} + diff --git a/internal/search/merge.go b/internal/search/merge.go new file mode 100644 index 
0000000..344a78a --- /dev/null +++ b/internal/search/merge.go @@ -0,0 +1,121 @@ +package search + +import ( + "encoding/json" + "net/url" + "strings" + + "github.com/ashie/gosearch/internal/contracts" +) + +// MergeResponses merges multiple SearXNG-compatible JSON responses. +// +// MVP merge semantics: +// - results are concatenated with a simple de-dup key (engine|title|url) +// - suggestions/corrections are de-duplicated as sets +// - answers/infoboxes/unresponsive_engines are concatenated (best-effort) +func MergeResponses(responses []contracts.SearchResponse) contracts.SearchResponse { + var merged contracts.SearchResponse + + mergedResultSeen := map[string]struct{}{} + mergedAnswerSeen := map[string]struct{}{} + mergedCorrectionsSeen := map[string]struct{}{} + mergedSuggestionsSeen := map[string]struct{}{} + + for _, r := range responses { + if merged.Query == "" { + merged.Query = r.Query + } + + merged.NumberOfResults = maxInt(merged.NumberOfResults, r.NumberOfResults) + + for _, mr := range r.Results { + key := resultDedupKey(mr) + if _, ok := mergedResultSeen[key]; ok { + continue + } + mergedResultSeen[key] = struct{}{} + merged.Results = append(merged.Results, mr) + } + + for _, ans := range r.Answers { + // De-dup by normalized JSON when possible. + b, err := json.Marshal(ans) + if err != nil { + merged.Answers = append(merged.Answers, ans) + continue + } + key := string(b) + if _, ok := mergedAnswerSeen[key]; ok { + continue + } + mergedAnswerSeen[key] = struct{}{} + merged.Answers = append(merged.Answers, ans) + } + + merged.Corrections = unionStrings(merged.Corrections, r.Corrections, &mergedCorrectionsSeen) + merged.Suggestions = unionStrings(merged.Suggestions, r.Suggestions, &mergedSuggestionsSeen) + + merged.Infoboxes = append(merged.Infoboxes, r.Infoboxes...) + merged.UnresponsiveEngines = append(merged.UnresponsiveEngines, r.UnresponsiveEngines...) + } + + // Ensure non-nil slices to keep JSON shape stable. 
+ if merged.Results == nil { + merged.Results = []contracts.MainResult{} + } + if merged.Answers == nil { + merged.Answers = []map[string]any{} + } + if merged.Corrections == nil { + merged.Corrections = []string{} + } + if merged.Infoboxes == nil { + merged.Infoboxes = []map[string]any{} + } + if merged.Suggestions == nil { + merged.Suggestions = []string{} + } + if merged.UnresponsiveEngines == nil { + merged.UnresponsiveEngines = [][2]string{} + } + + return merged +} + +func resultDedupKey(r contracts.MainResult) string { + urlStr := "" + if r.URL != nil { + urlStr = *r.URL + } + // Normalize host to reduce duplicates. + if u, err := url.Parse(urlStr); err == nil { + if u.Host != "" { + urlStr = u.Host + u.Path + } + } + return strings.ToLower(r.Engine) + "|" + strings.ToLower(r.Title) + "|" + urlStr +} + +func unionStrings(dst []string, src []string, seen *map[string]struct{}) []string { + if *seen == nil { + *seen = map[string]struct{}{} + } + out := dst + for _, s := range src { + if _, ok := (*seen)[s]; ok { + continue + } + (*seen)[s] = struct{}{} + out = append(out, s) + } + return out +} + +func maxInt(a, b int) int { + if a > b { + return a + } + return b +} + diff --git a/internal/search/merge_test.go b/internal/search/merge_test.go new file mode 100644 index 0000000..36850ca --- /dev/null +++ b/internal/search/merge_test.go @@ -0,0 +1,80 @@ +package search + +import ( + "strings" + "testing" + + "github.com/ashie/gosearch/internal/contracts" +) + +func TestMergeResponses_DedupResultsAndSets(t *testing.T) { + url1 := "https://example.com/a?x=1" + uPtr := &url1 + + r1 := contracts.SearchResponse{ + Query: "q", + NumberOfResults: 1, + Results: []contracts.MainResult{ + { + Template: "default.html", + Title: "Title1", + Content: "C1", + URL: uPtr, + Engine: "wikipedia", + Score: 1.0, + }, + }, + Answers: []map[string]any{{"title": "A1", "url": url1}}, + Corrections: []string{"corr1", "corr2"}, + Suggestions: []string{"s1", "s2"}, + Infoboxes: 
[]map[string]any{}, + UnresponsiveEngines: [][2]string{}, + } + + r2 := contracts.SearchResponse{ + Query: "q", + NumberOfResults: 1, + Results: []contracts.MainResult{ + { + Template: "default.html", + Title: "Title1", + Content: "C2", + URL: uPtr, + Engine: "wikipedia", + Score: 2.0, + }, + }, + Answers: []map[string]any{{"title": "A1", "url": url1}}, + Corrections: []string{"corr2", "corr3"}, + Suggestions: []string{"s2", "s3"}, + Infoboxes: []map[string]any{}, + UnresponsiveEngines: [][2]string{}, + } + + merged := MergeResponses([]contracts.SearchResponse{r1, r2}) + + if merged.Query != "q" { + t.Fatalf("expected query q, got %q", merged.Query) + } + if merged.NumberOfResults != 1 { + t.Fatalf("expected number_of_results max=1, got %d", merged.NumberOfResults) + } + if len(merged.Results) != 1 { + t.Fatalf("expected 1 merged result, got %d", len(merged.Results)) + } + + // Corrections/suggestions should be unioned. + joinedCorr := strings.Join(merged.Corrections, ",") + if !strings.Contains(joinedCorr, "corr1") || !strings.Contains(joinedCorr, "corr2") || !strings.Contains(joinedCorr, "corr3") { + t.Fatalf("expected unioned corrections, got %v", merged.Corrections) + } + joinedSug := strings.Join(merged.Suggestions, ",") + if !strings.Contains(joinedSug, "s1") || !strings.Contains(joinedSug, "s2") || !strings.Contains(joinedSug, "s3") { + t.Fatalf("expected unioned suggestions, got %v", merged.Suggestions) + } + + if len(merged.Answers) != 1 { + t.Fatalf("expected 1 merged answer, got %d", len(merged.Answers)) + } +} + diff --git a/internal/search/request_params.go b/internal/search/request_params.go new file mode 100644 index 0000000..9e4fd55 --- /dev/null +++ b/internal/search/request_params.go @@ -0,0 +1,206 @@ +package search + +import ( + "errors" + "net/http" + "regexp" + "strconv" + "strings" +) + +var languageCodeRe = regexp.MustCompile(`^[a-z]{2,3}(-[a-zA-Z]{2})?$`) + +func ParseSearchRequest(r *http.Request) (SearchRequest, error) { + // SearXNG 
supports both GET and POST and relies on form values for routing. + if err := r.ParseForm(); err != nil { + return SearchRequest{}, errors.New("invalid request: cannot parse form") + } + + format := strings.ToLower(r.FormValue("format")) + switch OutputFormat(format) { + case FormatJSON, FormatCSV, FormatRSS: + default: + // MVP: treat everything else as json, except `html` which we accept for compatibility. + if format == string(FormatHTML) { + // accepted, but not implemented by the server yet + } else { + format = string(FormatJSON) + } + } + + q := r.FormValue("q") + if strings.TrimSpace(q) == "" { + return SearchRequest{}, errors.New("missing required parameter: q") + } + + pageno := 1 + if s := strings.TrimSpace(r.FormValue("pageno")); s != "" { + n, err := strconv.Atoi(s) + if err != nil || n < 1 { + return SearchRequest{}, errors.New("invalid parameter: pageno") + } + pageno = n + } + + // MVP defaults. + safesearch := 0 + if s := strings.TrimSpace(r.FormValue("safesearch")); s != "" { + n, err := strconv.Atoi(s) + if err != nil || n < 0 || n > 2 { + return SearchRequest{}, errors.New("invalid parameter: safesearch") + } + safesearch = n + } + + var timeRange *string + if tr := strings.TrimSpace(r.FormValue("time_range")); tr != "" && tr != "None" { + switch tr { + case "day", "week", "month", "year": + tt := tr + timeRange = &tt + default: + return SearchRequest{}, errors.New("invalid parameter: time_range") + } + } + + var timeoutLimit *float64 + if s := strings.TrimSpace(r.FormValue("timeout_limit")); s != "" && s != "None" { + v, err := strconv.ParseFloat(s, 64) + if err != nil || v <= 0 { + return SearchRequest{}, errors.New("invalid parameter: timeout_limit") + } + timeoutLimit = &v + } + + language := strings.TrimSpace(r.FormValue("language")) + if language == "" { + language = "auto" + } + switch language { + case "auto", "all": + // ok + default: + if !languageCodeRe.MatchString(language) { + return SearchRequest{}, errors.New("invalid parameter: 
language") + } + } + + // engines is an explicit list of engine names. + engines := splitCSV(strings.TrimSpace(r.FormValue("engines"))) + + // categories and category_ params mirror SearXNG's webadapter parsing. + // We don't validate against a registry here; we just preserve the requested values. + catSet := map[string]bool{} + if catsParam := strings.TrimSpace(r.FormValue("categories")); catsParam != "" { + for _, cat := range splitCSV(catsParam) { + catSet[cat] = true + } + } + for k, v := range r.Form { + if !strings.HasPrefix(k, "category_") { + continue + } + category := strings.TrimPrefix(k, "category_") + if category == "" { + continue + } + val := "" + if len(v) > 0 { + val = strings.TrimSpace(v[0]) + } + if val == "" || val != "off" { + catSet[category] = true + } else { + delete(catSet, category) + } + } + categories := make([]string, 0, len(catSet)) + for c := range catSet { + categories = append(categories, c) + } + if len(categories) == 0 { + categories = []string{"general"} + } + + // Parse engine_data--= parameters. + engineData := map[string]map[string]string{} + for k, v := range r.Form { + if !strings.HasPrefix(k, "engine_data-") { + continue + } + parts := strings.SplitN(k, "-", 3) // engine_data-- + if len(parts) != 3 { + continue + } + engine := parts[1] + key := parts[2] + // For HTML forms, r.Form[k] can contain multiple values; keep first. 
+ val := "" + if len(v) > 0 { + val = v[0] + } + if _, ok := engineData[engine]; !ok { + engineData[engine] = map[string]string{} + } + engineData[engine][key] = val + } + + accessToken := parseAccessToken(r) + + return SearchRequest{ + Format: OutputFormat(format), + Query: q, + Pageno: pageno, + Safesearch: safesearch, + TimeRange: timeRange, + TimeoutLimit: timeoutLimit, + Language: language, + Engines: engines, + Categories: categories, + EngineData: engineData, + AccessToken: accessToken, + }, nil +} + +func splitCSV(s string) []string { + if s == "" { + return nil + } + raw := strings.Split(s, ",") + out := make([]string, 0, len(raw)) + for _, item := range raw { + item = strings.TrimSpace(item) + if item == "" { + continue + } + out = append(out, item) + } + return out +} + +func parseAccessToken(r *http.Request) string { + // Supported sources (first non-empty wins): + // - `Authorization: Bearer ` + // - `X-Search-Token` / `X-Brave-Access-Token` + // - `token` form value + if auth := r.Header.Get("Authorization"); auth != "" { + const prefix = "Bearer " + if len(auth) > len(prefix) && auth[:len(prefix)] == prefix { + return strings.TrimSpace(auth[len(prefix):]) + } + } + + if v := strings.TrimSpace(r.Header.Get("X-Search-Token")); v != "" { + return v + } + if v := strings.TrimSpace(r.Header.Get("X-Brave-Access-Token")); v != "" { + return v + } + + if v := strings.TrimSpace(r.FormValue("token")); v != "" { + return v + } + + return "" +} + diff --git a/internal/search/request_params_test.go b/internal/search/request_params_test.go new file mode 100644 index 0000000..a14911f --- /dev/null +++ b/internal/search/request_params_test.go @@ -0,0 +1,74 @@ +package search + +import ( + "net/http" + "net/http/httptest" + "net/url" + "strings" + "testing" +) + +func TestParseSearchRequest_MissingQ(t *testing.T) { + r := httptest.NewRequest(http.MethodGet, "/search?format=json", nil) + _, err := ParseSearchRequest(r) + if err == nil { + t.Fatalf("expected error, got 
nil") + } +} + +func TestParseSearchRequest_InvalidPageno(t *testing.T) { + r := httptest.NewRequest(http.MethodGet, "/search?q=hi&pageno=0", nil) + _, err := ParseSearchRequest(r) + if err == nil { + t.Fatalf("expected error for pageno, got nil") + } +} + +func TestParseSearchRequest_InvalidLanguage(t *testing.T) { + r := httptest.NewRequest(http.MethodGet, "/search?q=hi&language=bad!", nil) + _, err := ParseSearchRequest(r) + if err == nil { + t.Fatalf("expected error for language, got nil") + } +} + +func TestParseSearchRequest_CategoriesAndEngineData(t *testing.T) { + values := url.Values{} + values.Set("q", "hello") + values.Set("format", "json") + values.Set("categories", "general,science") + values.Set("category_science", "off") + values.Set("engines", "wikipedia,arxiv") + values.Set("engine_data-wikipedia-timeout", "123") + + r := httptest.NewRequest(http.MethodPost, "/search", strings.NewReader(values.Encode())) + r.Header.Set("Content-Type", "application/x-www-form-urlencoded") + + req, err := ParseSearchRequest(r) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // categories should drop `science` due to category_science=off + wantCats := map[string]bool{"general": true} + gotCats := map[string]bool{} + for _, c := range req.Categories { + gotCats[c] = true + } + for c := range wantCats { + if !gotCats[c] { + t.Fatalf("expected category %q in result, got %v", c, req.Categories) + } + } + if gotCats["science"] { + t.Fatalf("expected category science to be removed, got %v", req.Categories) + } + + if len(req.Engines) != 2 { + t.Fatalf("expected 2 engines, got %v", req.Engines) + } + if req.EngineData["wikipedia"]["timeout"] != "123" { + t.Fatalf("expected engine_data parsed, got %#v", req.EngineData) + } +} + diff --git a/internal/search/response.go b/internal/search/response.go new file mode 100644 index 0000000..3b07096 --- /dev/null +++ b/internal/search/response.go @@ -0,0 +1,223 @@ +package search + +import ( + "bytes" + 
"encoding/csv" + "encoding/json" + "fmt" + "net/http" + "net/url" + "encoding/xml" + "strconv" + "strings" +) + +func WriteSearchResponse(w http.ResponseWriter, format OutputFormat, resp SearchResponse) error { + switch format { + case FormatJSON: + w.Header().Set("Content-Type", "application/json; charset=utf-8") + return json.NewEncoder(w).Encode(resp) + case FormatCSV: + w.Header().Set("Content-Type", "text/csv; charset=utf-8") + if err := writeCSV(w, resp); err != nil { + return err + } + return nil + case FormatRSS: + w.Header().Set("Content-Type", "text/xml; charset=utf-8") + if err := writeRSS(w, resp); err != nil { + return err + } + return nil + case FormatHTML: + w.WriteHeader(http.StatusNotImplemented) + _, _ = w.Write([]byte("format=html not implemented yet")) + return nil + default: + return fmt.Errorf("unsupported format: %s", format) + } +} + +// csvRowHeader matches the SearXNG CSV writer key order. +var csvRowHeader = []string{"title", "url", "content", "host", "engine", "score", "type"} + +func writeCSV(w http.ResponseWriter, resp SearchResponse) error { + cw := csv.NewWriter(w) + defer cw.Flush() + + if err := cw.Write(csvRowHeader); err != nil { + return err + } + + for _, r := range resp.Results { + urlStr := "" + if r.URL != nil { + urlStr = *r.URL + } + host := hostFromURL(urlStr) + scoreStr := strconv.FormatFloat(r.Score, 'f', -1, 64) + row := []string{ + r.Title, + urlStr, + r.Content, + host, + r.Engine, + scoreStr, + "result", + } + if err := cw.Write(row); err != nil { + return err + } + } + + for _, ans := range resp.Answers { + title := asString(ans["title"]) + urlStr := asString(ans["url"]) + content := asString(ans["content"]) + engine := asString(ans["engine"]) + scoreStr := scoreString(ans["score"]) + host := hostFromURL(urlStr) + + row := []string{ + title, + urlStr, + content, + host, + engine, + scoreStr, + "answer", + } + if err := cw.Write(row); err != nil { + return err + } + } + + for _, s := range resp.Suggestions { + row 
:= []string{s, "", "", "", "", "", "suggestion"} + if err := cw.Write(row); err != nil { + return err + } + } + + for _, c := range resp.Corrections { + row := []string{c, "", "", "", "", "", "correction"} + if err := cw.Write(row); err != nil { + return err + } + } + + return nil +} + +func writeRSS(w http.ResponseWriter, resp SearchResponse) error { + q := resp.Query + escapedTitle := xmlEscape("SearXNG search: " + q) + escapedDesc := xmlEscape("Search results for \"" + q + "\" - SearXNG") + escapedQueryTerms := xmlEscape(q) + + link := "/search?q=" + url.QueryEscape(q) + opensearchQuery := fmt.Sprintf(``, escapedQueryTerms) + + // SearXNG template uses the number of results for both totalResults and itemsPerPage. + nr := resp.NumberOfResults + + var items bytes.Buffer + for _, r := range resp.Results { + title := xmlEscape(r.Title) + urlStr := "" + if r.URL != nil { + urlStr = *r.URL + } + linkEsc := xmlEscape(urlStr) + desc := xmlEscape(r.Content) + + pub := "" + if r.Pubdate != nil && strings.TrimSpace(*r.Pubdate) != "" { + pub = "" + xmlEscape(*r.Pubdate) + "" + } + + items.WriteString( + fmt.Sprintf( + `%sresult%s%s%s`, + title, + linkEsc, + desc, + pub, + ), + ) + } + + xml := fmt.Sprintf( + ` + + + + %s + %s + %s + %d + 1 + %d + + +`, + escapedTitle, + xmlEscape(link), + escapedDesc, + nr, + nr, + opensearchQuery, + items.String(), + ) + + _, err := w.Write([]byte(xml)) + return err +} + +func xmlEscape(s string) string { + var b bytes.Buffer + _ = xml.EscapeText(&b, []byte(s)) + return b.String() +} + +func hostFromURL(urlStr string) string { + if strings.TrimSpace(urlStr) == "" { + return "" + } + u, err := url.Parse(urlStr) + if err != nil { + return "" + } + return u.Host +} + +func asString(v any) string { + s, _ := v.(string) + return s +} + +func scoreString(v any) string { + switch t := v.(type) { + case float64: + return strconv.FormatFloat(t, 'f', -1, 64) + case float32: + return strconv.FormatFloat(float64(t), 'f', -1, 64) + case int: + return 
strconv.Itoa(t) + case int64: + return strconv.FormatInt(t, 10) + case json.Number: + if f, err := t.Float64(); err == nil { + return strconv.FormatFloat(f, 'f', -1, 64) + } + return "" + default: + return "" + } +} + diff --git a/internal/search/service.go b/internal/search/service.go new file mode 100644 index 0000000..a2cbdd2 --- /dev/null +++ b/internal/search/service.go @@ -0,0 +1,111 @@ +package search + +import ( + "context" + "net/http" + "time" + + "github.com/ashie/gosearch/internal/engines" + "github.com/ashie/gosearch/internal/contracts" + "github.com/ashie/gosearch/internal/upstream" +) + +type ServiceConfig struct { + UpstreamURL string + HTTPTimeout time.Duration +} + +type Service struct { + upstreamClient *upstream.Client + planner *engines.Planner + localEngines map[string]engines.Engine +} + +func NewService(cfg ServiceConfig) *Service { + timeout := cfg.HTTPTimeout + if timeout <= 0 { + timeout = 10 * time.Second + } + + httpClient := &http.Client{Timeout: timeout} + + var up *upstream.Client + if cfg.UpstreamURL != "" { + c, err := upstream.NewClient(cfg.UpstreamURL, timeout) + if err == nil { + up = c + } + } + + return &Service{ + upstreamClient: up, + planner: engines.NewPlannerFromEnv(), + localEngines: engines.NewDefaultPortedEngines(httpClient), + } +} + +func (s *Service) Search(ctx context.Context, req SearchRequest) (SearchResponse, error) { + localEngines, upstreamEngines, _ := s.planner.Plan(req) + + responses := make([]contracts.SearchResponse, 0, 2) + upstreamSet := map[string]bool{} + for _, e := range upstreamEngines { + upstreamSet[e] = true + } + + for _, engineName := range localEngines { + eng, ok := s.localEngines[engineName] + if !ok { + continue + } + r, err := eng.Search(ctx, req) + if err != nil { + // MVP: fail fast so the client sees a real error. 
+ return SearchResponse{}, err + } + responses = append(responses, r) + + // Some engines (notably qwant due to anti-bot protections) can return + // zero local results depending on client/IP. If upstream SearXNG is + // configured, let it attempt the same engine as a fallback. + if shouldFallbackToUpstream(engineName, r) && !upstreamSet[engineName] { + upstreamEngines = append(upstreamEngines, engineName) + upstreamSet[engineName] = true + } + } + + if s.upstreamClient != nil && len(upstreamEngines) > 0 { + r, err := s.upstreamClient.SearchJSON(ctx, req, upstreamEngines) + if err != nil { + return SearchResponse{}, err + } + responses = append(responses, r) + } + + if len(responses) == 0 { + return SearchResponse{ + Query: req.Query, + NumberOfResults: 0, + Results: []MainResult{}, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: []string{}, + UnresponsiveEngines: [][2]string{}, + }, nil + } + + merged := MergeResponses(responses) + if merged.Query == "" { + merged.Query = req.Query + } + return merged, nil +} + +func shouldFallbackToUpstream(engineName string, r contracts.SearchResponse) bool { + if engineName != "qwant" { + return false + } + return len(r.Results) == 0 && len(r.Answers) == 0 && len(r.Infoboxes) == 0 +} + diff --git a/internal/search/types.go b/internal/search/types.go new file mode 100644 index 0000000..ff5a406 --- /dev/null +++ b/internal/search/types.go @@ -0,0 +1,20 @@ +package search + +import "github.com/ashie/gosearch/internal/contracts" + +// Re-export the JSON contract types so the rest of the code can stay in the +// `internal/search` namespace without creating an import cycle. 
+type OutputFormat = contracts.OutputFormat + +const ( + FormatHTML = contracts.FormatHTML // accepted for compatibility (not yet implemented) + FormatJSON = contracts.FormatJSON + FormatCSV = contracts.FormatCSV + FormatRSS = contracts.FormatRSS +) + +type SearchRequest = contracts.SearchRequest +type SearchResponse = contracts.SearchResponse + +type MainResult = contracts.MainResult + diff --git a/internal/upstream/client.go b/internal/upstream/client.go new file mode 100644 index 0000000..54c6685 --- /dev/null +++ b/internal/upstream/client.go @@ -0,0 +1,112 @@ +package upstream + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "time" + + "github.com/ashie/gosearch/internal/contracts" +) + +type Client struct { + baseURL string + http *http.Client +} + +func NewClient(baseURL string, timeout time.Duration) (*Client, error) { + if strings.TrimSpace(baseURL) == "" { + return nil, errors.New("upstream base URL is empty") + } + + u, err := url.Parse(baseURL) + if err != nil { + return nil, fmt.Errorf("invalid upstream base URL: %w", err) + } + // Normalize: trim trailing slash to make URL concatenation predictable. + base := strings.TrimRight(u.String(), "/") + + if timeout <= 0 { + timeout = 10 * time.Second + } + + return &Client{ + baseURL: base, + http: &http.Client{ + Timeout: timeout, + }, + }, nil +} + +func (c *Client) SearchJSON(ctx context.Context, req contracts.SearchRequest, engines []string) (contracts.SearchResponse, error) { + // Always request upstream JSON; the Go service will handle csv/rss later. 
+ form := url.Values{} + form.Set("q", req.Query) + form.Set("format", "json") + form.Set("pageno", fmt.Sprintf("%d", req.Pageno)) + form.Set("safesearch", fmt.Sprintf("%d", req.Safesearch)) + form.Set("language", req.Language) + + if req.TimeRange != nil { + form.Set("time_range", *req.TimeRange) + } + if req.TimeoutLimit != nil { + form.Set("timeout_limit", formatFloat(*req.TimeoutLimit)) + } + if len(req.Categories) > 0 { + form.Set("categories", strings.Join(req.Categories, ",")) + } + + if len(engines) > 0 { + form.Set("engines", strings.Join(engines, ",")) + } + + for engineName, kv := range req.EngineData { + for key, value := range kv { + // Mirror SearXNG's naming: `engine_data--=` + form.Set(fmt.Sprintf("engine_data-%s-%s", engineName, key), value) + } + } + + endpoint := c.baseURL + "/search" + httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, strings.NewReader(form.Encode())) + if err != nil { + return contracts.SearchResponse{}, err + } + httpReq.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=utf-8") + + resp, err := c.http.Do(httpReq) + if err != nil { + return contracts.SearchResponse{}, err + } + defer resp.Body.Close() + + body, err := io.ReadAll(io.LimitReader(resp.Body, 4*1024*1024)) + if err != nil { + return contracts.SearchResponse{}, err + } + + if resp.StatusCode != http.StatusOK { + return contracts.SearchResponse{}, fmt.Errorf("upstream search failed: status=%d body=%q", resp.StatusCode, string(body)) + } + + // Decode upstream JSON into our contract types. + var out contracts.SearchResponse + dec := json.NewDecoder(strings.NewReader(string(body))) + if err := dec.Decode(&out); err != nil { + return contracts.SearchResponse{}, fmt.Errorf("decode upstream JSON: %w", err) + } + + return out, nil +} + +func formatFloat(f float64) string { + // Keep stable formatting for upstream parsing. + return strings.TrimRight(strings.TrimRight(fmt.Sprintf("%.6f", f), "0"), ".") +} +