refactor: clean up verbose and redundant comments
Trim or remove comments that:
- State the obvious (function names already convey purpose)
- Repeat what the code clearly shows
- Are excessively long without adding value

Keep comments that explain *why*, not *what*.
This commit is contained in:
parent
805e7ffdc2
commit
5b942a5fd6
11 changed files with 16 additions and 102 deletions
|
|
@ -27,8 +27,7 @@ import (
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Service fetches search suggestions from an upstream metasearch instance
|
// Service fetches search suggestions from upstream or Wikipedia OpenSearch.
|
||||||
// or falls back to Wikipedia's OpenSearch API.
|
|
||||||
type Service struct {
|
type Service struct {
|
||||||
upstreamURL string
|
upstreamURL string
|
||||||
http *http.Client
|
http *http.Client
|
||||||
|
|
@ -44,7 +43,6 @@ func NewService(upstreamURL string, timeout time.Duration) *Service {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Suggestions returns search suggestions for the given query.
|
|
||||||
func (s *Service) Suggestions(ctx context.Context, query string) ([]string, error) {
|
func (s *Service) Suggestions(ctx context.Context, query string) ([]string, error) {
|
||||||
if strings.TrimSpace(query) == "" {
|
if strings.TrimSpace(query) == "" {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
|
|
@ -56,7 +54,6 @@ func (s *Service) Suggestions(ctx context.Context, query string) ([]string, erro
|
||||||
return s.wikipediaSuggestions(ctx, query)
|
return s.wikipediaSuggestions(ctx, query)
|
||||||
}
|
}
|
||||||
|
|
||||||
// upstreamSuggestions proxies to an upstream /autocompleter endpoint.
|
|
||||||
func (s *Service) upstreamSuggestions(ctx context.Context, query string) ([]string, error) {
|
func (s *Service) upstreamSuggestions(ctx context.Context, query string) ([]string, error) {
|
||||||
u := s.upstreamURL + "/autocompleter?" + url.Values{"q": {query}}.Encode()
|
u := s.upstreamURL + "/autocompleter?" + url.Values{"q": {query}}.Encode()
|
||||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
|
||||||
|
|
|
||||||
|
|
@ -22,14 +22,10 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
// MainResult represents one element of the `results` array.
|
// MainResult represents one element of the `results` array.
|
||||||
//
|
// Unknown keys are preserved in `raw` and re-emitted via MarshalJSON.
|
||||||
// The API returns many additional keys beyond what templates use. To keep the
|
|
||||||
// contract stable for proxying/merging, we preserve all unknown keys in
|
|
||||||
// `raw` and re-emit them via MarshalJSON.
|
|
||||||
type MainResult struct {
|
type MainResult struct {
|
||||||
raw map[string]any
|
raw map[string]any
|
||||||
|
|
||||||
// Common fields used by templates (RSS uses: title, url, content, pubdate).
|
|
||||||
Template string `json:"template"`
|
Template string `json:"template"`
|
||||||
Title string `json:"title"`
|
Title string `json:"title"`
|
||||||
Content string `json:"content"`
|
Content string `json:"content"`
|
||||||
|
|
@ -45,17 +41,13 @@ type MainResult struct {
|
||||||
Positions []int `json:"positions"`
|
Positions []int `json:"positions"`
|
||||||
Engines []string `json:"engines"`
|
Engines []string `json:"engines"`
|
||||||
|
|
||||||
// These fields exist in the MainResult base; keep them so downstream
|
|
||||||
// callers can generate richer output later.
|
|
||||||
OpenGroup bool `json:"open_group"`
|
OpenGroup bool `json:"open_group"`
|
||||||
CloseGroup bool `json:"close_group"`
|
CloseGroup bool `json:"close_group"`
|
||||||
|
|
||||||
// parsed_url is emitted as a tuple; we preserve it as-is.
|
|
||||||
ParsedURL any `json:"parsed_url"`
|
ParsedURL any `json:"parsed_url"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (mr *MainResult) UnmarshalJSON(data []byte) error {
|
func (mr *MainResult) UnmarshalJSON(data []byte) error {
|
||||||
// Preserve the full object.
|
|
||||||
dec := json.NewDecoder(bytes.NewReader(data))
|
dec := json.NewDecoder(bytes.NewReader(data))
|
||||||
dec.UseNumber()
|
dec.UseNumber()
|
||||||
|
|
||||||
|
|
@ -66,7 +58,6 @@ func (mr *MainResult) UnmarshalJSON(data []byte) error {
|
||||||
|
|
||||||
mr.raw = m
|
mr.raw = m
|
||||||
|
|
||||||
// Fill the typed/common fields (best-effort; don't fail if types differ).
|
|
||||||
mr.Template = stringOrEmpty(m["template"])
|
mr.Template = stringOrEmpty(m["template"])
|
||||||
mr.Title = stringOrEmpty(m["title"])
|
mr.Title = stringOrEmpty(m["title"])
|
||||||
mr.Content = stringOrEmpty(m["content"])
|
mr.Content = stringOrEmpty(m["content"])
|
||||||
|
|
@ -104,12 +95,10 @@ func (mr *MainResult) UnmarshalJSON(data []byte) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (mr MainResult) MarshalJSON() ([]byte, error) {
|
func (mr MainResult) MarshalJSON() ([]byte, error) {
|
||||||
// If we came from upstream JSON, preserve all keys exactly.
|
|
||||||
if mr.raw != nil {
|
if mr.raw != nil {
|
||||||
return json.Marshal(mr.raw)
|
return json.Marshal(mr.raw)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Otherwise, marshal the known fields.
|
|
||||||
m := map[string]any{
|
m := map[string]any{
|
||||||
"template": mr.Template,
|
"template": mr.Template,
|
||||||
"title": mr.Title,
|
"title": mr.Title,
|
||||||
|
|
|
||||||
|
|
@ -20,18 +20,15 @@ package contracts
|
||||||
type OutputFormat string
|
type OutputFormat string
|
||||||
|
|
||||||
const (
|
const (
|
||||||
FormatHTML OutputFormat = "html" // accepted for compatibility (not yet implemented)
|
FormatHTML OutputFormat = "html" // accepted for compatibility
|
||||||
FormatJSON OutputFormat = "json"
|
FormatJSON OutputFormat = "json"
|
||||||
FormatCSV OutputFormat = "csv"
|
FormatCSV OutputFormat = "csv"
|
||||||
FormatRSS OutputFormat = "rss"
|
FormatRSS OutputFormat = "rss"
|
||||||
)
|
)
|
||||||
|
|
||||||
type SearchRequest struct {
|
type SearchRequest struct {
|
||||||
// Format is what the client requested via `format=...`.
|
Format OutputFormat
|
||||||
Format OutputFormat
|
Query string
|
||||||
|
|
||||||
Query string
|
|
||||||
|
|
||||||
Pageno int
|
Pageno int
|
||||||
Safesearch int
|
Safesearch int
|
||||||
TimeRange *string
|
TimeRange *string
|
||||||
|
|
@ -39,16 +36,14 @@ type SearchRequest struct {
|
||||||
TimeoutLimit *float64
|
TimeoutLimit *float64
|
||||||
Language string
|
Language string
|
||||||
|
|
||||||
// Engines and categories are used for deciding which engines run locally vs are proxied.
|
// Engines and categories decide which engines run locally vs proxy to upstream.
|
||||||
// For now, engines can be supplied directly via the `engines` form parameter.
|
|
||||||
Engines []string
|
Engines []string
|
||||||
Categories []string
|
Categories []string
|
||||||
|
|
||||||
// EngineData matches the `engine_data-<engine>-<key>=<value>` parameters.
|
// EngineData matches the `engine_data-<engine>-<key>=<value>` parameters.
|
||||||
EngineData map[string]map[string]string
|
EngineData map[string]map[string]string
|
||||||
|
|
||||||
// AccessToken is an optional request token used to gate paid/limited engines.
|
// AccessToken gates paid/limited engines. Not part of upstream JSON schema.
|
||||||
// It is not part of the upstream JSON schema; it only influences local engines.
|
|
||||||
AccessToken string
|
AccessToken string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -30,13 +30,9 @@ import (
|
||||||
"github.com/metamorphosis-dev/kafka/internal/contracts"
|
"github.com/metamorphosis-dev/kafka/internal/contracts"
|
||||||
)
|
)
|
||||||
|
|
||||||
// BraveEngine implements the `braveapi` engine (Brave Web Search API).
|
// BraveEngine implements the Brave Web Search API.
|
||||||
//
|
// Required: BRAVE_API_KEY env var or config.
|
||||||
// Config / gating:
|
// Optional: BRAVE_ACCESS_TOKEN to gate requests.
|
||||||
// - BRAVE_API_KEY: required to call Brave
|
|
||||||
// - BRAVE_ACCESS_TOKEN (optional): if set, the request must include a token
|
|
||||||
// that matches the env var (via Authorization Bearer, X-Search-Token,
|
|
||||||
// X-Brave-Access-Token, or form field `token`).
|
|
||||||
type BraveEngine struct {
|
type BraveEngine struct {
|
||||||
client *http.Client
|
client *http.Client
|
||||||
apiKey string
|
apiKey string
|
||||||
|
|
@ -51,8 +47,6 @@ func (e *BraveEngine) Search(ctx context.Context, req contracts.SearchRequest) (
|
||||||
return contracts.SearchResponse{}, errors.New("brave engine not initialized")
|
return contracts.SearchResponse{}, errors.New("brave engine not initialized")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Gate / config checks should not be treated as fatal errors; the reference
|
|
||||||
// implementation treats misconfigured engines as unresponsive.
|
|
||||||
if strings.TrimSpace(e.apiKey) == "" {
|
if strings.TrimSpace(e.apiKey) == "" {
|
||||||
return contracts.SearchResponse{
|
return contracts.SearchResponse{
|
||||||
Query: req.Query,
|
Query: req.Query,
|
||||||
|
|
@ -109,8 +103,6 @@ func (e *BraveEngine) Search(ctx context.Context, req contracts.SearchRequest) (
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// The reference implementation checks `if params["safesearch"]:` which treats any
|
|
||||||
// non-zero (moderate/strict) as strict.
|
|
||||||
if req.Safesearch > 0 {
|
if req.Safesearch > 0 {
|
||||||
args.Set("safesearch", "strict")
|
args.Set("safesearch", "strict")
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -24,9 +24,8 @@ import (
|
||||||
"github.com/metamorphosis-dev/kafka/internal/config"
|
"github.com/metamorphosis-dev/kafka/internal/config"
|
||||||
)
|
)
|
||||||
|
|
||||||
// NewDefaultPortedEngines returns the starter set of Go-native engines.
|
// NewDefaultPortedEngines returns the Go-native engine registry.
|
||||||
// The service can swap/extend this registry later as more engines are ported.
|
// If cfg is nil, API keys fall back to environment variables.
|
||||||
// If cfg is nil, falls back to reading API keys from environment variables.
|
|
||||||
func NewDefaultPortedEngines(client *http.Client, cfg *config.Config) map[string]Engine {
|
func NewDefaultPortedEngines(client *http.Client, cfg *config.Config) map[string]Engine {
|
||||||
if client == nil {
|
if client == nil {
|
||||||
client = &http.Client{Timeout: 10 * time.Second}
|
client = &http.Client{Timeout: 10 * time.Second}
|
||||||
|
|
|
||||||
|
|
@ -57,7 +57,6 @@ func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest)
|
||||||
start := (req.Pageno - 1) * 10
|
start := (req.Pageno - 1) * 10
|
||||||
query := url.QueryEscape(req.Query)
|
query := url.QueryEscape(req.Query)
|
||||||
|
|
||||||
// Build URL like SearXNG does.
|
|
||||||
u := fmt.Sprintf(
|
u := fmt.Sprintf(
|
||||||
"https://www.google.com/search?q=%s&filter=0&start=%d&hl=%s&lr=%s&safe=%s",
|
"https://www.google.com/search?q=%s&filter=0&start=%d&hl=%s&lr=%s&safe=%s",
|
||||||
query,
|
query,
|
||||||
|
|
@ -118,7 +117,6 @@ func (e *GoogleEngine) Search(ctx context.Context, req contracts.SearchRequest)
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// detectGoogleSorry returns true if the response is a Google block/CAPTCHA page.
|
|
||||||
func detectGoogleSorry(resp *http.Response) bool {
|
func detectGoogleSorry(resp *http.Response) bool {
|
||||||
if resp.Request != nil {
|
if resp.Request != nil {
|
||||||
if resp.Request.URL.Host == "sorry.google.com" || strings.HasPrefix(resp.Request.URL.Path, "/sorry") {
|
if resp.Request.URL.Host == "sorry.google.com" || strings.HasPrefix(resp.Request.URL.Path, "/sorry") {
|
||||||
|
|
@ -128,16 +126,9 @@ func detectGoogleSorry(resp *http.Response) bool {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseGoogleResults extracts search results from Google's HTML.
|
|
||||||
// Uses the same selectors as SearXNG: div.MjjYud for result containers.
|
|
||||||
func parseGoogleResults(body, query string) []contracts.MainResult {
|
func parseGoogleResults(body, query string) []contracts.MainResult {
|
||||||
var results []contracts.MainResult
|
var results []contracts.MainResult
|
||||||
|
|
||||||
// SearXNG selector: .//div[contains(@class, "MjjYud")]
|
|
||||||
// Each result block contains a title link and snippet.
|
|
||||||
// We simulate the XPath matching with regex-based extraction.
|
|
||||||
|
|
||||||
// Find all MjjYud div blocks.
|
|
||||||
mjjPattern := regexp.MustCompile(`<div[^>]*class="[^"]*MjjYud[^"]*"[^>]*>(.*?)</div>\s*(?=<div[^>]*class="[^"]*MjjYud|$)`)
|
mjjPattern := regexp.MustCompile(`<div[^>]*class="[^"]*MjjYud[^"]*"[^>]*>(.*?)</div>\s*(?=<div[^>]*class="[^"]*MjjYud|$)`)
|
||||||
matches := mjjPattern.FindAllStringSubmatch(body, -1)
|
matches := mjjPattern.FindAllStringSubmatch(body, -1)
|
||||||
|
|
||||||
|
|
@ -147,15 +138,12 @@ func parseGoogleResults(body, query string) []contracts.MainResult {
|
||||||
}
|
}
|
||||||
block := match[1]
|
block := match[1]
|
||||||
|
|
||||||
// Extract title and URL from the result link.
|
|
||||||
// Pattern: <a href="/url?q=ACTUAL_URL&sa=..." ...>TITLE</a>
|
|
||||||
urlPattern := regexp.MustCompile(`<a[^>]+href="(/url\?q=[^"&]+)`)
|
urlPattern := regexp.MustCompile(`<a[^>]+href="(/url\?q=[^"&]+)`)
|
||||||
urlMatch := urlPattern.FindStringSubmatch(block)
|
urlMatch := urlPattern.FindStringSubmatch(block)
|
||||||
if len(urlMatch) < 2 {
|
if len(urlMatch) < 2 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
rawURL := urlMatch[1]
|
rawURL := urlMatch[1]
|
||||||
// Remove /url?q= prefix and decode.
|
|
||||||
actualURL := strings.TrimPrefix(rawURL, "/url?q=")
|
actualURL := strings.TrimPrefix(rawURL, "/url?q=")
|
||||||
if amp := strings.Index(actualURL, "&"); amp != -1 {
|
if amp := strings.Index(actualURL, "&"); amp != -1 {
|
||||||
actualURL = actualURL[:amp]
|
actualURL = actualURL[:amp]
|
||||||
|
|
@ -168,14 +156,12 @@ func parseGoogleResults(body, query string) []contracts.MainResult {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract title from the title tag.
|
|
||||||
titlePattern := regexp.MustCompile(`<span[^>]*class="[^"]*qrStP[^"]*"[^>]*>([^<]+)</span>`)
|
titlePattern := regexp.MustCompile(`<span[^>]*class="[^"]*qrStP[^"]*"[^>]*>([^<]+)</span>`)
|
||||||
titleMatch := titlePattern.FindStringSubmatch(block)
|
titleMatch := titlePattern.FindStringSubmatch(block)
|
||||||
title := query
|
title := query
|
||||||
if len(titleMatch) >= 2 {
|
if len(titleMatch) >= 2 {
|
||||||
title = stripTags(titleMatch[1])
|
title = stripTags(titleMatch[1])
|
||||||
} else {
|
} else {
|
||||||
// Fallback: extract visible text from an <a> with data-title or role="link"
|
|
||||||
linkTitlePattern := regexp.MustCompile(`<a[^>]+role="link"[^>]*>([^<]+)<`)
|
linkTitlePattern := regexp.MustCompile(`<a[^>]+role="link"[^>]*>([^<]+)<`)
|
||||||
ltMatch := linkTitlePattern.FindStringSubmatch(block)
|
ltMatch := linkTitlePattern.FindStringSubmatch(block)
|
||||||
if len(ltMatch) >= 2 {
|
if len(ltMatch) >= 2 {
|
||||||
|
|
@ -183,7 +169,6 @@ func parseGoogleResults(body, query string) []contracts.MainResult {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract snippet from data-sncf divs (SearXNG's approach).
|
|
||||||
snippet := extractGoogleSnippet(block)
|
snippet := extractGoogleSnippet(block)
|
||||||
|
|
||||||
urlPtr := actualURL
|
urlPtr := actualURL
|
||||||
|
|
@ -202,10 +187,7 @@ func parseGoogleResults(body, query string) []contracts.MainResult {
|
||||||
return results
|
return results
|
||||||
}
|
}
|
||||||
|
|
||||||
// extractGoogleSnippet extracts the snippet text from a Google result block.
|
|
||||||
func extractGoogleSnippet(block string) string {
|
func extractGoogleSnippet(block string) string {
|
||||||
// Google's snippets live in divs with data-sncf attribute.
|
|
||||||
// SearXNG looks for: .//div[contains(@data-sncf, "1")]
|
|
||||||
snippetPattern := regexp.MustCompile(`<div[^>]+data-sncf="1"[^>]*>(.*?)</div>`)
|
snippetPattern := regexp.MustCompile(`<div[^>]+data-sncf="1"[^>]*>(.*?)</div>`)
|
||||||
matches := snippetPattern.FindAllStringSubmatch(block, -1)
|
matches := snippetPattern.FindAllStringSubmatch(block, -1)
|
||||||
var parts []string
|
var parts []string
|
||||||
|
|
@ -221,10 +203,8 @@ func extractGoogleSnippet(block string) string {
|
||||||
return strings.Join(parts, " ")
|
return strings.Join(parts, " ")
|
||||||
}
|
}
|
||||||
|
|
||||||
// extractGoogleSuggestions extracts search suggestions from Google result cards.
|
|
||||||
func extractGoogleSuggestions(body string) []string {
|
func extractGoogleSuggestions(body string) []string {
|
||||||
var suggestions []string
|
var suggestions []string
|
||||||
// SearXNG xpath: //div[contains(@class, "ouy7Mc")]//a
|
|
||||||
suggestionPattern := regexp.MustCompile(`(?s)<div[^>]*class="[^"]*ouy7Mc[^"]*"[^>]*>.*?<a[^>]*>([^<]+)</a>`)
|
suggestionPattern := regexp.MustCompile(`(?s)<div[^>]*class="[^"]*ouy7Mc[^"]*"[^>]*>.*?<a[^>]*>([^<]+)</a>`)
|
||||||
matches := suggestionPattern.FindAllStringSubmatch(body, -1)
|
matches := suggestionPattern.FindAllStringSubmatch(body, -1)
|
||||||
seen := map[string]bool{}
|
seen := map[string]bool{}
|
||||||
|
|
@ -241,8 +221,6 @@ func extractGoogleSuggestions(body string) []string {
|
||||||
return suggestions
|
return suggestions
|
||||||
}
|
}
|
||||||
|
|
||||||
// googleHL maps SearXNG locale to Google hl (host language) parameter.
|
|
||||||
// e.g. "en-US" -> "en-US"
|
|
||||||
func googleHL(lang string) string {
|
func googleHL(lang string) string {
|
||||||
lang = strings.ToLower(strings.TrimSpace(lang))
|
lang = strings.ToLower(strings.TrimSpace(lang))
|
||||||
if lang == "" || lang == "auto" {
|
if lang == "" || lang == "auto" {
|
||||||
|
|
@ -251,8 +229,6 @@ func googleHL(lang string) string {
|
||||||
return lang
|
return lang
|
||||||
}
|
}
|
||||||
|
|
||||||
// googleUILanguage maps SearXNG language to Google lr (language restrict) parameter.
|
|
||||||
// e.g. "en" -> "lang_en", "de" -> "lang_de"
|
|
||||||
func googleUILanguage(lang string) string {
|
func googleUILanguage(lang string) string {
|
||||||
lang = strings.ToLower(strings.Split(lang, "-")[0])
|
lang = strings.ToLower(strings.Split(lang, "-")[0])
|
||||||
if lang == "" || lang == "auto" {
|
if lang == "" || lang == "auto" {
|
||||||
|
|
@ -261,7 +237,6 @@ func googleUILanguage(lang string) string {
|
||||||
return "lang_" + lang
|
return "lang_" + lang
|
||||||
}
|
}
|
||||||
|
|
||||||
// googleSafeSearchLevel maps safesearch (0-2) to Google's safe parameter.
|
|
||||||
func googleSafeSearchLevel(safesearch int) string {
|
func googleSafeSearchLevel(safesearch int) string {
|
||||||
switch safesearch {
|
switch safesearch {
|
||||||
case 0:
|
case 0:
|
||||||
|
|
@ -275,7 +250,6 @@ func googleSafeSearchLevel(safesearch int) string {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// stripTags removes HTML tags from a string.
|
|
||||||
func stripTags(s string) string {
|
func stripTags(s string) string {
|
||||||
stripper := regexp.MustCompile(`<[^>]*>`)
|
stripper := regexp.MustCompile(`<[^>]*>`)
|
||||||
s = stripper.ReplaceAllString(s, "")
|
s = stripper.ReplaceAllString(s, "")
|
||||||
|
|
|
||||||
|
|
@ -95,9 +95,6 @@ func (p *Planner) Plan(req contracts.SearchRequest) (localEngines, upstreamEngin
|
||||||
}
|
}
|
||||||
|
|
||||||
func inferFromCategories(categories []string) []string {
|
func inferFromCategories(categories []string) []string {
|
||||||
// Minimal mapping for the initial porting subset.
|
|
||||||
// This mirrors the idea of selecting from engine categories without
|
|
||||||
// embedding the whole engine registry.
|
|
||||||
set := map[string]bool{}
|
set := map[string]bool{}
|
||||||
for _, c := range categories {
|
for _, c := range categories {
|
||||||
switch strings.TrimSpace(strings.ToLower(c)) {
|
switch strings.TrimSpace(strings.ToLower(c)) {
|
||||||
|
|
@ -131,7 +128,6 @@ func inferFromCategories(categories []string) []string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func sortByOrder(list []string, order map[string]int) {
|
func sortByOrder(list []string, order map[string]int) {
|
||||||
// simple insertion sort (list is tiny)
|
|
||||||
for i := 1; i < len(list); i++ {
|
for i := 1; i < len(list); i++ {
|
||||||
j := i
|
j := i
|
||||||
for j > 0 && order[list[j-1]] > order[list[j]] {
|
for j > 0 && order[list[j-1]] > order[list[j]] {
|
||||||
|
|
|
||||||
|
|
@ -30,11 +30,7 @@ import (
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
)
|
)
|
||||||
|
|
||||||
// QwantEngine implements a `qwant` (web) adapter using
|
// QwantEngine implements the Qwant v3 API (web and web-lite modes).
|
||||||
// Qwant v3 endpoint: https://api.qwant.com/v3/search/web.
|
|
||||||
//
|
|
||||||
// Qwant's API is not fully documented; this implements parsing logic
|
|
||||||
// for the `web` category.
|
|
||||||
type QwantEngine struct {
|
type QwantEngine struct {
|
||||||
client *http.Client
|
client *http.Client
|
||||||
category string // "web" (JSON API) or "web-lite" (HTML fallback)
|
category string // "web" (JSON API) or "web-lite" (HTML fallback)
|
||||||
|
|
@ -53,8 +49,6 @@ func (e *QwantEngine) Search(ctx context.Context, req contracts.SearchRequest) (
|
||||||
return contracts.SearchResponse{Query: req.Query}, nil
|
return contracts.SearchResponse{Query: req.Query}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// For API parity we use web defaults: count=10, offset=(pageno-1)*count.
|
|
||||||
// The engine's config field exists so we can expand to news/images/videos later.
|
|
||||||
count := e.resultsPerPage
|
count := e.resultsPerPage
|
||||||
if count <= 0 {
|
if count <= 0 {
|
||||||
count = 10
|
count = 10
|
||||||
|
|
@ -271,9 +265,7 @@ func (e *QwantEngine) searchWebLite(ctx context.Context, req contracts.SearchReq
|
||||||
results := make([]contracts.MainResult, 0)
|
results := make([]contracts.MainResult, 0)
|
||||||
seen := map[string]bool{}
|
seen := map[string]bool{}
|
||||||
|
|
||||||
// Pattern 1: legacy/known qwant-lite structure.
|
|
||||||
doc.Find("section article").Each(func(_ int, item *goquery.Selection) {
|
doc.Find("section article").Each(func(_ int, item *goquery.Selection) {
|
||||||
// ignore randomly interspersed advertising adds
|
|
||||||
if item.Find("span.tooltip").Length() > 0 {
|
if item.Find("span.tooltip").Length() > 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
@ -307,19 +299,14 @@ func (e *QwantEngine) searchWebLite(ctx context.Context, req contracts.SearchReq
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
// Pattern 2: broader fallback for updated lite markup:
|
|
||||||
// any article/list item/div block containing an external anchor.
|
|
||||||
// We keep this conservative by requiring non-empty title + URL.
|
|
||||||
doc.Find("article, li, div").Each(func(_ int, item *goquery.Selection) {
|
doc.Find("article, li, div").Each(func(_ int, item *goquery.Selection) {
|
||||||
if len(results) >= 20 {
|
if len(results) >= 20 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// Skip ad-like blocks in fallback pass too.
|
|
||||||
if item.Find("span.tooltip").Length() > 0 {
|
if item.Find("span.tooltip").Length() > 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip obvious nav/footer blocks.
|
|
||||||
classAttr, _ := item.Attr("class")
|
classAttr, _ := item.Attr("class")
|
||||||
classLower := strings.ToLower(classAttr)
|
classLower := strings.ToLower(classAttr)
|
||||||
if strings.Contains(classLower, "nav") || strings.Contains(classLower, "footer") {
|
if strings.Contains(classLower, "nav") || strings.Contains(classLower, "footer") {
|
||||||
|
|
@ -368,13 +355,10 @@ func (e *QwantEngine) searchWebLite(ctx context.Context, req contracts.SearchReq
|
||||||
}
|
}
|
||||||
seen[href] = true
|
seen[href] = true
|
||||||
|
|
||||||
// Best-effort snippet extraction from nearby paragraph/span text.
|
|
||||||
content := strings.TrimSpace(item.Find("p").First().Text())
|
content := strings.TrimSpace(item.Find("p").First().Text())
|
||||||
if content == "" {
|
if content == "" {
|
||||||
content = strings.TrimSpace(item.Find("span").First().Text())
|
content = strings.TrimSpace(item.Find("span").First().Text())
|
||||||
}
|
}
|
||||||
// If there is no snippet, still keep clearly external result links.
|
|
||||||
// Qwant-lite frequently omits rich snippets for some entries.
|
|
||||||
|
|
||||||
u := href
|
u := href
|
||||||
results = append(results, contracts.MainResult{
|
results = append(results, contracts.MainResult{
|
||||||
|
|
|
||||||
|
|
@ -27,19 +27,12 @@ import (
|
||||||
"log/slog"
|
"log/slog"
|
||||||
)
|
)
|
||||||
|
|
||||||
// RateLimitConfig controls per-IP rate limiting using a sliding window counter.
|
|
||||||
type RateLimitConfig struct {
|
type RateLimitConfig struct {
|
||||||
// Requests is the max number of requests allowed per window.
|
Requests int
|
||||||
Requests int
|
Window time.Duration
|
||||||
// Window is the time window duration (e.g. "1m").
|
|
||||||
Window time.Duration
|
|
||||||
// CleanupInterval is how often stale entries are purged (default: 5m).
|
|
||||||
CleanupInterval time.Duration
|
CleanupInterval time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
// RateLimit returns a middleware that limits requests per IP address.
|
|
||||||
// Uses an in-memory sliding window counter. When the limit is exceeded,
|
|
||||||
// responds with HTTP 429 and a Retry-After header.
|
|
||||||
func RateLimit(cfg RateLimitConfig, logger *slog.Logger) func(http.Handler) http.Handler {
|
func RateLimit(cfg RateLimitConfig, logger *slog.Logger) func(http.Handler) http.Handler {
|
||||||
requests := cfg.Requests
|
requests := cfg.Requests
|
||||||
if requests <= 0 {
|
if requests <= 0 {
|
||||||
|
|
|
||||||
|
|
@ -25,11 +25,6 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
// MergeResponses merges multiple compatible JSON responses.
|
// MergeResponses merges multiple compatible JSON responses.
|
||||||
//
|
|
||||||
// MVP merge semantics:
|
|
||||||
// - results are concatenated with a simple de-dup key (engine|title|url)
|
|
||||||
// - suggestions/corrections are de-duplicated as sets
|
|
||||||
// - answers/infoboxes/unresponsive_engines are concatenated (best-effort)
|
|
||||||
func MergeResponses(responses []contracts.SearchResponse) contracts.SearchResponse {
|
func MergeResponses(responses []contracts.SearchResponse) contracts.SearchResponse {
|
||||||
var merged contracts.SearchResponse
|
var merged contracts.SearchResponse
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -23,7 +23,7 @@ import "github.com/metamorphosis-dev/kafka/internal/contracts"
|
||||||
type OutputFormat = contracts.OutputFormat
|
type OutputFormat = contracts.OutputFormat
|
||||||
|
|
||||||
const (
|
const (
|
||||||
FormatHTML = contracts.FormatHTML // accepted for compatibility (not yet implemented)
|
FormatHTML = contracts.FormatHTML // accepted for compatibility
|
||||||
FormatJSON = contracts.FormatJSON
|
FormatJSON = contracts.FormatJSON
|
||||||
FormatCSV = contracts.FormatCSV
|
FormatCSV = contracts.FormatCSV
|
||||||
FormatRSS = contracts.FormatRSS
|
FormatRSS = contracts.FormatRSS
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue