kafka/internal/engines/arxiv.go
Franz Kafka 7be03b4017 license: change from MIT to AGPLv3
Update LICENSE file and add AGPL header to all source files.

AGPLv3 ensures that if someone runs Kafka as a network service and
modifies it, they must release their source code under the same license.
2026-03-22 08:27:23 +00:00

207 lines
4.9 KiB
Go

// kafka — a privacy-respecting metasearch engine
// Copyright (C) 2026-present metamorphosis-dev
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package engines
import (
"bytes"
"context"
"encoding/xml"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
"github.com/metamorphosis-dev/kafka/internal/contracts"
)
const (
// arxivSearchPrefix is the field prefix joined to the user query with a
// colon to build the search_query parameter (e.g. "all:<query>").
// NOTE(review): "all" presumably selects every searchable field in the
// arXiv API — confirm against the arXiv API manual.
arxivSearchPrefix = "all"
// arxivMaxResults is the page size: it is sent as max_results and is also
// the multiplier used to turn a page number into the start offset.
arxivMaxResults = 10
)
// ArxivEngine is a search engine that queries the arXiv export API
// (https://export.arxiv.org/api/query) over HTTP.
type ArxivEngine struct {
// client performs the outgoing HTTP requests. Search returns an error if
// it is nil.
client *http.Client
}
// Name returns the identifier of this engine, which is always "arxiv".
func (e *ArxivEngine) Name() string {
	const engineName = "arxiv"
	return engineName
}
// Search queries the arXiv export API for req.Query and returns the parsed
// results.
//
// The query is trimmed of surrounding whitespace; a blank query yields an
// empty response (echoing req.Query) without contacting arXiv. req.Pageno
// selects the page: the start offset is (Pageno-1)*arxivMaxResults, clamped
// to zero. The request is bound to ctx.
//
// An error is returned when the engine or its client is nil, when the request
// cannot be built or sent, when arXiv answers with a non-2xx status, when the
// body cannot be read or is larger than the size cap, or when parseArxivAtom
// fails. Underlying errors are wrapped with %w so callers can still inspect
// them with errors.Is / errors.As.
func (e *ArxivEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
	// Upper bound on how much of a successful response is buffered in memory.
	// A page of arxivMaxResults entries is far smaller; the cap only protects
	// against a misbehaving upstream.
	const maxFeedBytes = 8 << 20

	if e == nil || e.client == nil {
		return contracts.SearchResponse{}, errors.New("arxiv engine not initialized")
	}

	q := strings.TrimSpace(req.Query)
	if q == "" {
		return contracts.SearchResponse{Query: req.Query}, nil
	}

	start := (req.Pageno - 1) * arxivMaxResults
	if start < 0 {
		start = 0
	}

	args := url.Values{}
	args.Set("search_query", fmt.Sprintf("%s:%s", arxivSearchPrefix, q))
	args.Set("start", fmt.Sprintf("%d", start))
	args.Set("max_results", fmt.Sprintf("%d", arxivMaxResults))
	endpoint := "https://export.arxiv.org/api/query?" + args.Encode()

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("arxiv: building request: %w", err)
	}

	resp, err := e.client.Do(httpReq)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("arxiv: sending request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Best effort: the body snippet is only diagnostic context, so a read
		// failure here is deliberately ignored.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
		return contracts.SearchResponse{}, fmt.Errorf("arxiv upstream error: status=%d body=%q", resp.StatusCode, string(body))
	}

	// Read one byte past the cap so an oversized body can be told apart from
	// one that is exactly maxFeedBytes long.
	raw, err := io.ReadAll(io.LimitReader(resp.Body, maxFeedBytes+1))
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("arxiv: reading response: %w", err)
	}
	if len(raw) > maxFeedBytes {
		return contracts.SearchResponse{}, fmt.Errorf("arxiv: response larger than %d bytes", maxFeedBytes)
	}

	results, err := parseArxivAtom(raw)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("arxiv: parsing feed: %w", err)
	}

	// The slice and map fields are set to empty, non-nil values rather than
	// left nil.
	return contracts.SearchResponse{
		Query:               req.Query,
		NumberOfResults:     len(results),
		Results:             results,
		Answers:             []map[string]any{},
		Corrections:         []string{},
		Infoboxes:           []map[string]any{},
		Suggestions:         []string{},
		UnresponsiveEngines: [][2]string{},
	}, nil
}
// arxivEntry holds the trimmed text of the child elements that
// parseArxivAtom collects from a single <entry> of the feed.
type arxivEntry struct {
// Title is the text of the <title> element.
Title string
// ID is the text of the <id> element; it is used as the result URL.
ID string
// Summary is the text of the <summary> element; it is used as the result
// content.
Summary string
// Published is the raw text of the <published> element, converted later by
// parseArxivPublished.
Published string
}
// parseArxivAtom extracts search results from an arXiv Atom feed.
//
// The document is walked token by token. Inside each <entry> element the text
// of the <title>, <id>, <summary> and <published> children is captured
// (element names are matched case-insensitively and values are trimmed of
// surrounding whitespace). Elements with those names outside an <entry> are
// ignored, and entries that end up without a title or an id are dropped.
//
// A tokenizer error aborts parsing and is returned. A failure to decode one
// individual field is ignored and simply leaves that field unchanged. On
// success the returned slice is non-nil, even when it is empty.
func parseArxivAtom(xmlBytes []byte) ([]contracts.MainResult, error) {
	decoder := xml.NewDecoder(bytes.NewReader(xmlBytes))

	var (
		collected []arxivEntry
		open      *arxivEntry // entry being read; nil while outside <entry>
	)

	for {
		token, err := decoder.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, err
		}

		switch el := token.(type) {
		case xml.StartElement:
			tag := strings.ToLower(el.Name.Local)
			if tag == "entry" {
				open = &arxivEntry{}
				continue
			}
			if open == nil {
				continue
			}

			// Select the field this element feeds; other elements are skipped.
			var field *string
			switch tag {
			case "title":
				field = &open.Title
			case "id":
				field = &open.ID
			case "summary":
				field = &open.Summary
			case "published":
				field = &open.Published
			default:
				continue
			}

			var text string
			if decodeErr := decoder.DecodeElement(&text, &el); decodeErr == nil {
				*field = strings.TrimSpace(text)
			}

		case xml.EndElement:
			if open == nil || strings.ToLower(el.Name.Local) != "entry" {
				continue
			}
			if open.Title != "" && open.ID != "" {
				collected = append(collected, *open)
			}
			open = nil
		}
	}

	results := make([]contracts.MainResult, 0, len(collected))
	for i := range collected {
		entry := collected[i]
		// Fresh variable per iteration so every result gets its own URL pointer.
		link := entry.ID
		// Score and Positions are intentionally left at their zero values.
		results = append(results, contracts.MainResult{
			Template: "default.html",
			Title:    entry.Title,
			Content:  entry.Summary,
			URL:      &link,
			Pubdate:  parseArxivPublished(entry.Published),
			Engine:   "arxiv",
			Category: "science",
			Engines:  []string{"arxiv"},
		})
	}
	return results, nil
}
// parseArxivPublished converts an RFC 3339 timestamp (for example
// "2024-06-03T00:00:00Z") into the "2006-01-02 15:04:05-0700" layout.
//
// Surrounding whitespace is ignored. It returns nil when the input is blank
// or cannot be parsed as RFC 3339; otherwise it returns a pointer to the
// reformatted string.
func parseArxivPublished(s string) *string {
	const outputLayout = "2006-01-02 15:04:05-0700"

	trimmed := strings.TrimSpace(s)
	if len(trimmed) == 0 {
		return nil
	}

	parsed, err := time.Parse(time.RFC3339, trimmed)
	if err != nil {
		return nil
	}

	result := parsed.Format(outputLayout)
	return &result
}