// kafka/internal/engines/arxiv.go

package engines

import (
	"bytes"
	"context"
	"encoding/xml"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/metamorphosis-dev/kafka/internal/contracts"
)

// Fixed request parameters used when building arXiv API queries.
const (
	arxivSearchPrefix = "all" // field prefix placed before the query in search_query ("all:<query>")
	arxivMaxResults   = 10    // sent as max_results; also the page size used to compute the start offset
)

// ArxivEngine is the search engine backed by the arXiv export API.
type ArxivEngine struct {
	// client issues the outbound HTTP requests; Search reports an error
	// when it is nil.
	client *http.Client
}

// Name reports the identifier of this engine, which is always "arxiv".
func (e *ArxivEngine) Name() string {
	return "arxiv"
}

// Search queries the arXiv export API for req.Query and converts the Atom
// response into a contracts.SearchResponse.
//
// A query that is blank after trimming yields a response carrying only the
// original query, without any network traffic. Paging is derived from
// req.Pageno using a fixed page size of arxivMaxResults; page numbers that
// would produce a negative offset fall back to the first page.
//
// Errors:
//   - a nil receiver or nil HTTP client is reported as "not initialized";
//   - a non-2xx status is reported together with up to 16 KiB of the body;
//   - request construction, transport, body-read and parse failures are
//     wrapped with %w so errors.Is / errors.As still reach the cause;
//   - a success body larger than the internal size cap is rejected.
func (e *ArxivEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
	// Upper bound on how much of a successful response is buffered for
	// parsing, so a misbehaving upstream cannot exhaust memory.
	const maxBodyBytes = 8 << 20

	if e == nil || e.client == nil {
		return contracts.SearchResponse{}, errors.New("arxiv engine not initialized")
	}
	q := strings.TrimSpace(req.Query)
	if q == "" {
		return contracts.SearchResponse{Query: req.Query}, nil
	}

	// Pageno is 1-based; anything below the first page maps to offset 0.
	start := (req.Pageno - 1) * arxivMaxResults
	if start < 0 {
		start = 0
	}

	args := url.Values{}
	args.Set("search_query", fmt.Sprintf("%s:%s", arxivSearchPrefix, q))
	args.Set("start", fmt.Sprintf("%d", start))
	args.Set("max_results", fmt.Sprintf("%d", arxivMaxResults))

	endpoint := "https://export.arxiv.org/api/query?" + args.Encode()

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("arxiv: build request: %w", err)
	}

	resp, err := e.client.Do(httpReq)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("arxiv: do request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Best effort: the body only enriches the error message, so a read
		// failure here is deliberately ignored.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 16*1024))
		return contracts.SearchResponse{}, fmt.Errorf("arxiv upstream error: status=%d body=%q", resp.StatusCode, string(body))
	}

	// Read one byte past the cap so an oversized body can be told apart from
	// one that is exactly at the limit.
	raw, err := io.ReadAll(io.LimitReader(resp.Body, maxBodyBytes+1))
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("arxiv: read response body: %w", err)
	}
	if len(raw) > maxBodyBytes {
		return contracts.SearchResponse{}, fmt.Errorf("arxiv: response body exceeds %d bytes", maxBodyBytes)
	}

	results, err := parseArxivAtom(raw)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("arxiv: parse response: %w", err)
	}

	// The auxiliary collections are set to empty (non-nil) values, as in the
	// original response shape.
	return contracts.SearchResponse{
		Query:               req.Query,
		NumberOfResults:     len(results),
		Results:             results,
		Answers:             []map[string]any{},
		Corrections:         []string{},
		Infoboxes:           []map[string]any{},
		Suggestions:         []string{},
		UnresponsiveEngines: [][2]string{},
	}, nil
}

// arxivEntry holds the text collected from one <entry> element of the Atom
// feed while it is being parsed.
type arxivEntry struct {
	Title     string // trimmed text of <title>
	ID        string // trimmed text of <id>; used as the result URL
	Summary   string // trimmed text of <summary>; used as the result content
	Published string // trimmed text of <published>; reformatted by parseArxivPublished
}

// parseArxivAtom extracts search results from an arXiv Atom feed.
//
// The document is read token by token. Inside each <entry> element the text
// of the <title>, <id>, <summary> and <published> children is collected and
// trimmed; element names are compared case-insensitively on their local name
// only, so the namespace is not taken into account. Entries without a title or
// without an id are dropped. For every kept entry the id becomes the result
// URL, the summary becomes the content, and the published timestamp is
// reformatted by parseArxivPublished.
//
// On success the returned slice is non-nil (possibly empty). An error is
// returned, wrapped with %w, when the XML cannot be tokenized or when one of
// the tracked elements cannot be decoded.
func parseArxivAtom(xmlBytes []byte) ([]contracts.MainResult, error) {
	dec := xml.NewDecoder(bytes.NewReader(xmlBytes))

	var entries []arxivEntry
	var cur *arxivEntry // entry being filled in; nil while outside <entry>

	for {
		tok, err := dec.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("read atom token: %w", err)
		}

		switch t := tok.(type) {
		case xml.StartElement:
			name := strings.ToLower(t.Name.Local)
			if name == "entry" {
				cur = &arxivEntry{}
				continue
			}
			if cur == nil {
				// Feed-level elements (e.g. the feed's own <title>) are ignored.
				continue
			}

			// Pick the destination for the element's text, if it is tracked.
			var field *string
			switch name {
			case "title":
				field = &cur.Title
			case "id":
				field = &cur.ID
			case "summary":
				field = &cur.Summary
			case "published":
				field = &cur.Published
			}
			if field == nil {
				continue
			}

			var text string
			if err := dec.DecodeElement(&text, &t); err != nil {
				// Report the failure instead of silently leaving the field empty.
				return nil, fmt.Errorf("decode atom <%s> element: %w", name, err)
			}
			*field = strings.TrimSpace(text)

		case xml.EndElement:
			if strings.ToLower(t.Name.Local) == "entry" && cur != nil {
				if cur.Title != "" && cur.ID != "" {
					entries = append(entries, *cur)
				}
				cur = nil
			}
		}
	}

	out := make([]contracts.MainResult, 0, len(entries))
	for _, e := range entries {
		// A per-iteration copy so every result points at its own URL string.
		link := e.ID

		out = append(out, contracts.MainResult{
			Template:  "default.html",
			Title:     e.Title,
			Content:   e.Summary,
			URL:       &link,
			Pubdate:   parseArxivPublished(e.Published),
			Engine:    "arxiv",
			Category:  "science",
			Score:     0,
			Positions: nil,
			Engines:   []string{"arxiv"},
		})
	}
	return out, nil
}

// parseArxivPublished converts an RFC 3339 timestamp (for example
// "2024-06-03T00:00:00Z") into the layout "2006-01-02 15:04:05-0700".
//
// Surrounding whitespace is ignored. It returns nil when the input is blank
// or is not a valid RFC 3339 timestamp.
func parseArxivPublished(s string) *string {
	const outLayout = "2006-01-02 15:04:05-0700"

	trimmed := strings.TrimSpace(s)
	if trimmed == "" {
		return nil
	}

	if ts, err := time.Parse(time.RFC3339, trimmed); err == nil {
		rendered := ts.Format(outLayout)
		return &rendered
	}
	return nil
}