// Package engines contains search engine adapters. This file implements the
// arXiv adapter, which queries the arXiv export API and converts the Atom
// feed it returns into contracts.MainResult values.
package engines

import (
	"bytes"
	"context"
	"encoding/xml"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/metamorphosis-dev/kafka/internal/contracts"
)

const (
	// arxivSearchPrefix is the field prefix placed in front of the user query
	// in the search_query parameter (sent as "<prefix>:<query>").
	arxivSearchPrefix = "all"

	// arxivMaxResults is the page size requested from the API; it is also the
	// stride used to turn a page number into a start offset.
	arxivMaxResults = 10

	// arxivAPIEndpoint is the URL every search request is sent to.
	arxivAPIEndpoint = "https://export.arxiv.org/api/query"

	// arxivMaxErrorBodyBytes caps how much of a non-2xx response body is
	// quoted in the returned error.
	arxivMaxErrorBodyBytes = 16 * 1024

	// arxivMaxResponseBytes caps how much of a successful response body is
	// buffered, so a misbehaving upstream cannot exhaust memory.
	arxivMaxResponseBytes = 8 << 20
)

// ArxivEngine is a search engine backed by the arXiv export API.
type ArxivEngine struct {
	// client performs the outgoing HTTP requests. It must be non-nil;
	// Search returns an error otherwise.
	client *http.Client
}

// Name returns the engine identifier, "arxiv".
func (e *ArxivEngine) Name() string { return "arxiv" }

// Search runs req.Query against the arXiv API and returns the parsed results.
//
// A query that is empty after trimming whitespace yields a response carrying
// only the original query and no error, without contacting the API.
// req.Pageno is treated as 1-based; values below 1 are clamped to the first
// page. An error is returned when the engine or its client is nil, when the
// request cannot be built or performed, when the upstream answers with a
// non-2xx status, when the body cannot be read or exceeds
// arxivMaxResponseBytes, or when the feed cannot be parsed.
func (e *ArxivEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
	if e == nil || e.client == nil {
		return contracts.SearchResponse{}, errors.New("arxiv engine not initialized")
	}

	q := strings.TrimSpace(req.Query)
	if q == "" {
		return contracts.SearchResponse{Query: req.Query}, nil
	}

	start := (req.Pageno - 1) * arxivMaxResults
	if start < 0 {
		start = 0
	}

	args := url.Values{}
	args.Set("search_query", arxivSearchPrefix+":"+q)
	args.Set("start", fmt.Sprintf("%d", start))
	args.Set("max_results", fmt.Sprintf("%d", arxivMaxResults))
	endpoint := arxivAPIEndpoint + "?" + args.Encode()

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("arxiv: building request: %w", err)
	}

	resp, err := e.client.Do(httpReq)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("arxiv: performing request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Best effort: the body is only used to enrich the error message, so
		// a failed or partial read is acceptable here.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, arxivMaxErrorBodyBytes))
		return contracts.SearchResponse{}, fmt.Errorf("arxiv upstream error: status=%d body=%q", resp.StatusCode, string(body))
	}

	// Read one byte past the cap so an oversized body can be told apart from
	// one that is exactly at the limit.
	raw, err := io.ReadAll(io.LimitReader(resp.Body, arxivMaxResponseBytes+1))
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("arxiv: reading response body: %w", err)
	}
	if len(raw) > arxivMaxResponseBytes {
		return contracts.SearchResponse{}, fmt.Errorf("arxiv: response body exceeds %d bytes", arxivMaxResponseBytes)
	}

	results, err := parseArxivAtom(raw)
	if err != nil {
		return contracts.SearchResponse{}, err
	}

	// The empty collections are deliberately non-nil literals.
	return contracts.SearchResponse{
		Query:               req.Query,
		NumberOfResults:     len(results),
		Results:             results,
		Answers:             []map[string]any{},
		Corrections:         []string{},
		Infoboxes:           []map[string]any{},
		Suggestions:         []string{},
		UnresponsiveEngines: [][2]string{},
	}, nil
}

// arxivEntry holds the text of the child elements read from one <entry> of
// the feed, each with surrounding whitespace trimmed.
type arxivEntry struct {
	Title     string
	ID        string
	Summary   string
	Published string
}

// parseArxivAtom extracts search results from the feed in xmlBytes.
//
// Only <title>, <id>, <summary> and <published> elements that appear inside
// an <entry> are read; element names are matched case-insensitively and
// without regard to namespace. Entries with an empty title or id are dropped.
// The returned slice is never nil on success. Any tokenizing or decoding
// error aborts the parse and is returned.
func parseArxivAtom(xmlBytes []byte) ([]contracts.MainResult, error) {
	dec := xml.NewDecoder(bytes.NewReader(xmlBytes))

	var (
		entries []arxivEntry
		cur     *arxivEntry // entry currently being filled; nil outside <entry>
	)
	for {
		tok, err := dec.Token()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("arxiv: parsing atom feed: %w", err)
		}

		switch t := tok.(type) {
		case xml.StartElement:
			name := strings.ToLower(t.Name.Local)
			if name == "entry" {
				cur = &arxivEntry{}
				continue
			}
			if cur == nil {
				// Elements outside an entry (such as the feed's own title
				// and id) are not results.
				continue
			}

			var dst *string
			switch name {
			case "title":
				dst = &cur.Title
			case "id":
				dst = &cur.ID
			case "summary":
				dst = &cur.Summary
			case "published":
				dst = &cur.Published
			}
			if dst == nil {
				continue
			}

			text, err := decodeArxivText(dec, t)
			if err != nil {
				return nil, fmt.Errorf("arxiv: decoding <%s> element: %w", name, err)
			}
			*dst = text

		case xml.EndElement:
			if cur != nil && strings.ToLower(t.Name.Local) == "entry" {
				if cur.Title != "" && cur.ID != "" {
					entries = append(entries, *cur)
				}
				cur = nil
			}
		}
	}

	out := make([]contracts.MainResult, 0, len(entries))
	for _, entry := range entries {
		// Take the address of a per-iteration copy so every result owns its
		// own URL string.
		link := entry.ID
		out = append(out, contracts.MainResult{
			Template:  "default.html",
			Title:     entry.Title,
			Content:   entry.Summary,
			URL:       &link,
			Pubdate:   parseArxivPublished(entry.Published),
			Engine:    "arxiv",
			Category:  "science",
			Score:     0,
			Positions: nil,
			Engines:   []string{"arxiv"},
		})
	}
	return out, nil
}

// decodeArxivText consumes the element opened by start, up to and including
// its end tag, and returns its text with surrounding whitespace removed.
func decodeArxivText(dec *xml.Decoder, start xml.StartElement) (string, error) {
	var v string
	if err := dec.DecodeElement(&v, &start); err != nil {
		return "", err
	}
	return strings.TrimSpace(v), nil
}

// parseArxivPublished converts an RFC 3339 timestamp into the
// "2006-01-02 15:04:05-0700" layout. It returns nil when s is empty after
// trimming or is not a valid RFC 3339 timestamp.
func parseArxivPublished(s string) *string {
	s = strings.TrimSpace(s)
	if s == "" {
		return nil
	}

	// ArXiv uses RFC3339 like "2024-06-03T00:00:00Z".
	t, err := time.Parse(time.RFC3339, s)
	if err != nil {
		return nil
	}

	formatted := t.Format("2006-01-02 15:04:05-0700")
	return &formatted
}