samsa/internal/engines/stackoverflow.go
Franz Kafka df67492602 feat: add Stack Overflow search engine
Uses the Stack Exchange API v2.3 (/search/advanced) to find questions
sorted by relevance. No API key required (300 req/day); optionally
configure via STACKOVERFLOW_KEY env var or [engines.stackoverflow].

Results include score, answer count, view count, and tags in the
snippet. Assigned to the 'it' category, triggered by the IT category
tab or explicit engine selection.

6 tests covering parsing, edge cases, and helpers.
2026-03-22 22:29:34 +00:00

226 lines
6.6 KiB
Go

// kafka — a privacy-respecting metasearch engine
// Copyright (C) 2026-present metamorphosis-dev
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package engines
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
"github.com/metamorphosis-dev/kafka/internal/contracts"
)
// stackOverflowAPIBase is the root URL of the Stack Exchange API (version 2.3);
// endpoint paths such as /search/advanced are appended to it.
const stackOverflowAPIBase = "https://api.stackexchange.com/2.3"
// StackOverflowEngine searches Stack Overflow via the public API.
// No API key is required, but providing one via STACKOVERFLOW_KEY env var
// or config raises the rate limit from 300 to 10,000 requests/day.
//
// Search returns an error when called on a nil engine or on an engine whose
// client is nil, so both must be set before use.
type StackOverflowEngine struct {
// client executes the outgoing HTTP requests issued by Search.
client *http.Client
// apiKey is sent as the "key" query parameter when it is non-empty.
apiKey string
}
// Name returns the engine's identifier, "stackoverflow".
func (e *StackOverflowEngine) Name() string {
	const engineName = "stackoverflow"
	return engineName
}
// Search queries the Stack Exchange /search/advanced endpoint for req.Query
// and hands the response body to parseStackOverflow.
//
// A query that is blank after trimming yields a response carrying only the
// original query, without any network traffic. Page numbers below 1 are
// treated as page 1. An HTTP 429 from upstream is not reported as an error:
// it produces an empty result set whose UnresponsiveEngines entry is
// {"stackoverflow", "rate_limited"}. Any other non-200 status, as well as
// request-construction, transport and read failures, is returned as an error.
func (e *StackOverflowEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
	if e == nil || e.client == nil {
		return contracts.SearchResponse{}, errors.New("stackoverflow engine not initialized")
	}

	terms := strings.TrimSpace(req.Query)
	if terms == "" {
		return contracts.SearchResponse{Query: req.Query}, nil
	}

	pageNo := req.Pageno
	if pageNo < 1 {
		pageNo = 1
	}

	params := url.Values{
		"order":    {"desc"},
		"sort":     {"relevance"},
		"site":     {"stackoverflow"},
		"page":     {fmt.Sprintf("%d", pageNo)},
		"pagesize": {"20"},
		"filter":   {"!9_bDDxJY5"},
	}
	if e.apiKey != "" {
		params.Set("key", e.apiKey)
	}

	// The search terms are appended after the encoded parameter set.
	endpoint := fmt.Sprintf("%s/search/advanced?%s&q=%s", stackOverflowAPIBase, params.Encode(), url.QueryEscape(terms))

	request, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return contracts.SearchResponse{}, err
	}
	request.Header.Set("User-Agent", "kafka/0.1 (compatible; +https://git.ashisgreat.xyz/penal-colony/kafka)")
	request.Header.Set("Accept", "application/json")

	res, err := e.client.Do(request)
	if err != nil {
		return contracts.SearchResponse{}, err
	}
	defer res.Body.Close()

	switch res.StatusCode {
	case http.StatusOK:
		// Success: continue below and decode the payload.
	case http.StatusTooManyRequests:
		// Rate limiting is surfaced as an unresponsive engine, not a failure.
		return contracts.SearchResponse{
			Query:               req.Query,
			UnresponsiveEngines: [][2]string{{"stackoverflow", "rate_limited"}},
			Results:             []contracts.MainResult{},
			Answers:             []map[string]any{},
			Corrections:         []string{},
			Infoboxes:           []map[string]any{},
			Suggestions:         []string{},
		}, nil
	default:
		// Read and discard up to 4 KiB of the error body before returning
		// (presumably to help connection reuse; the result is ignored).
		io.Copy(io.Discard, io.LimitReader(res.Body, 4*1024))
		return contracts.SearchResponse{}, fmt.Errorf("stackoverflow upstream error: status %d", res.StatusCode)
	}

	// Never read more than 2 MiB of response data.
	payload, err := io.ReadAll(io.LimitReader(res.Body, 2*1024*1024))
	if err != nil {
		return contracts.SearchResponse{}, err
	}
	return parseStackOverflow(payload, req.Query)
}
// soQuestion represents a question item from the Stack Exchange API.
// parseStackOverflow reads Title, Link, Body, Score, AnswerCount, ViewCount
// and Tags; the remaining fields are decoded but not used in the code
// visible here.
type soQuestion struct {
QuestionID int `json:"question_id"`
Title string `json:"title"`
// Link is the question URL; parseStackOverflow skips items whose Link is empty.
Link string `json:"link"`
// Body is passed through stripHTML by parseStackOverflow before being shown.
Body string `json:"body"`
Score int `json:"score"`
AnswerCount int `json:"answer_count"`
ViewCount int `json:"view_count"`
Tags []string `json:"tags"`
// CreationDate is presumably a Unix timestamp in seconds — TODO confirm
// against the API documentation.
CreationDate float64 `json:"creation_date"`
// Owner is nil when the "owner" key is absent or null.
Owner *soOwner `json:"owner"`
// AcceptedAnswerID is nil when "accepted_answer_id" is absent or null.
AcceptedAnswerID *int `json:"accepted_answer_id"`
IsAnswered bool `json:"is_answered"`
}
// soOwner holds the fields decoded from a question's "owner" JSON object.
// Neither field is read by the code visible in this file.
type soOwner struct {
Reputation int `json:"reputation"`
DisplayName string `json:"display_name"`
}
// soResponse is the top-level JSON object decoded by parseStackOverflow.
// Only Items is read there; HasMore, QuotaRemaining and QuotaMax are decoded
// but not consulted in the code visible here.
type soResponse struct {
Items []soQuestion `json:"items"`
HasMore bool `json:"has_more"`
QuotaRemaining int `json:"quota_remaining"`
QuotaMax int `json:"quota_max"`
}
// parseStackOverflow decodes a Stack Exchange API response body and converts
// every question that has a link into a MainResult in the "it" category.
//
// Content is assembled from up to three newline-separated parts, in order:
//   - a metadata line ("Score: N · M answers · V views"), emitted only for
//     questions with a positive score;
//   - the body excerpt, run through stripHTML and truncate(…, 300);
//   - up to five tags rendered as "[tag1] [tag2]".
//
// Empty parts are skipped, so Content never starts with a stray newline when
// a question has tags but neither a positive score nor a body excerpt.
//
// query is echoed back unchanged in the response. An error is returned only
// when body cannot be decoded as JSON into the expected structure.
func parseStackOverflow(body []byte, query string) (contracts.SearchResponse, error) {
	var resp soResponse
	if err := json.Unmarshal(body, &resp); err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("stackoverflow JSON parse error: %w", err)
	}

	results := make([]contracts.MainResult, 0, len(resp.Items))
	for _, q := range resp.Items {
		if q.Link == "" {
			continue
		}

		parts := make([]string, 0, 3)

		// Metadata line, shown only for positively scored questions.
		if q.Score > 0 {
			meta := fmt.Sprintf("Score: %d", q.Score)
			if q.AnswerCount > 0 {
				meta += fmt.Sprintf(" · %d answers", q.AnswerCount)
			}
			if q.ViewCount > 0 {
				meta += fmt.Sprintf(" · %s views", formatCount(q.ViewCount))
			}
			parts = append(parts, meta)
		}

		// Strip HTML from the body excerpt.
		if snippet := truncate(stripHTML(q.Body), 300); snippet != "" {
			parts = append(parts, snippet)
		}

		// Append tags as category hint.
		if len(q.Tags) > 0 {
			displayTags := q.Tags
			if len(displayTags) > 5 {
				displayTags = displayTags[:5]
			}
			parts = append(parts, "["+strings.Join(displayTags, "] [")+"]")
		}

		// Copy the link so each result points at its own string rather than
		// at the loop variable.
		link := q.Link
		results = append(results, contracts.MainResult{
			Template: "default",
			Title:    q.Title,
			Content:  strings.Join(parts, "\n"),
			URL:      &link,
			Engine:   "stackoverflow",
			Score:    float64(q.Score),
			Category: "it",
			Engines:  []string{"stackoverflow"},
		})
	}

	return contracts.SearchResponse{
		Query:               query,
		NumberOfResults:     len(results),
		Results:             results,
		Answers:             []map[string]any{},
		Corrections:         []string{},
		Infoboxes:           []map[string]any{},
		Suggestions:         []string{},
		UnresponsiveEngines: [][2]string{},
	}, nil
}
// formatCount formats large numbers compactly (1.2k, 3.4M).
//
// Values below 1,000 (including zero and negatives) are printed as plain
// integers. Larger values are shown with one decimal place and a "k" or "M"
// suffix.
func formatCount(n int) string {
	switch {
	case n >= 999_950:
		// From 999,950 upward the thousands form would round to "1000.0k",
		// so these values are promoted to millions and render as "1.0M".
		return fmt.Sprintf("%.1fM", float64(n)/1_000_000)
	case n >= 1_000:
		return fmt.Sprintf("%.1fk", float64(n)/1_000)
	default:
		return fmt.Sprintf("%d", n)
	}
}
// truncate cuts a string to at most maxLen characters (Unicode code points),
// appending "…" if anything was removed.
//
// The cut always falls on a rune boundary, so a multi-byte UTF-8 sequence is
// never split in half. A negative maxLen is treated as 0.
func truncate(s string, maxLen int) string {
	if maxLen < 0 {
		maxLen = 0
	}
	// A string never has more runes than bytes, so this cheap check settles
	// the short-input case without decoding anything.
	if len(s) <= maxLen {
		return s
	}
	seen := 0
	// Ranging over a string yields the byte offset at which each rune starts.
	for i := range s {
		if seen == maxLen {
			return s[:i] + "…"
		}
		seen++
	}
	return s
}
// stackOverflowCreatedAt converts a Unix timestamp given in seconds into a
// pointer to its UTC calendar date, formatted as "YYYY-MM-DD". Any fractional
// part of unix is discarded by the integer conversion.
// Kept as a helper for potential future pubdate use.
func stackOverflowCreatedAt(unix float64) *string {
	seconds := int64(unix)
	date := time.Unix(seconds, 0).UTC().Format("2006-01-02")
	return &date
}