kafka/internal/engines/planner.go

package engines

import (
	"os"
	"strings"

	"github.com/metamorphosis-dev/kafka/internal/contracts"
)

// defaultPortedEngines is the engine list used to build a Planner when the
// LOCAL_PORTED_ENGINES environment variable is unset or yields no names.
var defaultPortedEngines = []string{
	"wikipedia",
	"arxiv",
	"crossref",
	"braveapi",
	"qwant",
	"duckduckgo",
	"github",
	"reddit",
	"bing",
	"google",
	"youtube",
}

// Planner decides which requested engines are handled locally ("ported")
// and which are left for the upstream instance.
//
// When built through NewPlanner, both fields hold the same engine names,
// lower-cased, trimmed and de-duplicated.
type Planner struct {
	PortedSet  map[string]bool // membership lookup keyed by engine name
	PortedList []string        // the same names as a list, in first-seen order
}

// NewPlannerFromEnv builds a Planner from the comma-separated engine names in
// the LOCAL_PORTED_ENGINES environment variable. When the variable is unset,
// blank, or contains no non-empty names, defaultPortedEngines is used instead.
func NewPlannerFromEnv() *Planner {
	engines := defaultPortedEngines
	// splitCSV returns nothing for an empty string, so a single length check
	// covers both the "unset/blank" and the "only separators" cases.
	if configured := splitCSV(strings.TrimSpace(os.Getenv("LOCAL_PORTED_ENGINES"))); len(configured) > 0 {
		engines = configured
	}
	return NewPlanner(engines)
}

// NewPlanner returns a Planner for the given ported engine names.
//
// Each name is lower-cased and trimmed; empty names and repeats are dropped.
// PortedList keeps the surviving names in the order they were first seen and
// PortedSet contains exactly the same names.
func NewPlanner(portedEngines []string) *Planner {
	planner := &Planner{
		PortedSet:  make(map[string]bool, len(portedEngines)),
		PortedList: make([]string, 0, len(portedEngines)),
	}
	for _, raw := range portedEngines {
		name := strings.TrimSpace(strings.ToLower(raw))
		if name == "" || planner.PortedSet[name] {
			continue
		}
		planner.PortedSet[name] = true
		planner.PortedList = append(planner.PortedList, name)
	}
	return planner
}

// Plan splits the engines requested by req into two groups and returns:
//   - localEngines: requested engines present in the planner's PortedSet
//   - upstreamEngines: every other requested engine, to be executed by the
//     upstream instance
//   - requestedEngines: the full (possibly inferred) list the split was made from
//
// A non-empty req.Engines always wins and is normalized via normalizeList;
// req.Categories is only consulted (via inferFromCategories) when req.Engines
// is empty. All three results preserve the order of requestedEngines.
//
// A nil receiver is allowed: a planner is then built with NewPlannerFromEnv.
func (p *Planner) Plan(req contracts.SearchRequest) (localEngines, upstreamEngines, requestedEngines []string) {
	planner := p
	if planner == nil {
		planner = NewPlannerFromEnv()
	}

	switch {
	case len(req.Engines) > 0:
		requestedEngines = normalizeList(req.Engines)
	default:
		requestedEngines = inferFromCategories(req.Categories)
	}

	n := len(requestedEngines)
	localEngines, upstreamEngines = make([]string, 0, n), make([]string, 0, n)
	for _, name := range requestedEngines {
		if !planner.PortedSet[name] {
			upstreamEngines = append(upstreamEngines, name)
			continue
		}
		localEngines = append(localEngines, name)
	}

	return localEngines, upstreamEngines, requestedEngines
}

// inferFromCategories maps request categories onto the starter set of engines.
//
// Category names are compared after lower-casing and trimming whitespace;
// names that are not recognised are ignored. Each engine appears at most once,
// and the result always follows one fixed order (general engines first, then
// science, it, social media and videos) regardless of the order or repetition
// of the input. The returned slice is never nil, even when nothing matches.
//
// This is a minimal mapping for the initial porting subset; it mirrors the
// idea of selecting from engine categories without embedding the whole engine
// registry.
func inferFromCategories(categories []string) []string {
	// Category groups, declared in the order their engines must be emitted.
	const (
		groupGeneral = iota
		groupScience
		groupIT
		groupSocial
		groupVideos
		groupCount
	)

	// Single source of truth for both membership and output order: adding an
	// engine here is enough, there is no separate ranking table to keep in sync.
	engines := [groupCount][]string{
		groupGeneral: {"wikipedia", "braveapi", "qwant", "duckduckgo", "bing", "google"},
		groupScience: {"arxiv", "crossref"},
		groupIT:      {"github"},
		groupSocial:  {"reddit"},
		groupVideos:  {"youtube"},
	}

	var selected [groupCount]bool
	for _, c := range categories {
		switch strings.TrimSpace(strings.ToLower(c)) {
		case "general":
			selected[groupGeneral] = true
		case "science", "scientific publications":
			selected[groupScience] = true
		case "it":
			selected[groupIT] = true
		case "social media":
			selected[groupSocial] = true
		case "videos":
			selected[groupVideos] = true
		}
	}

	total := 0
	for g, on := range selected {
		if on {
			total += len(engines[g])
		}
	}

	// make (rather than a nil slice) keeps the "never nil" contract.
	out := make([]string, 0, total)
	for g, on := range selected {
		if on {
			out = append(out, engines[g]...)
		}
	}
	return out
}

// sortByOrder sorts list in place by ascending order[name].
//
// Names missing from order rank as 0 (the map's zero value). The sort is a
// stable insertion sort, so entries with equal rank keep their relative
// position; it is quadratic, which is fine for the tiny lists it is meant for.
func sortByOrder(list []string, order map[string]int) {
	for i := 1; i < len(list); i++ {
		current := list[i]
		rank := order[current]

		// Shift strictly larger-ranked entries one slot to the right, then
		// drop the current entry into the gap that opens up.
		pos := i
		for ; pos > 0 && order[list[pos-1]] > rank; pos-- {
			list[pos] = list[pos-1]
		}
		list[pos] = current
	}
}

// normalizeList lower-cases and trims every entry of in, discarding empty
// entries and repeats. Surviving entries keep the order in which they were
// first seen. The result is never nil.
func normalizeList(in []string) []string {
	result := make([]string, 0, len(in))
	known := make(map[string]struct{})
	for _, raw := range in {
		name := strings.ToLower(raw)
		name = strings.TrimSpace(name)
		if name == "" {
			continue
		}
		if _, dup := known[name]; dup {
			continue
		}
		known[name] = struct{}{}
		result = append(result, name)
	}
	return result
}

// splitCSV splits s on commas and returns the trimmed, non-empty fields in
// their original order. An empty s yields nil; a non-empty s that contains no
// usable field yields an empty, non-nil slice.
func splitCSV(s string) []string {
	if len(s) == 0 {
		return nil
	}
	fields := strings.Split(s, ",")
	kept := make([]string, 0, len(fields))
	for _, field := range fields {
		if trimmed := strings.TrimSpace(field); trimmed != "" {
			kept = append(kept, trimmed)
		}
	}
	return kept
}