From 2d22a8cdbb636aaad40f1240b4a70877a649d61f Mon Sep 17 00:00:00 2001 From: Franz Kafka Date: Sun, 22 Mar 2026 15:12:21 +0000 Subject: [PATCH] feat: add Brave web search scraper engine New brave.go: scrapes https://search.brave.com directly. Extracts title, URL, snippet, and favicon from Brave's HTML. No API key required. Rename existing BraveAPIEngine (was BraveEngine) to avoid collision with the new scraper. API engine stays as 'braveapi', scraper as 'brave'. --- internal/engines/brave.go | 172 +++++++++++++++++++++++++++++++++++ internal/engines/braveapi.go | 6 +- internal/engines/factory.go | 3 +- internal/engines/planner.go | 4 +- 4 files changed, 179 insertions(+), 6 deletions(-) create mode 100644 internal/engines/brave.go diff --git a/internal/engines/brave.go b/internal/engines/brave.go new file mode 100644 index 0000000..cb9313d --- /dev/null +++ b/internal/engines/brave.go @@ -0,0 +1,172 @@ +package engines + +import ( + "context" + "fmt" + "io" + "net/http" + "net/url" + "regexp" + "strings" + + "github.com/metamorphosis-dev/kafka/internal/contracts" +) + +type BraveEngine struct { + client *http.Client +} + +func (e *BraveEngine) Name() string { return "brave" } + +func (e *BraveEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) { + if strings.TrimSpace(req.Query) == "" { + return contracts.SearchResponse{Query: req.Query}, nil + } + + start := (req.Pageno - 1) * 20 + u := fmt.Sprintf( + "https://search.brave.com/search?q=%s&offset=%d&source=web", + url.QueryEscape(req.Query), + start, + ) + + httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return contracts.SearchResponse{}, err + } + httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36") + httpReq.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + 
httpReq.Header.Set("Accept-Language", "en-US,en;q=0.9") + + resp, err := e.client.Do(httpReq) + if err != nil { + return contracts.SearchResponse{}, err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return contracts.SearchResponse{}, fmt.Errorf("brave error: status=%d body=%q", resp.StatusCode, string(body)) + } + + body, err := io.ReadAll(io.LimitReader(resp.Body, 128*1024)) + if err != nil { + return contracts.SearchResponse{}, err + } + + results := parseBraveResults(string(body)) + return contracts.SearchResponse{ + Query: req.Query, + NumberOfResults: len(results), + Results: results, + Answers: []map[string]any{}, + Corrections: []string{}, + Infoboxes: []map[string]any{}, + Suggestions: extractBraveSuggestions(string(body)), + UnresponsiveEngines: [][2]string{}, + }, nil +} + +func parseBraveResults(body string) []contracts.MainResult { + var results []contracts.MainResult + + // Brave wraps each result in divs with data-type="web" or data-type="news". + // Pattern:
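+	//   <div ... data-type="web" ...> ... <a class="result-title" href="URL">TITLE</a> ...
+	//     <div class="snippet">SNIPPET</div> ... </div>
+	// (tag names approximate; the regexes below key on data-type, class="result-title" and class="snippet")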
+ webPattern := regexp.MustCompile(`(?s)]+data-type="web"[^>]*>(.*?)
\s*]+data-type="(web|news)"`) + matches := webPattern.FindAllStringSubmatch(body, -1) + + seen := map[string]bool{} + + for _, match := range matches { + if len(match) < 2 { + continue + } + block := match[1] + + // Extract title and URL from the result-title link. + titlePattern := regexp.MustCompile(`]+class="result-title"[^>]+href="([^"]+)"[^>]*>([^<]+)`) + titleMatch := titlePattern.FindStringSubmatch(block) + if titleMatch == nil { + continue + } + href := titleMatch[1] + title := stripTags(titleMatch[2]) + + if href == "" || !strings.HasPrefix(href, "http") { + continue + } + if seen[href] { + continue + } + seen[href] = true + + // Extract snippet. + snippet := extractBraveSnippet(block) + + // Extract favicon URL. + favicon := extractBraveFavicon(block) + + urlPtr := href + results = append(results, contracts.MainResult{ + Title: title, + URL: &urlPtr, + Content: snippet, + Thumbnail: favicon, + Engine: "brave", + Score: 1.0, + Category: "general", + Engines: []string{"brave"}, + }) + } + + return results +} + +func extractBraveSnippet(block string) string { + // Try various snippet selectors Brave uses. + patterns := []string{ + `]+class="snippet"[^>]*>(.*?)`, + `]+class="[^"]*description[^"]*"[^>]*>(.*?)

`, + `]+class="[^"]*snippet[^"]*"[^>]*>(.*?)`, + } + + for _, pat := range patterns { + re := regexp.MustCompile(`(?s)` + pat) + m := re.FindStringSubmatch(block) + if len(m) >= 2 { + text := stripTags(m[1]) + if text != "" { + return strings.TrimSpace(text) + } + } + } + return "" +} + +func extractBraveFavicon(block string) string { + imgPattern := regexp.MustCompile(`]+class="[^"]*favicon[^"]*"[^>]+src="([^"]+)"`) + m := imgPattern.FindStringSubmatch(block) + if len(m) >= 2 { + return m[1] + } + return "" +} + +func extractBraveSuggestions(body string) []string { + var suggestions []string + // Brave suggestions appear in a dropdown or related searches section. + suggestPattern := regexp.MustCompile(`(?s)]+class="[^"]*suggestion[^"]*"[^>]*>.*?]*>([^<]+)`) + matches := suggestPattern.FindAllStringSubmatch(body, -1) + seen := map[string]bool{} + for _, m := range matches { + if len(m) < 2 { + continue + } + s := strings.TrimSpace(stripTags(m[1])) + if s != "" && !seen[s] { + seen[s] = true + suggestions = append(suggestions, s) + } + } + return suggestions +} diff --git a/internal/engines/braveapi.go b/internal/engines/braveapi.go index 8977cb2..1ae6220 100644 --- a/internal/engines/braveapi.go +++ b/internal/engines/braveapi.go @@ -33,16 +33,16 @@ import ( // BraveEngine implements the Brave Web Search API. // Required: BRAVE_API_KEY env var or config. // Optional: BRAVE_ACCESS_TOKEN to gate requests. 
-type BraveEngine struct { +type BraveAPIEngine struct { client *http.Client apiKey string accessGateToken string resultsPerPage int } -func (e *BraveEngine) Name() string { return "braveapi" } +func (e *BraveAPIEngine) Name() string { return "braveapi" } -func (e *BraveEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) { +func (e *BraveAPIEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) { if e == nil || e.client == nil { return contracts.SearchResponse{}, errors.New("brave engine not initialized") } diff --git a/internal/engines/factory.go b/internal/engines/factory.go index 528dcb7..68f66eb 100644 --- a/internal/engines/factory.go +++ b/internal/engines/factory.go @@ -51,12 +51,13 @@ func NewDefaultPortedEngines(client *http.Client, cfg *config.Config) map[string "wikipedia": &WikipediaEngine{client: client}, "arxiv": &ArxivEngine{client: client}, "crossref": &CrossrefEngine{client: client}, - "braveapi": &BraveEngine{ + "braveapi": &BraveAPIEngine{ client: client, apiKey: braveAPIKey, accessGateToken: braveAccessToken, resultsPerPage: 20, }, + "brave": &BraveEngine{client: client}, "qwant": &QwantEngine{ client: client, category: "web-lite", diff --git a/internal/engines/planner.go b/internal/engines/planner.go index 9616a4b..270885b 100644 --- a/internal/engines/planner.go +++ b/internal/engines/planner.go @@ -23,7 +23,7 @@ import ( "github.com/metamorphosis-dev/kafka/internal/contracts" ) -var defaultPortedEngines = []string{"wikipedia", "arxiv", "crossref", "braveapi", "qwant", "duckduckgo", "github", "reddit", "bing", "google", "youtube"} +var defaultPortedEngines = []string{"wikipedia", "arxiv", "crossref", "braveapi", "brave", "qwant", "duckduckgo", "github", "reddit", "bing", "google", "youtube"} type Planner struct { PortedSet map[string]bool @@ -122,7 +122,7 @@ func inferFromCategories(categories []string) []string { out = append(out, e) } // stable order - 
order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "google": 5, "arxiv": 6, "crossref": 7, "github": 8, "reddit": 9, "youtube": 10} + order := map[string]int{"wikipedia": 0, "braveapi": 1, "brave": 2, "qwant": 3, "duckduckgo": 4, "bing": 5, "google": 6, "arxiv": 7, "crossref": 8, "github": 9, "reddit": 10, "youtube": 11} sortByOrder(out, order) return out }