feat: add Brave web search scraper engine

New brave.go: scrapes https://search.brave.com directly.
Extracts title, URL, snippet, and favicon from Brave's HTML.
No API key required.

Rename the existing API-backed engine from BraveEngine to BraveAPIEngine to avoid
a name collision with the new scraper. The API engine stays registered as 'braveapi'; the scraper is registered as 'brave'.
This commit is contained in:
Franz Kafka 2026-03-22 15:12:21 +00:00
parent 994d27ff7f
commit 2d22a8cdbb
4 changed files with 179 additions and 6 deletions

172
internal/engines/brave.go Normal file
View file

@ -0,0 +1,172 @@
package engines
import (
	"context"
	"fmt"
	"html"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strings"

	"github.com/metamorphosis-dev/kafka/internal/contracts"
)
// BraveEngine is the scraper-based Brave search engine. Its Search method
// requests https://search.brave.com directly and parses the returned HTML,
// so no API key is involved.
type BraveEngine struct {
// client issues the outgoing HTTP requests made by Search.
client *http.Client
}
// Name reports the identifier of the scraper engine, "brave".
func (e *BraveEngine) Name() string {
	const engineName = "brave"
	return engineName
}
// Search scrapes one page of Brave web results for req.Query.
//
// A query that is blank after trimming whitespace yields a response carrying
// only the query, without contacting Brave. Otherwise a GET request with
// browser-like headers is sent to https://search.brave.com/search and the
// returned HTML is handed to parseBraveResults and extractBraveSuggestions.
//
// An error is returned when the engine or its HTTP client is nil, when the
// request cannot be built or sent, when Brave answers with a non-200 status
// (up to 4 KiB of the body is quoted in the error), or when reading the body
// fails. Underlying errors are wrapped with %w, so errors.Is / errors.As
// still see them.
func (e *BraveEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
	if strings.TrimSpace(req.Query) == "" {
		return contracts.SearchResponse{Query: req.Query}, nil
	}
	// Guard against a zero-value engine instead of panicking on e.client.Do,
	// mirroring the check done by the API-backed Brave engine.
	if e == nil || e.client == nil {
		return contracts.SearchResponse{}, fmt.Errorf("brave engine not initialized")
	}

	// Brave's offset parameter is a zero-based page index rather than a count
	// of results to skip, so page N maps to offset N-1. Clamp so an unset or
	// non-positive Pageno still asks for the first page.
	offset := req.Pageno - 1
	if offset < 0 {
		offset = 0
	}
	u := fmt.Sprintf(
		"https://search.brave.com/search?q=%s&offset=%d&source=web",
		url.QueryEscape(req.Query),
		offset,
	)

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("brave: build request: %w", err)
	}
	// Present as an ordinary desktop browser requesting an HTML page.
	httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36")
	httpReq.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	httpReq.Header.Set("Accept-Language", "en-US,en;q=0.9")

	resp, err := e.client.Do(httpReq)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("brave: send request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body excerpt only enriches the error message, so a
		// read failure here is deliberately ignored.
		excerpt, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return contracts.SearchResponse{}, fmt.Errorf("brave error: status=%d body=%q", resp.StatusCode, string(excerpt))
	}

	// NOTE(review): anything past 128 KiB is silently dropped before parsing;
	// confirm this cap is large enough for a full Brave result page.
	body, err := io.ReadAll(io.LimitReader(resp.Body, 128*1024))
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("brave: read response: %w", err)
	}

	page := string(body) // convert once; both parsers work on the same string

	// Keep the slice fields non-nil, like the other collections below, so an
	// empty page does not produce a mix of nil and empty values.
	results := parseBraveResults(page)
	if results == nil {
		results = []contracts.MainResult{}
	}
	suggestions := extractBraveSuggestions(page)
	if suggestions == nil {
		suggestions = []string{}
	}

	return contracts.SearchResponse{
		Query:               req.Query,
		NumberOfResults:     len(results),
		Results:             results,
		Answers:             []map[string]any{},
		Corrections:         []string{},
		Infoboxes:           []map[string]any{},
		Suggestions:         suggestions,
		UnresponsiveEngines: [][2]string{},
	}, nil
}
// Patterns used by parseBraveResults, compiled once at package initialisation
// instead of on every call (and, for the title pattern, on every result).
var (
	// braveResultOpenRe matches the opening tag of a result container div and
	// captures its data-type value ("web" or "news").
	braveResultOpenRe = regexp.MustCompile(`<div[^>]+data-type="(web|news)"[^>]*>`)
	// braveTitleRe captures the href and the link text of a result-title anchor.
	braveTitleRe = regexp.MustCompile(`<a[^>]+class="result-title"[^>]+href="([^"]+)"[^>]*>([^<]+)</a>`)
)

// parseBraveResults extracts web results from a Brave result page.
//
// The page is cut into blocks at every div whose data-type is "web" or
// "news"; each "web" block runs from the end of its opening tag to the start
// of the next such div, or to the end of body for the last one. Locating the
// openers first, rather than matching "block followed by the next opener"
// with a single expression, matters because regexp matches never overlap:
// a terminator that consumes the next opener makes every second result
// unmatchable and drops the final result altogether.
//
// NOTE(review): the data-type and result-title markup is taken from the
// original author's description of Brave's HTML — verify against live pages.
//
// A block yields a result only when it contains a result-title link whose
// (HTML-unescaped) href starts with "http" and has not been seen before.
// The returned slice is never nil.
func parseBraveResults(body string) []contracts.MainResult {
	openers := braveResultOpenRe.FindAllStringSubmatchIndex(body, -1)
	results := make([]contracts.MainResult, 0, len(openers))
	seen := make(map[string]bool, len(openers))

	for i, loc := range openers {
		// "news" containers only delimit blocks; they are not emitted.
		if body[loc[2]:loc[3]] != "web" {
			continue
		}
		end := len(body)
		if i+1 < len(openers) {
			end = openers[i+1][0]
		}
		block := body[loc[1]:end]

		titleMatch := braveTitleRe.FindStringSubmatch(block)
		if titleMatch == nil {
			continue
		}
		// Attribute values are entity-encoded in HTML ("&amp;" between query
		// parameters); decode so the stored URL is directly usable.
		href := html.UnescapeString(titleMatch[1])
		title := stripTags(titleMatch[2])
		if !strings.HasPrefix(href, "http") || seen[href] {
			continue
		}
		seen[href] = true

		// href is declared inside the loop body, so each result gets its own
		// variable and taking its address is safe.
		results = append(results, contracts.MainResult{
			Title:     title,
			URL:       &href,
			Content:   extractBraveSnippet(block),
			Thumbnail: extractBraveFavicon(block),
			Engine:    "brave",
			Score:     1.0,
			Category:  "general",
			Engines:   []string{"brave"},
		})
	}
	return results
}
// braveSnippetPatterns lists, in priority order, the markup shapes tried when
// looking for a result's descriptive text. They are compiled once here rather
// than on every call, since extractBraveSnippet runs once per result.
var braveSnippetPatterns = []*regexp.Regexp{
	regexp.MustCompile(`(?s)<div[^>]+class="snippet"[^>]*>(.*?)</div>`),
	regexp.MustCompile(`(?s)<p[^>]+class="[^"]*description[^"]*"[^>]*>(.*?)</p>`),
	regexp.MustCompile(`(?s)<span[^>]+class="[^"]*snippet[^"]*"[^>]*>(.*?)</span>`),
}

// extractBraveSnippet returns the snippet text found in a single result block.
//
// Each pattern in braveSnippetPatterns is tried in order; the first one whose
// captured content is non-empty after tag stripping and whitespace trimming
// wins. Trimming happens before the emptiness check so that a whitespace-only
// element falls through to the next pattern instead of ending the search
// with an empty snippet. Returns "" when no pattern produces text.
func extractBraveSnippet(block string) string {
	for _, re := range braveSnippetPatterns {
		m := re.FindStringSubmatch(block)
		if m == nil {
			continue
		}
		if text := strings.TrimSpace(stripTags(m[1])); text != "" {
			return text
		}
	}
	return ""
}
// braveFaviconRe captures the src attribute of the first <img> tag whose
// class attribute contains "favicon" and precedes src. Compiled once here
// rather than on every call.
var braveFaviconRe = regexp.MustCompile(`<img[^>]+class="[^"]*favicon[^"]*"[^>]+src="([^"]+)"`)

// extractBraveFavicon returns the favicon URL found in a single result block,
// or "" when the block has no matching <img> tag.
//
// The attribute value is HTML-unescaped, because attribute values are
// entity-encoded in markup (for example "&amp;" between query parameters)
// and the raw text would not be a usable URL.
func extractBraveFavicon(block string) string {
	m := braveFaviconRe.FindStringSubmatch(block)
	if m == nil {
		return ""
	}
	return html.UnescapeString(m[1])
}
// braveSuggestionRe captures the text of the first anchor that follows an
// <li> whose class attribute contains "suggestion". Compiled once here rather
// than on every call.
var braveSuggestionRe = regexp.MustCompile(`(?s)<li[^>]+class="[^"]*suggestion[^"]*"[^>]*>.*?<a[^>]*>([^<]+)</a>`)

// extractBraveSuggestions collects suggestion texts from a Brave result page.
//
// Each captured text is tag-stripped and trimmed; empty strings and
// duplicates are dropped, and the order of first appearance is kept. The
// returned slice is never nil, so a page without suggestions yields an empty
// list rather than a nil one.
func extractBraveSuggestions(body string) []string {
	matches := braveSuggestionRe.FindAllStringSubmatch(body, -1)
	suggestions := make([]string, 0, len(matches))
	seen := make(map[string]bool, len(matches))
	for _, m := range matches {
		s := strings.TrimSpace(stripTags(m[1]))
		if s == "" || seen[s] {
			continue
		}
		seen[s] = true
		suggestions = append(suggestions, s)
	}
	return suggestions
}

View file

@ -33,16 +33,16 @@ import (
// BraveAPIEngine implements the Brave Web Search API.
// Required: BRAVE_API_KEY env var or config.
// Optional: BRAVE_ACCESS_TOKEN to gate requests.
type BraveEngine struct {
type BraveAPIEngine struct {
client *http.Client
apiKey string
accessGateToken string
resultsPerPage int
}
func (e *BraveEngine) Name() string { return "braveapi" }
func (e *BraveAPIEngine) Name() string { return "braveapi" }
func (e *BraveEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
func (e *BraveAPIEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
if e == nil || e.client == nil {
return contracts.SearchResponse{}, errors.New("brave engine not initialized")
}

View file

@ -51,12 +51,13 @@ func NewDefaultPortedEngines(client *http.Client, cfg *config.Config) map[string
"wikipedia": &WikipediaEngine{client: client},
"arxiv": &ArxivEngine{client: client},
"crossref": &CrossrefEngine{client: client},
"braveapi": &BraveEngine{
"braveapi": &BraveAPIEngine{
client: client,
apiKey: braveAPIKey,
accessGateToken: braveAccessToken,
resultsPerPage: 20,
},
"brave": &BraveEngine{client: client},
"qwant": &QwantEngine{
client: client,
category: "web-lite",

View file

@ -23,7 +23,7 @@ import (
"github.com/metamorphosis-dev/kafka/internal/contracts"
)
var defaultPortedEngines = []string{"wikipedia", "arxiv", "crossref", "braveapi", "qwant", "duckduckgo", "github", "reddit", "bing", "google", "youtube"}
var defaultPortedEngines = []string{"wikipedia", "arxiv", "crossref", "braveapi", "brave", "qwant", "duckduckgo", "github", "reddit", "bing", "google", "youtube"}
type Planner struct {
PortedSet map[string]bool
@ -122,7 +122,7 @@ func inferFromCategories(categories []string) []string {
out = append(out, e)
}
// stable order
order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "google": 5, "arxiv": 6, "crossref": 7, "github": 8, "reddit": 9, "youtube": 10}
order := map[string]int{"wikipedia": 0, "braveapi": 1, "brave": 2, "qwant": 3, "duckduckgo": 4, "bing": 5, "google": 6, "arxiv": 7, "crossref": 8, "github": 9, "reddit": 10, "youtube": 11}
sortByOrder(out, order)
return out
}