samsa/internal/engines/brave.go
Franz Kafka 2d22a8cdbb feat: add Brave web search scraper engine
New brave.go: scrapes https://search.brave.com directly.
Extracts title, URL, snippet, and favicon from Brave's HTML.
No API key required.

Rename existing BraveAPIEngine (was BraveEngine) to avoid collision
with the new scraper. API engine stays as 'braveapi', scraper as 'brave'.
2026-03-22 16:01:49 +00:00

172 lines
4.6 KiB
Go

package engines
import (
"context"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"github.com/metamorphosis-dev/kafka/internal/contracts"
)
// BraveEngine is a search engine that fetches result pages from
// https://search.brave.com over HTTP and parses the returned HTML.
type BraveEngine struct {
	// client performs the outgoing HTTP requests issued by Search.
	client *http.Client
}

// Name returns the identifier of this engine, "brave".
func (e *BraveEngine) Name() string {
	return "brave"
}
// Search fetches one page of Brave web results for req.Query and returns the
// parsed results together with any suggestions found on the page.
//
// A query that is empty after trimming whitespace yields an empty response
// without any network access. A request-construction failure, transport
// failure, non-200 status, or body read error is returned as an error together
// with a zero-value response.
func (e *BraveEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
	if strings.TrimSpace(req.Query) == "" {
		return contracts.SearchResponse{Query: req.Query}, nil
	}

	// Page numbers are 1-based. Treat an unset or invalid page number as the
	// first page so that a negative offset is never sent to Brave.
	page := req.Pageno
	if page < 1 {
		page = 1
	}
	// NOTE(review): Brave's public API documents offset as a zero-based page
	// index rather than a result count; confirm whether the HTML endpoint
	// expects page-1 here instead of (page-1)*20.
	offset := (page - 1) * 20

	u := fmt.Sprintf(
		"https://search.brave.com/search?q=%s&offset=%d&source=web",
		url.QueryEscape(req.Query),
		offset,
	)

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("brave: building request: %w", err)
	}
	// Send browser-like headers with the scrape request.
	httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36")
	httpReq.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	httpReq.Header.Set("Accept-Language", "en-US,en;q=0.9")

	resp, err := e.client.Do(httpReq)
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("brave: sending request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: include at most 4 KiB of the error page for diagnosis;
		// a read failure here only shortens the excerpt.
		excerpt, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return contracts.SearchResponse{}, fmt.Errorf("brave error: status=%d body=%q", resp.StatusCode, string(excerpt))
	}

	// Only the first 128 KiB of the page are read; anything beyond that is
	// never seen by the parsers.
	raw, err := io.ReadAll(io.LimitReader(resp.Body, 128*1024))
	if err != nil {
		return contracts.SearchResponse{}, fmt.Errorf("brave: reading response body: %w", err)
	}

	// Convert once and share the string between both parsers.
	page_html := string(raw)
	results := parseBraveResults(page_html)

	return contracts.SearchResponse{
		Query:               req.Query,
		NumberOfResults:     len(results),
		Results:             results,
		Answers:             []map[string]any{},
		Corrections:         []string{},
		Infoboxes:           []map[string]any{},
		Suggestions:         extractBraveSuggestions(page_html),
		UnresponsiveEngines: [][2]string{},
	}, nil
}
// braveResultOpenRe matches the opening <div> tag of a Brave result container
// and captures the value of its data-type attribute ("web" or "news").
var braveResultOpenRe = regexp.MustCompile(`<div[^>]+data-type="(web|news)"[^>]*>`)

// braveTitleLinkRe captures the href (group 1) and the text (group 2) of a
// result's <a class="result-title"> link.
var braveTitleLinkRe = regexp.MustCompile(`<a[^>]+class="result-title"[^>]+href="([^"]+)"[^>]*>([^<]+)</a>`)

// parseBraveResults extracts the web results from the HTML of a Brave results
// page.
//
// Each result block runs from the end of a data-type="web" opening tag up to
// the start of the next data-type="web"/"news" opening tag, or to the end of
// body for the last one. Slicing between opening tags (rather than matching a
// block together with the following opening tag) is deliberate: a regexp that
// consumes the next opening tag as its terminator cannot then match that next
// block, which drops every second result as well as the final one.
//
// Blocks without a result-title link, with an href that does not start with
// "http", or with an href already seen are skipped. The returned slice is nil
// when nothing matched.
func parseBraveResults(body string) []contracts.MainResult {
	var results []contracts.MainResult

	openers := braveResultOpenRe.FindAllStringSubmatchIndex(body, -1)
	seen := make(map[string]bool, len(openers))

	for i, loc := range openers {
		// loc is [tagStart, tagEnd, typeStart, typeEnd]. News containers only
		// act as block boundaries; they are not parsed themselves.
		if body[loc[2]:loc[3]] != "web" {
			continue
		}

		end := len(body)
		if i+1 < len(openers) {
			end = openers[i+1][0]
		}
		block := body[loc[1]:end]

		// Title and URL come from the result-title link.
		titleMatch := braveTitleLinkRe.FindStringSubmatch(block)
		if titleMatch == nil {
			continue
		}
		href := titleMatch[1]
		if !strings.HasPrefix(href, "http") || seen[href] {
			continue
		}
		seen[href] = true

		results = append(results, contracts.MainResult{
			Title:     stripTags(titleMatch[2]),
			URL:       &href, // href is a fresh variable on every iteration
			Content:   extractBraveSnippet(block),
			Thumbnail: extractBraveFavicon(block),
			Engine:    "brave",
			Score:     1.0,
			Category:  "general",
			Engines:   []string{"brave"},
		})
	}
	return results
}
// braveSnippetRes lists, in priority order, the patterns tried when looking
// for a result's snippet. Each one captures the inner HTML of the element, and
// (?s) lets that inner HTML span multiple lines.
var braveSnippetRes = []*regexp.Regexp{
	regexp.MustCompile(`(?s)<div[^>]+class="snippet"[^>]*>(.*?)</div>`),
	regexp.MustCompile(`(?s)<p[^>]+class="[^"]*description[^"]*"[^>]*>(.*?)</p>`),
	regexp.MustCompile(`(?s)<span[^>]+class="[^"]*snippet[^"]*"[^>]*>(.*?)</span>`),
}

// extractBraveSnippet returns the snippet text of a single result block with
// markup removed via stripTags and surrounding whitespace trimmed.
//
// The patterns in braveSnippetRes are tried in order; the first one whose
// captured content is non-empty after trimming wins. Trimming happens before
// the emptiness check so that a whitespace-only element falls through to the
// next pattern instead of ending the search with an empty snippet. It returns
// "" when no pattern yields text.
func extractBraveSnippet(block string) string {
	for _, re := range braveSnippetRes {
		m := re.FindStringSubmatch(block)
		if m == nil {
			continue
		}
		if text := strings.TrimSpace(stripTags(m[1])); text != "" {
			return text
		}
	}
	return ""
}
// braveFaviconRe captures the src attribute of an <img> whose class attribute
// contains "favicon". The class attribute must appear before src in the tag.
var braveFaviconRe = regexp.MustCompile(`<img[^>]+class="[^"]*favicon[^"]*"[^>]+src="([^"]+)"`)

// extractBraveFavicon returns the src of the first favicon image found in
// block, exactly as written in the HTML, or "" when there is none.
func extractBraveFavicon(block string) string {
	if m := braveFaviconRe.FindStringSubmatch(block); m != nil {
		return m[1]
	}
	return ""
}
// braveSuggestionRe captures the text of the first link that follows an <li>
// whose class attribute contains "suggestion". (?s) allows markup between the
// <li> and the link to span multiple lines.
var braveSuggestionRe = regexp.MustCompile(`(?s)<li[^>]+class="[^"]*suggestion[^"]*"[^>]*>.*?<a[^>]*>([^<]+)</a>`)

// extractBraveSuggestions returns the suggestion texts found in body, in
// document order. Each text is passed through stripTags and trimmed; empty
// texts and repeats of an earlier suggestion are dropped. The result is nil
// when the page contains no suggestions.
func extractBraveSuggestions(body string) []string {
	var suggestions []string
	seen := map[string]bool{}
	for _, m := range braveSuggestionRe.FindAllStringSubmatch(body, -1) {
		text := strings.TrimSpace(stripTags(m[1]))
		if text == "" || seen[text] {
			continue
		}
		seen[text] = true
		suggestions = append(suggestions, text)
	}
	return suggestions
}