feat: add Brave web search scraper engine
New brave.go: scrapes https://search.brave.com directly and extracts the title, URL, snippet, and favicon of each result from Brave's HTML. No API key required. The existing API-backed engine is renamed from BraveEngine to BraveAPIEngine to avoid a name collision with the new scraper. The API engine stays registered as 'braveapi'; the scraper is registered as 'brave'.
This commit is contained in:
parent
994d27ff7f
commit
2d22a8cdbb
4 changed files with 179 additions and 6 deletions
172
internal/engines/brave.go
Normal file
172
internal/engines/brave.go
Normal file
|
|
@ -0,0 +1,172 @@
|
|||
package engines
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/metamorphosis-dev/kafka/internal/contracts"
|
||||
)
|
||||
|
||||
// BraveEngine is the search engine registered under the name "brave".
// It issues its requests through the HTTP client it was constructed with.
type BraveEngine struct {
	// client performs the outbound HTTP requests made by Search.
	client *http.Client
}

// Name returns the identifier of this engine, "brave".
func (e *BraveEngine) Name() string { return "brave" }
|
||||
|
||||
func (e *BraveEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
|
||||
if strings.TrimSpace(req.Query) == "" {
|
||||
return contracts.SearchResponse{Query: req.Query}, nil
|
||||
}
|
||||
|
||||
start := (req.Pageno - 1) * 20
|
||||
u := fmt.Sprintf(
|
||||
"https://search.brave.com/search?q=%s&offset=%d&source=web",
|
||||
url.QueryEscape(req.Query),
|
||||
start,
|
||||
)
|
||||
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
httpReq.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36")
|
||||
httpReq.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
||||
httpReq.Header.Set("Accept-Language", "en-US,en;q=0.9")
|
||||
|
||||
resp, err := e.client.Do(httpReq)
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
|
||||
return contracts.SearchResponse{}, fmt.Errorf("brave error: status=%d body=%q", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, 128*1024))
|
||||
if err != nil {
|
||||
return contracts.SearchResponse{}, err
|
||||
}
|
||||
|
||||
results := parseBraveResults(string(body))
|
||||
return contracts.SearchResponse{
|
||||
Query: req.Query,
|
||||
NumberOfResults: len(results),
|
||||
Results: results,
|
||||
Answers: []map[string]any{},
|
||||
Corrections: []string{},
|
||||
Infoboxes: []map[string]any{},
|
||||
Suggestions: extractBraveSuggestions(string(body)),
|
||||
UnresponsiveEngines: [][2]string{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func parseBraveResults(body string) []contracts.MainResult {
|
||||
var results []contracts.MainResult
|
||||
|
||||
// Brave wraps each result in divs with data-type="web" or data-type="news".
|
||||
// Pattern: <div ... data-type="web"> ... <a class="result-title" href="URL">TITLE</a> ... <div class="snippet">SNIPPET</div>
|
||||
webPattern := regexp.MustCompile(`(?s)<div[^>]+data-type="web"[^>]*>(.*?)</div>\s*<div[^>]+data-type="(web|news)"`)
|
||||
matches := webPattern.FindAllStringSubmatch(body, -1)
|
||||
|
||||
seen := map[string]bool{}
|
||||
|
||||
for _, match := range matches {
|
||||
if len(match) < 2 {
|
||||
continue
|
||||
}
|
||||
block := match[1]
|
||||
|
||||
// Extract title and URL from the result-title link.
|
||||
titlePattern := regexp.MustCompile(`<a[^>]+class="result-title"[^>]+href="([^"]+)"[^>]*>([^<]+)</a>`)
|
||||
titleMatch := titlePattern.FindStringSubmatch(block)
|
||||
if titleMatch == nil {
|
||||
continue
|
||||
}
|
||||
href := titleMatch[1]
|
||||
title := stripTags(titleMatch[2])
|
||||
|
||||
if href == "" || !strings.HasPrefix(href, "http") {
|
||||
continue
|
||||
}
|
||||
if seen[href] {
|
||||
continue
|
||||
}
|
||||
seen[href] = true
|
||||
|
||||
// Extract snippet.
|
||||
snippet := extractBraveSnippet(block)
|
||||
|
||||
// Extract favicon URL.
|
||||
favicon := extractBraveFavicon(block)
|
||||
|
||||
urlPtr := href
|
||||
results = append(results, contracts.MainResult{
|
||||
Title: title,
|
||||
URL: &urlPtr,
|
||||
Content: snippet,
|
||||
Thumbnail: favicon,
|
||||
Engine: "brave",
|
||||
Score: 1.0,
|
||||
Category: "general",
|
||||
Engines: []string{"brave"},
|
||||
})
|
||||
}
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
func extractBraveSnippet(block string) string {
|
||||
// Try various snippet selectors Brave uses.
|
||||
patterns := []string{
|
||||
`<div[^>]+class="snippet"[^>]*>(.*?)</div>`,
|
||||
`<p[^>]+class="[^"]*description[^"]*"[^>]*>(.*?)</p>`,
|
||||
`<span[^>]+class="[^"]*snippet[^"]*"[^>]*>(.*?)</span>`,
|
||||
}
|
||||
|
||||
for _, pat := range patterns {
|
||||
re := regexp.MustCompile(`(?s)` + pat)
|
||||
m := re.FindStringSubmatch(block)
|
||||
if len(m) >= 2 {
|
||||
text := stripTags(m[1])
|
||||
if text != "" {
|
||||
return strings.TrimSpace(text)
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// braveFaviconRe matches an <img> whose class attribute contains "favicon" and
// which is followed, inside the same tag, by a src attribute. Submatch 1 is the
// src value. Compiled once at package scope instead of on every call.
var braveFaviconRe = regexp.MustCompile(`<img[^>]+class="[^"]*favicon[^"]*"[^>]+src="([^"]+)"`)

// extractBraveFavicon returns the src of the first favicon image found in a
// result block, or "" when the block contains none.
func extractBraveFavicon(block string) string {
	if m := braveFaviconRe.FindStringSubmatch(block); m != nil {
		return m[1]
	}
	return ""
}
|
||||
|
||||
func extractBraveSuggestions(body string) []string {
|
||||
var suggestions []string
|
||||
// Brave suggestions appear in a dropdown or related searches section.
|
||||
suggestPattern := regexp.MustCompile(`(?s)<li[^>]+class="[^"]*suggestion[^"]*"[^>]*>.*?<a[^>]*>([^<]+)</a>`)
|
||||
matches := suggestPattern.FindAllStringSubmatch(body, -1)
|
||||
seen := map[string]bool{}
|
||||
for _, m := range matches {
|
||||
if len(m) < 2 {
|
||||
continue
|
||||
}
|
||||
s := strings.TrimSpace(stripTags(m[1]))
|
||||
if s != "" && !seen[s] {
|
||||
seen[s] = true
|
||||
suggestions = append(suggestions, s)
|
||||
}
|
||||
}
|
||||
return suggestions
|
||||
}
|
||||
|
|
@ -33,16 +33,16 @@ import (
|
|||
// BraveAPIEngine implements the Brave Web Search API.
|
||||
// Required: BRAVE_API_KEY env var or config.
|
||||
// Optional: BRAVE_ACCESS_TOKEN to gate requests.
|
||||
type BraveEngine struct {
|
||||
type BraveAPIEngine struct {
|
||||
client *http.Client
|
||||
apiKey string
|
||||
accessGateToken string
|
||||
resultsPerPage int
|
||||
}
|
||||
|
||||
func (e *BraveEngine) Name() string { return "braveapi" }
|
||||
func (e *BraveAPIEngine) Name() string { return "braveapi" }
|
||||
|
||||
func (e *BraveEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
|
||||
func (e *BraveAPIEngine) Search(ctx context.Context, req contracts.SearchRequest) (contracts.SearchResponse, error) {
|
||||
if e == nil || e.client == nil {
|
||||
return contracts.SearchResponse{}, errors.New("brave engine not initialized")
|
||||
}
|
||||
|
|
|
|||
|
|
@ -51,12 +51,13 @@ func NewDefaultPortedEngines(client *http.Client, cfg *config.Config) map[string
|
|||
"wikipedia": &WikipediaEngine{client: client},
|
||||
"arxiv": &ArxivEngine{client: client},
|
||||
"crossref": &CrossrefEngine{client: client},
|
||||
"braveapi": &BraveEngine{
|
||||
"braveapi": &BraveAPIEngine{
|
||||
client: client,
|
||||
apiKey: braveAPIKey,
|
||||
accessGateToken: braveAccessToken,
|
||||
resultsPerPage: 20,
|
||||
},
|
||||
"brave": &BraveEngine{client: client},
|
||||
"qwant": &QwantEngine{
|
||||
client: client,
|
||||
category: "web-lite",
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ import (
|
|||
"github.com/metamorphosis-dev/kafka/internal/contracts"
|
||||
)
|
||||
|
||||
var defaultPortedEngines = []string{"wikipedia", "arxiv", "crossref", "braveapi", "qwant", "duckduckgo", "github", "reddit", "bing", "google", "youtube"}
|
||||
var defaultPortedEngines = []string{"wikipedia", "arxiv", "crossref", "braveapi", "brave", "qwant", "duckduckgo", "github", "reddit", "bing", "google", "youtube"}
|
||||
|
||||
type Planner struct {
|
||||
PortedSet map[string]bool
|
||||
|
|
@ -122,7 +122,7 @@ func inferFromCategories(categories []string) []string {
|
|||
out = append(out, e)
|
||||
}
|
||||
// stable order
|
||||
order := map[string]int{"wikipedia": 0, "braveapi": 1, "qwant": 2, "duckduckgo": 3, "bing": 4, "google": 5, "arxiv": 6, "crossref": 7, "github": 8, "reddit": 9, "youtube": 10}
|
||||
order := map[string]int{"wikipedia": 0, "braveapi": 1, "brave": 2, "qwant": 3, "duckduckgo": 4, "bing": 5, "google": 6, "arxiv": 7, "crossref": 8, "github": 9, "reddit": 10, "youtube": 11}
|
||||
sortByOrder(out, order)
|
||||
return out
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue