Module path now matches the GitHub mirror location. All internal imports updated across 35+ files.
137 lines
3.4 KiB
Go
137 lines
3.4 KiB
Go
package engines
|
|
|
|
import (
|
|
"io"
|
|
"net/url"
|
|
"strings"
|
|
|
|
"github.com/metamorphosis-dev/kafka/internal/contracts"
|
|
)
|
|
|
|
// parseDuckDuckGoHTML parses DuckDuckGo Lite's HTML response for search results.
|
|
// DDG Lite uses HTML tables with single-quoted class attributes and DDG tracking URLs.
|
|
func parseDuckDuckGoHTML(r io.Reader) ([]contracts.MainResult, error) {
|
|
body, err := io.ReadAll(r)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
content := string(body)
|
|
results := make([]contracts.MainResult, 0)
|
|
|
|
type parsedResult struct {
|
|
href string
|
|
title string
|
|
}
|
|
|
|
var parsedLinks []parsedResult
|
|
remaining := content
|
|
|
|
for {
|
|
// DDG uses single quotes: class='result-link'
|
|
idx := strings.Index(remaining, "class='result-link'")
|
|
if idx == -1 {
|
|
break
|
|
}
|
|
|
|
block := remaining[idx:]
|
|
|
|
// Extract href from the anchor.
|
|
href := extractAttr(block, "href")
|
|
if href == "" {
|
|
remaining = block[1:]
|
|
continue
|
|
}
|
|
|
|
// DDG wraps real URLs in tracking redirect: //duckduckgo.com/l/?uddg=ENCODED_URL
|
|
if strings.Contains(href, "duckduckgo.com/l/") || strings.Contains(href, "uddg=") {
|
|
if uddgIdx := strings.Index(href, "uddg="); uddgIdx != -1 {
|
|
encodedURL := href[uddgIdx+5:]
|
|
// Split on & to get just the URL (other params may follow)
|
|
if ampIdx := strings.Index(encodedURL, "&"); ampIdx != -1 {
|
|
encodedURL = encodedURL[:ampIdx]
|
|
}
|
|
if decoded, err := url.QueryUnescape(encodedURL); err == nil {
|
|
href = decoded
|
|
}
|
|
}
|
|
}
|
|
|
|
// Skip internal links.
|
|
if strings.HasPrefix(href, "/") || strings.HasPrefix(href, "//duckduckgo.com") {
|
|
remaining = block[1:]
|
|
continue
|
|
}
|
|
|
|
// Extract title — text between > and </a> after the class attribute.
|
|
titleStart := strings.Index(block, ">")
|
|
if titleStart == -1 {
|
|
remaining = block[1:]
|
|
continue
|
|
}
|
|
afterClass := block[titleStart+1:]
|
|
titleEnd := strings.Index(afterClass, "</a>")
|
|
if titleEnd == -1 {
|
|
remaining = block[1:]
|
|
continue
|
|
}
|
|
title := stripHTML(afterClass[:titleEnd])
|
|
title = htmlUnescape(title)
|
|
|
|
if title == "" {
|
|
remaining = block[titleStart+1+titleEnd:]
|
|
continue
|
|
}
|
|
|
|
parsedLinks = append(parsedLinks, parsedResult{
|
|
href: href,
|
|
title: title,
|
|
})
|
|
|
|
remaining = block[titleStart+1+titleEnd:]
|
|
}
|
|
|
|
// Extract snippets for each result.
|
|
for i, link := range parsedLinks {
|
|
snippet := ""
|
|
linkIdx := strings.Index(content, link.href)
|
|
if linkIdx == -1 {
|
|
// Try partial match (the href might be HTML-encoded in the source).
|
|
linkIdx = strings.Index(content, url.QueryEscape(link.href))
|
|
}
|
|
|
|
if linkIdx != -1 {
|
|
snippetRegion := content[linkIdx:]
|
|
if len(snippetRegion) > 2000 {
|
|
snippetRegion = snippetRegion[:2000]
|
|
}
|
|
|
|
// DDG uses single quotes: class='result-snippet'
|
|
snippetIdx := strings.Index(snippetRegion, "class='result-snippet'")
|
|
if snippetIdx != -1 {
|
|
snippetBlock := snippetRegion[snippetIdx:]
|
|
textStart := strings.Index(snippetBlock, ">")
|
|
if textStart != -1 {
|
|
textEnd := strings.Index(snippetBlock[textStart:], "</td>")
|
|
if textEnd != -1 {
|
|
snippet = stripHTML(snippetBlock[textStart+1 : textStart+textEnd])
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
urlPtr := link.href
|
|
results = append(results, contracts.MainResult{
|
|
Template: "default.html",
|
|
Title: link.title,
|
|
Content: snippet,
|
|
URL: &urlPtr,
|
|
Engine: "duckduckgo",
|
|
Score: float64(len(parsedLinks) - i),
|
|
Category: "general",
|
|
Engines: []string{"duckduckgo"},
|
|
})
|
|
}
|
|
|
|
return results, nil
|
|
}
|