samsa/internal/engines/duckduckgo_parse.go
Franz Kafka 8e9aae062b
Some checks failed
Build and Push Docker Image / build-and-push (push) Failing after 11s
Mirror to GitHub / mirror (push) Failing after 5s
Tests / test (push) Successful in 42s
rename: kafka → samsa
Full project rename from kafka to samsa (after Gregor Samsa, who
woke one morning from uneasy dreams to find himself transformed).

- Module: github.com/metamorphosis-dev/kafka → samsa
- Binary: cmd/kafka/ → cmd/samsa/
- CSS: kafka.css → samsa.css
- UI: all 'kafka' product names, titles, localStorage keys → samsa
- localStorage keys: kafka-theme → samsa-theme, kafka-engines → samsa-engines
- OpenSearch: ShortName, LongName, description, URLs updated
- AGPL headers: 'kafka' → 'samsa'
- Docs, configs, examples updated
- Cache key prefix: kafka: → samsa:
2026-03-22 23:44:55 +00:00

153 lines
4.1 KiB
Go

// samsa — a privacy-respecting metasearch engine
// Copyright (C) 2026-present metamorphosis-dev
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package engines
import (
"io"
"net/url"
"strings"
"github.com/metamorphosis-dev/samsa/internal/contracts"
)
// parseDuckDuckGoHTML extracts search results from a DuckDuckGo Lite HTML page.
//
// DDG Lite lays results out in HTML tables. Each result has an anchor marked
// with a single-quoted class='result-link' attribute that carries the title and
// target, and may be followed by a cell marked class='result-snippet'. Targets
// wrapped in DDG's tracking redirect (…/l/?uddg=ENCODED_URL) are unwrapped so
// the real destination is reported.
//
// The whole body of r is read into memory. The only error returned is a read
// error from r; anchors that cannot be parsed (no href, site-relative target,
// unterminated tag, empty title) are skipped. The returned slice is non-nil
// even when no results are found, and Score decreases with page position so
// the first result on the page scores highest.
func parseDuckDuckGoHTML(r io.Reader) ([]contracts.MainResult, error) {
	body, err := io.ReadAll(r)
	if err != nil {
		return nil, err
	}
	content := string(body)

	const (
		// DDG Lite writes this attribute with single quotes.
		linkMarker = "class='result-link'"
		// maxSnippetDistance caps how many bytes past an anchor are scanned
		// for its snippet cell.
		maxSnippetDistance = 2000
	)

	type parsedResult struct {
		href  string
		title string
		pos   int // byte offset in content of the anchor's class attribute
	}
	var parsedLinks []parsedResult

	// Pass 1: collect every usable result anchor together with its position.
	offset := 0
	for {
		idx := strings.Index(content[offset:], linkMarker)
		if idx == -1 {
			break
		}
		pos := offset + idx
		block := content[pos:]
		// Default resume point for a skipped anchor: one byte past the start
		// of this marker, so the next search lands on the following anchor.
		offset = pos + 1

		href := unwrapDuckDuckGoRedirect(extractAttr(block, "href"))
		// Drop anchors without a target and site-relative links. The "/"
		// prefix also covers protocol-relative //duckduckgo.com/... links,
		// including redirects whose payload could not be decoded.
		if href == "" || strings.HasPrefix(href, "/") {
			continue
		}

		// The title is the text between the end of the opening tag and </a>.
		titleStart := strings.Index(block, ">")
		if titleStart == -1 {
			continue
		}
		inner := block[titleStart+1:]
		titleEnd := strings.Index(inner, "</a>")
		if titleEnd == -1 {
			continue
		}
		// The anchor is well-formed, so resume after its text whether or not
		// the title turns out to be usable.
		offset = pos + titleStart + 1 + titleEnd

		title := htmlUnescape(stripHTML(inner[:titleEnd]))
		if title == "" {
			continue
		}
		parsedLinks = append(parsedLinks, parsedResult{
			href:  href,
			title: title,
			pos:   pos,
		})
	}

	// Pass 2: attach snippets and build the results.
	results := make([]contracts.MainResult, 0, len(parsedLinks))
	for i, link := range parsedLinks {
		// Search for the snippet only between this anchor and the next result
		// anchor (or the distance cap, whichever comes first). Anchoring on
		// the recorded position instead of re-searching the document for the
		// URL keeps a result from picking up another result's snippet when
		// URLs repeat, share a prefix, or the result has no snippet at all.
		end := link.pos + maxSnippetDistance
		if end > len(content) {
			end = len(content)
		}
		after := link.pos + len(linkMarker)
		if next := strings.Index(content[after:end], linkMarker); next != -1 {
			end = after + next
		}
		snippet := extractDuckDuckGoSnippet(content[after:end])

		// Fresh variable per iteration so each result owns its own pointer.
		href := link.href
		results = append(results, contracts.MainResult{
			Template: "default.html",
			Title:    link.title,
			Content:  snippet,
			URL:      &href,
			Engine:   "duckduckgo",
			Score:    float64(len(parsedLinks) - i),
			Category: "general",
			Engines:  []string{"duckduckgo"},
		})
	}
	return results, nil
}

// unwrapDuckDuckGoRedirect returns the destination carried in the uddg query
// parameter of a DDG tracking redirect. An href without that parameter, or one
// whose payload cannot be URL-decoded, is returned unchanged.
func unwrapDuckDuckGoRedirect(href string) string {
	const param = "uddg="
	start := strings.Index(href, param)
	if start == -1 {
		return href
	}
	encoded := href[start+len(param):]
	// Other parameters may follow the URL; cutting at '&' also handles the
	// HTML-escaped separator "&amp;".
	if amp := strings.Index(encoded, "&"); amp != -1 {
		encoded = encoded[:amp]
	}
	decoded, err := url.QueryUnescape(encoded)
	if err != nil {
		return href
	}
	return decoded
}

// extractDuckDuckGoSnippet returns the text of the first class='result-snippet'
// cell in region, with markup stripped and entities decoded the same way as
// result titles. It returns "" when region holds no complete snippet cell.
func extractDuckDuckGoSnippet(region string) string {
	// DDG Lite writes this attribute with single quotes.
	const snippetMarker = "class='result-snippet'"
	idx := strings.Index(region, snippetMarker)
	if idx == -1 {
		return ""
	}
	cell := region[idx:]
	open := strings.Index(cell, ">")
	if open == -1 {
		return ""
	}
	text := cell[open+1:]
	end := strings.Index(text, "</td>")
	if end == -1 {
		return ""
	}
	return htmlUnescape(stripHTML(text[:end]))
}