kafka/internal/engines/duckduckgo_parse.go
Franz Kafka 7be03b4017 license: change from MIT to AGPLv3
Update LICENSE file and add AGPL header to all source files.

AGPLv3 ensures that if someone runs Kafka as a network service and
modifies it, they must release their source code under the same license.
2026-03-22 08:27:23 +00:00

153 lines
4.1 KiB
Go

// kafka — a privacy-respecting metasearch engine
// Copyright (C) 2026-present metamorphosis-dev
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package engines
import (
"io"
"net/url"
"strings"
"github.com/metamorphosis-dev/kafka/internal/contracts"
)
// parseDuckDuckGoHTML parses DuckDuckGo Lite's HTML response for search results.
//
// DDG Lite uses HTML tables with single-quoted class attributes and DDG
// tracking URLs. The document is scanned once: every anchor tagged
// class='result-link' yields a URL and a title, and the snippet is taken from
// the first class='result-snippet' cell that appears between that anchor and
// the next result link. Anchors without an href, with an internal (relative
// or protocol-relative) target, or with an empty title are skipped.
//
// Results are returned in document order. Score decreases by one per result,
// starting at the number of results, so earlier results rank higher. The
// returned slice is always non-nil when err is nil. The only error returned
// is a failure to read r.
func parseDuckDuckGoHTML(r io.Reader) ([]contracts.MainResult, error) {
	// DDG uses single quotes: class='result-link'
	const linkMarker = "class='result-link'"

	body, err := io.ReadAll(r)
	if err != nil {
		return nil, err
	}
	content := string(body)

	type parsedResult struct {
		href    string
		title   string
		snippet string
	}
	var parsed []parsedResult

	remaining := content
	for {
		idx := strings.Index(remaining, linkMarker)
		if idx == -1 {
			break
		}
		afterMarker := remaining[idx+len(linkMarker):]

		// Isolate the tag that carries the marker so attribute lookup cannot
		// leak into a neighbouring element. The href may sit on either side
		// of the class attribute inside the tag.
		// NOTE(review): DDG Lite is believed to emit href before class —
		// confirm against a live response.
		tagStart := strings.LastIndex(remaining[:idx], "<")
		if tagStart == -1 {
			tagStart = idx
		}
		gt := strings.Index(remaining[idx:], ">")
		if gt == -1 {
			// No tag is closed after this point, so nothing else can be parsed.
			break
		}
		tagEnd := idx + gt + 1
		tag := remaining[tagStart:tagEnd]

		href := extractAttr(tag, "href")
		if href == "" {
			remaining = afterMarker
			continue
		}

		// DDG wraps real URLs in tracking redirect: //duckduckgo.com/l/?uddg=ENCODED_URL
		href = unwrapDDGRedirect(href)

		// Skip internal links. The "/" prefix also covers protocol-relative
		// //duckduckgo.com redirects that could not be unwrapped. An empty
		// href (e.g. an empty uddg value) is not a usable result either.
		if href == "" || strings.HasPrefix(href, "/") {
			remaining = afterMarker
			continue
		}

		// Title is the text between the end of the opening tag and </a>.
		afterTag := remaining[tagEnd:]
		titleEnd := strings.Index(afterTag, "</a>")
		if titleEnd == -1 {
			// No anchor is closed after this point; later markers cannot
			// produce a title either.
			break
		}
		title := htmlUnescape(stripHTML(afterTag[:titleEnd]))
		rest := afterTag[titleEnd:]
		remaining = rest
		if title == "" {
			continue
		}

		// Everything up to the next result link belongs to this result, so
		// the snippet search is confined to that region. This keeps a result
		// without a snippet from borrowing its neighbour's.
		region := rest
		if next := strings.Index(rest, linkMarker); next != -1 {
			region = rest[:next]
		}

		parsed = append(parsed, parsedResult{
			href:    href,
			title:   title,
			snippet: extractDDGSnippet(region),
		})
	}

	// Kept non-nil even when empty, as before, so callers that distinguish a
	// nil slice from an empty one see no change.
	results := make([]contracts.MainResult, 0, len(parsed))
	for i, p := range parsed {
		href := p.href // fresh variable per iteration; its address is stored below
		results = append(results, contracts.MainResult{
			Template: "default.html",
			Title:    p.title,
			Content:  p.snippet,
			URL:      &href,
			Engine:   "duckduckgo",
			Score:    float64(len(parsed) - i),
			Category: "general",
			Engines:  []string{"duckduckgo"},
		})
	}
	return results, nil
}

// unwrapDDGRedirect returns the target URL carried in the uddg query
// parameter of a DuckDuckGo redirect link. The value is cut at the first "&"
// (other parameters may follow) and percent-decoded. If href has no uddg
// parameter, or the value cannot be decoded, href is returned unchanged.
func unwrapDDGRedirect(href string) string {
	const param = "uddg="

	i := strings.Index(href, param)
	if i == -1 {
		return href
	}
	encoded := href[i+len(param):]
	if amp := strings.Index(encoded, "&"); amp != -1 {
		encoded = encoded[:amp]
	}
	decoded, err := url.QueryUnescape(encoded)
	if err != nil {
		return href
	}
	return decoded
}

// extractDDGSnippet returns the text of the first class='result-snippet'
// table cell in region, with markup stripped and HTML entities decoded the
// same way as result titles. It returns "" when region holds no complete
// snippet cell.
func extractDDGSnippet(region string) string {
	// DDG uses single quotes: class='result-snippet'
	const snippetMarker = "class='result-snippet'"

	idx := strings.Index(region, snippetMarker)
	if idx == -1 {
		return ""
	}
	cell := region[idx+len(snippetMarker):]

	// Skip the remainder of the opening <td ...> tag.
	gt := strings.Index(cell, ">")
	if gt == -1 {
		return ""
	}
	text := cell[gt+1:]

	end := strings.Index(text, "</td>")
	if end == -1 {
		return ""
	}
	return htmlUnescape(stripHTML(text[:end]))
}