// kafka/internal/engines/duckduckgo_parse.go

package engines

import (
	"io"
	"net/url"
	"strings"

	"github.com/ashie/gosearch/internal/contracts"
)

// parseDuckDuckGoHTML parses DuckDuckGo Lite's HTML response for search results.
//
// The page is scanned for anchors marked with class='result-link' (DDG Lite
// uses single-quoted class attributes). For each such anchor the function:
//
//   - reads the href from the anchor's own opening tag, so the attribute is
//     found whether it appears before or after the class attribute and is
//     never taken from a later element;
//   - unwraps DDG tracking redirects of the form
//     //duckduckgo.com/l/?uddg=ENCODED_URL;
//   - drops links that are still site-relative or protocol-relative, and
//     links with an empty URL or an empty title;
//   - looks for a class='result-snippet' cell located between this anchor
//     and the next result-link marker, so a result without a snippet cannot
//     pick up its neighbour's text.
//
// Results keep document order. Score is len(results)-index, so the first hit
// gets the highest score. The returned slice is non-nil even when no result
// is found. The only error returned is a failure to read r.
//
// NOTE(review): extractAttr, stripHTML and htmlUnescape are defined elsewhere
// in this package; this function assumes extractAttr(s, name) returns "" when
// the attribute is absent from s — confirm against their definitions.
func parseDuckDuckGoHTML(r io.Reader) ([]contracts.MainResult, error) {
	body, err := io.ReadAll(r)
	if err != nil {
		return nil, err
	}
	content := string(body)

	// DDG uses single quotes: class='result-link'
	const linkMarker = "class='result-link'"

	type parsedResult struct {
		href    string
		title   string
		snippet string
	}
	var parsedLinks []parsedResult

	// pos is the byte offset in content from which the next marker is searched.
	// It strictly increases on every iteration, so the loop terminates.
	pos := 0
	for {
		rel := strings.Index(content[pos:], linkMarker)
		if rel == -1 {
			break
		}
		markerIdx := pos + rel
		afterMarker := markerIdx + len(linkMarker)
		pos = afterMarker // default: resume right after this marker

		// Locate the '>' that closes the anchor's opening tag.
		closeRel := strings.Index(content[afterMarker:], ">")
		if closeRel == -1 {
			// No '>' anywhere after this marker, so no later marker can
			// yield a complete tag either.
			break
		}
		tagEnd := afterMarker + closeRel

		// Locate the '<' that opens the tag containing the marker. If the
		// marker is not inside an open tag (no '<' before it, or a '>' sits
		// in between), fall back to starting at the marker itself.
		tagStart := strings.LastIndex(content[:markerIdx], "<")
		if tagStart == -1 || strings.Contains(content[tagStart:markerIdx], ">") {
			tagStart = markerIdx
		}
		openTag := content[tagStart : tagEnd+1]

		href := extractAttr(openTag, "href")
		if href == "" {
			continue
		}
		href = unwrapDDGRedirect(href)

		// Skip empty and internal links. The "/" prefix also covers
		// protocol-relative URLs such as //duckduckgo.com/... that could
		// not be unwrapped.
		if href == "" || strings.HasPrefix(href, "/") {
			continue
		}

		// The title is the text between the opening tag and </a>.
		titleStart := tagEnd + 1
		endRel := strings.Index(content[titleStart:], "</a>")
		if endRel == -1 {
			continue
		}
		titleEnd := titleStart + endRel
		pos = titleEnd // never rescan text that belongs to this anchor

		title := strings.TrimSpace(htmlUnescape(stripHTML(content[titleStart:titleEnd])))
		if title == "" {
			continue
		}

		// Only the stretch of document up to the next result link can hold
		// this result's snippet.
		region := content[titleEnd:]
		if next := strings.Index(region, linkMarker); next != -1 {
			region = region[:next]
		}

		parsedLinks = append(parsedLinks, parsedResult{
			href:    href,
			title:   title,
			snippet: extractDDGLiteSnippet(region),
		})
	}

	// make (rather than a nil slice) keeps the "no results" return value
	// non-nil, as before.
	results := make([]contracts.MainResult, 0, len(parsedLinks))
	for i, link := range parsedLinks {
		urlPtr := link.href // fresh variable per iteration; its address is stored
		results = append(results, contracts.MainResult{
			Template: "default.html",
			Title:    link.title,
			Content:  link.snippet,
			URL:      &urlPtr,
			Engine:   "duckduckgo",
			Score:    float64(len(parsedLinks) - i),
			Category: "general",
			Engines:  []string{"duckduckgo"},
		})
	}

	return results, nil
}

// unwrapDDGRedirect returns the target URL carried in the uddg query parameter
// of href. If href has no uddg parameter, or its value cannot be
// percent-decoded, href is returned unchanged.
func unwrapDDGRedirect(href string) string {
	const param = "uddg="
	i := strings.Index(href, param)
	if i == -1 {
		return href
	}
	encoded := href[i+len(param):]
	// Other parameters may follow; cutting at '&' also handles an
	// HTML-escaped separator ("&amp;"), which starts with the same byte.
	if amp := strings.Index(encoded, "&"); amp != -1 {
		encoded = encoded[:amp]
	}
	decoded, err := url.QueryUnescape(encoded)
	if err != nil {
		return href
	}
	return decoded
}

// extractDDGLiteSnippet returns the text of the first class='result-snippet'
// cell found in region (everything between the end of that cell's opening tag
// and the following </td>), with markup stripped and entities unescaped the
// same way titles are. It returns "" when region holds no complete snippet cell.
func extractDDGLiteSnippet(region string) string {
	// DDG uses single quotes: class='result-snippet'
	const snippetMarker = "class='result-snippet'"

	start := strings.Index(region, snippetMarker)
	if start == -1 {
		return ""
	}
	cell := region[start+len(snippetMarker):]

	open := strings.Index(cell, ">")
	if open == -1 {
		return ""
	}
	text := cell[open+1:]

	end := strings.Index(text, "</td>")
	if end == -1 {
		return ""
	}
	return strings.TrimSpace(htmlUnescape(stripHTML(text[:end])))
}