// kafka/internal/engines/html_helpers.go

// kafka — a privacy-respecting metasearch engine
// Copyright (C) 2026-present metamorphosis-dev
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

package engines

import (
	"strings"
)

// extractAttr returns the value of the first attr="value" or attr='value'
// occurrence in the HTML fragment s, or "" when neither form is present.
//
// The value ends at the next occurrence of the same quote character that
// opened it, so a single-quoted value may contain double quotes and vice
// versa. If the closing quote is missing, the remainder of s is returned.
//
// Matching is a plain substring search for attr followed by `="` or `='`:
// it does not tokenize the HTML, so attr also matches as the suffix of a
// longer attribute name (e.g. "src" matches inside `data-src="..."`).
func extractAttr(s, attr string) string {
	dq := strings.Index(s, attr+`="`)
	sq := strings.Index(s, attr+`='`)

	// Pick whichever quoted form appears first in s, and remember which
	// quote character has to terminate the value.
	var (
		idx   int
		quote byte
	)
	switch {
	case dq == -1 && sq == -1:
		return ""
	case sq == -1 || (dq != -1 && dq < sq):
		idx, quote = dq, '"'
	default:
		idx, quote = sq, '\''
	}

	// Skip the attribute name plus the two bytes `=` and the opening quote.
	rest := s[idx+len(attr)+2:]
	if end := strings.IndexByte(rest, quote); end != -1 {
		return rest[:end]
	}
	// Unterminated value: best effort, return everything that follows.
	return rest
}

// stripHTML removes HTML tags from s and returns the remaining text with
// leading and trailing whitespace trimmed.
//
// A tag is everything from a '<' up to and including the next '>'. A '>'
// that appears outside of any tag is ordinary text and is preserved (for
// example "1 > 0" stays "1 > 0"). A '<' with no matching '>' opens a tag
// that never closes, so all text after it is discarded.
//
// Entities such as "&amp;" are not decoded here.
func stripHTML(s string) string {
	var result strings.Builder
	result.Grow(len(s)) // output is never longer than the input for valid UTF-8
	inTag := false
	for _, r := range s {
		switch {
		case r == '<':
			inTag = true
		case r == '>' && inTag:
			// Only a '>' that closes an open tag is markup.
			inTag = false
		case !inTag:
			result.WriteRune(r)
		}
	}
	return strings.TrimSpace(result.String())
}

// htmlEntityReplacer decodes the small set of entities htmlUnescape supports.
// It is built once because a strings.Replacer is safe for concurrent use and
// performs all replacements in a single pass over the input.
var htmlEntityReplacer = strings.NewReplacer(
	"&amp;", "&",
	"&lt;", "<",
	"&gt;", ">",
	"&quot;", "\"",
	"&#39;", "'",
	"&nbsp;", " ",
)

// htmlUnescape decodes a basic set of HTML entities in s: &amp;, &lt;, &gt;,
// &quot;, &#39; and &nbsp; (the latter becomes a regular space). Any other
// entity is left untouched.
//
// Decoding happens in one pass, so the output of one replacement is never
// re-scanned: "&amp;lt;" decodes to the literal text "&lt;", not to "<".
func htmlUnescape(s string) string {
	return htmlEntityReplacer.Replace(s)
}

// extractImgSrc returns the src attribute value of the first <img ...> tag in
// html that has a non-empty src, or "" if there is none.
//
// Each candidate tag is bounded at its first '>' before the attribute lookup,
// so a src belonging to a later, non-image element (e.g. <script src="...">)
// is never returned. An <img tag with no closing '>' is searched through to
// the end of the input.
//
// The lookup is delegated to extractAttr, which is a substring match; this is
// not a full HTML parser.
func extractImgSrc(html string) string {
	rest := html
	for {
		idx := strings.Index(rest, "<img")
		if idx == -1 {
			return ""
		}
		tag := rest[idx:]
		if end := strings.IndexByte(tag, '>'); end != -1 {
			// Continue after this tag on the next iteration; end is at
			// least len("<img"), so rest strictly shrinks.
			rest = tag[end+1:]
			tag = tag[:end]
		} else {
			// Unterminated tag: nothing is left to scan afterwards.
			rest = ""
		}
		if src := extractAttr(tag, "src"); src != "" {
			return src
		}
	}
}