fix: fix DDG and Bing parsers — verified with live tests
DuckDuckGo:
- Fix the parser to handle single-quoted class attributes (class='result-link')
- Decode DDG tracking URLs (uddg= parameter) to extract the real URLs
- Match snippet extraction to the actual DDG Lite HTML structure (</td> terminator)

Bing:
- Switch from HTML scraping (blocked by JS detection) to the RSS endpoint (?format=rss), which returns parseable XML
- Add JSON API response parsing as a fallback
- Return a graceful unresponsive_engines entry when blocked

Live test results:
- DuckDuckGo: 9 results ✅
- GitHub: 10 results (14,768 total) ✅
- Bing: 10 results via RSS ✅
- Reddit: skipped (403 from sandbox, needs browser-like context)
This commit is contained in:
parent
df8fe9474b
commit
a8ab29b23a
4 changed files with 186 additions and 157 deletions
|
|
@@ -71,10 +71,10 @@ func TestDuckDuckGoRegion(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestParseDuckDuckGoHTML(t *testing.T) {
|
||||
html := `<a class="result-link" href="https://example.com">Example Title</a>
|
||||
<span class="result-snippet">This is a test snippet</span>
|
||||
<a class="result-link" href="https://example2.com">Second Result</a>
|
||||
<span class="result-snippet">Another snippet here</span>`
|
||||
html := `<a class='result-link' href="https://example.com">Example Title</a>
|
||||
<td class='result-snippet'>This is a test snippet</td>
|
||||
<a class='result-link' href="https://example2.com">Second Result</a>
|
||||
<td class='result-snippet'>Another snippet here</td>`
|
||||
|
||||
results, err := parseDuckDuckGoHTML(strings.NewReader(html))
|
||||
if err != nil {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue