From 762277688fdc1c7aad85f9bf0f2bfccc0a9cc65e Mon Sep 17 00:00:00 2001
From: f100024
Date: Fri, 12 Jun 2026 09:50:24 +0300
Subject: [PATCH] Add support legacy rss format;

---
 src/parser/rss.go      |  9 ++++++++-
 src/parser/rss_test.go | 36 ++++++++++++++++++++++++++++++++++++
 src/parser/util.go     | 13 +++++++++++++
 3 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/src/parser/rss.go b/src/parser/rss.go
index 1470bfcfb..b629ba3e2 100644
--- a/src/parser/rss.go
+++ b/src/parser/rss.go
@@ -5,6 +5,7 @@
 package parser
 
 import (
+	"bytes"
 	"encoding/xml"
 	"io"
 	"path"
@@ -61,9 +62,15 @@ type rssEnclosure struct {
 }
 
 func ParseRSS(r io.Reader) (*Feed, error) {
+	body, err := io.ReadAll(r)
+	if err != nil {
+		return nil, err
+	}
+	body = stripRSSDefaultNamespace(body)
+
 	srcfeed := rssFeed{}
 
-	decoder := xmlDecoder(r)
+	decoder := xmlDecoder(bytes.NewReader(body))
 	decoder.DefaultSpace = "rss"
 	if err := decoder.Decode(&srcfeed); err != nil {
 		return nil, err
diff --git a/src/parser/rss_test.go b/src/parser/rss_test.go
index df7b981ae..169f3ccab 100644
--- a/src/parser/rss_test.go
+++ b/src/parser/rss_test.go
@@ -327,3 +327,39 @@ func TestRSSMultipleMedia(t *testing.T) {
 		t.Fatal("invalid rss")
 	}
 }
+
+// Feeds that declare a default namespace on the root element (e.g. the
+// legacy Userland namespace) must still parse — see sud.ua/rss/rss_news_uk.xml.
+func TestRSSDefaultNamespace(t *testing.T) {
+	have, _ := Parse(strings.NewReader(`
+		<?xml version="1.0"?>
+		<rss version="2.0" xmlns="http://backend.userland.com/rss2">
+			<channel>
+				<title>Example</title>
+				<link>https://example.com/</link>
+				<item>
+					<title>Title 1</title>
+					<link>https://example.com/news/1</link>
+					<description>Description 1</description>
+				</item>
+			</channel>
+		</rss>
+	`))
+	want := &Feed{
+		Title:   "Example",
+		SiteURL: "https://example.com/",
+		Items: []Item{
+			{
+				GUID:    "https://example.com/news/1",
+				URL:     "https://example.com/news/1",
+				Title:   "Title 1",
+				Content: "Description 1",
+			},
+		},
+	}
+	if !reflect.DeepEqual(want, have) {
+		t.Logf("want: %#v", want)
+		t.Logf("have: %#v", have)
+		t.Fatal("default-namespaced rss not parsed")
+	}
+}
diff --git a/src/parser/util.go b/src/parser/util.go
index ca1aa38c3..911319282 100644
--- a/src/parser/util.go
+++ b/src/parser/util.go
@@ -23,6 +23,19 @@ func firstNonEmpty(vals ...string) string {
 
 var linkRe = regexp.MustCompile(`(https?:\/\/\S+)`)
 
+// RSS 2.0 elements live in no namespace, but some feeds (e.g. those declaring
+// the legacy Userland namespace `http://backend.userland.com/rss2`) put every
+// element in a default namespace. That prevents the namespaced struct tags in
+// ParseRSS from matching, so items come back blank. Strip the default namespace
+// declaration from the root element so its children fall back to the
+// decoder's DefaultSpace. Only a default `xmlns=` is matched (not
+// `xmlns:prefix=`), leaving extension namespaces intact.
+var rssDefaultNamespaceRe = regexp.MustCompile(`(<rss\b[^>]*?)\s+xmlns="[^"]*"`)
+
+func stripRSSDefaultNamespace(body []byte) []byte {
+	return rssDefaultNamespaceRe.ReplaceAll(body, []byte("$1"))
+}
+
 func plain2html(text string) string {
 	text = linkRe.ReplaceAllString(text, `<a href="$1">$1</a>`)
 	text = strings.ReplaceAll(text, "\n", "<br>")