Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/parser/rss.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
package parser

import (
"bytes"
"encoding/xml"
"io"
"path"
Expand Down Expand Up @@ -61,9 +62,15 @@ type rssEnclosure struct {
}

func ParseRSS(r io.Reader) (*Feed, error) {
body, err := io.ReadAll(r)
if err != nil {
return nil, err
}
body = stripRSSDefaultNamespace(body)

srcfeed := rssFeed{}

decoder := xmlDecoder(r)
decoder := xmlDecoder(bytes.NewReader(body))
decoder.DefaultSpace = "rss"
if err := decoder.Decode(&srcfeed); err != nil {
return nil, err
Expand Down
36 changes: 36 additions & 0 deletions src/parser/rss_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -327,3 +327,39 @@ func TestRSSMultipleMedia(t *testing.T) {
t.Fatal("invalid rss")
}
}

// TestRSSDefaultNamespace checks that feeds declaring a default namespace on
// the root <rss> element (e.g. the legacy Userland namespace) still parse —
// see sud.ua/rss/rss_news_uk.xml.
func TestRSSDefaultNamespace(t *testing.T) {
	have, err := Parse(strings.NewReader(`
<?xml version="1.0" encoding="utf-8"?>
<rss xmlns="http://backend.userland.com/rss2" version="2.0" xmlns:yandex="https://sud.ua/">
<channel>
<title>Example</title>
<link>https://example.com/</link>
<item>
<title>Title 1</title>
<link>https://example.com/news/1</link>
<description>Description 1</description>
</item>
</channel>
</rss>
`))
	// Surface the parser's own error instead of letting it show up only as a
	// nil-vs-struct mismatch in the comparison below.
	if err != nil {
		t.Fatalf("unexpected parse error: %v", err)
	}

	want := &Feed{
		Title:   "Example",
		SiteURL: "https://example.com/",
		Items: []Item{
			{
				GUID:    "https://example.com/news/1",
				URL:     "https://example.com/news/1",
				Title:   "Title 1",
				Content: "Description 1",
			},
		},
	}
	if !reflect.DeepEqual(want, have) {
		t.Logf("want: %#v", want)
		t.Logf("have: %#v", have)
		t.Fatal("default-namespaced rss not parsed")
	}
}
13 changes: 13 additions & 0 deletions src/parser/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,19 @@ func firstNonEmpty(vals ...string) string {

var linkRe = regexp.MustCompile(`(https?:\/\/\S+)`)

// RSS 2.0 elements live in no namespace, but some feeds (e.g. those declaring
// the legacy Userland namespace `http://backend.userland.com/rss2`) put every
// element in a default namespace. That prevents the namespaced struct tags in
// ParseRSS from matching, so items come back blank.
//
// rssDefaultNamespaceRe matches a default namespace declaration inside an
// opening <rss ...> tag. Capture group 1 holds the tag text that precedes the
// declaration, so replacing a match with `$1` drops only the declaration and
// the whitespace in front of it. XML allows attribute values in double or
// single quotes and optional whitespace around `=`, so all of those spellings
// are accepted. `xmlns:prefix=` never matches because `=` (optionally
// surrounded by whitespace) must follow `xmlns` directly, which leaves
// extension namespaces intact.
var rssDefaultNamespaceRe = regexp.MustCompile(`(<rss\b[^>]*?)\s+xmlns\s*=\s*(?:"[^"]*"|'[^']*')`)

// stripRSSDefaultNamespace returns a copy of body with the default namespace
// declaration removed from the <rss> start tag, so that its children fall back
// to the decoder's DefaultSpace. Input without such a declaration is returned
// with its content unchanged.
func stripRSSDefaultNamespace(body []byte) []byte {
	return rssDefaultNamespaceRe.ReplaceAll(body, []byte("$1"))
}

func plain2html(text string) string {
text = linkRe.ReplaceAllString(text, `<a href="$1">$1</a>`)
text = strings.ReplaceAll(text, "\n", "<br>")
Expand Down