Skip to content

Commit 901f297

Browse files
authored
Make Parser thread-safe (use new goxpp's base:xml state) (#202)
* Add a test to detect race conditions with Parser * Make Parser thread-safe (use new goxpp's base:xml state) This changeset implements xml:base resolution based on state maintained by the goxxp parser instead of using a shared urlStack. This allows Parser to be safely used concurrently. Depends on: mmcdole/goxpp#9 To test: `go test -race` * Depend on goxpp v1.1.0 Required for new thread-safe xml:base handling. * Fix ResolveHTML I had changed the name of UrlStack to BaseStack in the goxxp implementation.
1 parent 20c6436 commit 901f297

File tree

8 files changed

+115
-182
lines changed

8 files changed

+115
-182
lines changed

atom/parser.go

Lines changed: 14 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,35 +14,23 @@ import (
1414
var (
1515
// Atom elements which contain URIs
1616
// https://tools.ietf.org/html/rfc4287
17-
uriElements = map[string]bool{
17+
atomUriElements = map[string]bool{
1818
"icon": true,
1919
"id": true,
2020
"logo": true,
2121
"uri": true,
2222
"url": true, // atom 0.3
2323
}
24-
25-
// Atom attributes which contain URIs
26-
// https://tools.ietf.org/html/rfc4287
27-
atomURIAttrs = map[string]bool{
28-
"href": true,
29-
"scheme": true,
30-
"src": true,
31-
"uri": true,
32-
}
3324
)
3425

3526
// Parser is an Atom Parser
36-
type Parser struct {
37-
base *shared.XMLBase
38-
}
27+
type Parser struct{}
3928

4029
// Parse parses an xml feed into an atom.Feed
4130
func (ap *Parser) Parse(feed io.Reader) (*Feed, error) {
4231
p := xpp.NewXMLPullParser(feed, false, shared.NewReaderLabel)
43-
ap.base = &shared.XMLBase{URIAttrs: atomURIAttrs}
4432

45-
_, err := ap.base.FindRoot(p)
33+
_, err := shared.FindRoot(p)
4634
if err != nil {
4735
return nil, err
4836
}
@@ -67,7 +55,7 @@ func (ap *Parser) parseRoot(p *xpp.XMLPullParser) (*Feed, error) {
6755
extensions := ext.Extensions{}
6856

6957
for {
70-
tok, err := ap.base.NextTag(p)
58+
tok, err := shared.NextTag(p)
7159
if err != nil {
7260
return nil, err
7361
}
@@ -221,7 +209,7 @@ func (ap *Parser) parseEntry(p *xpp.XMLPullParser) (*Entry, error) {
221209
extensions := ext.Extensions{}
222210

223211
for {
224-
tok, err := ap.base.NextTag(p)
212+
tok, err := shared.NextTag(p)
225213
if err != nil {
226214
return nil, err
227215
}
@@ -376,7 +364,7 @@ func (ap *Parser) parseSource(p *xpp.XMLPullParser) (*Source, error) {
376364
extensions := ext.Extensions{}
377365

378366
for {
379-
tok, err := ap.base.NextTag(p)
367+
tok, err := shared.NextTag(p)
380368
if err != nil {
381369
return nil, err
382370
}
@@ -534,7 +522,7 @@ func (ap *Parser) parsePerson(name string, p *xpp.XMLPullParser) (*Person, error
534522
person := &Person{}
535523

536524
for {
537-
tok, err := ap.base.NextTag(p)
525+
tok, err := shared.NextTag(p)
538526
if err != nil {
539527
return nil, err
540528
}
@@ -684,7 +672,7 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
684672
if strings.Contains(result, "<![CDATA[") {
685673
result = shared.StripCDATA(result)
686674
if lowerType == "html" || strings.Contains(lowerType, "xhtml") {
687-
result, _ = ap.base.ResolveHTML(result)
675+
result, _ = shared.ResolveHTML(p, result)
688676
}
689677
} else {
690678
// decode non-CDATA contents depending on type
@@ -695,12 +683,12 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
695683
result, err = shared.DecodeEntities(result)
696684
} else if strings.Contains(lowerType, "xhtml") {
697685
result = ap.stripWrappingDiv(result)
698-
result, _ = ap.base.ResolveHTML(result)
686+
result, _ = shared.ResolveHTML(p, result)
699687
} else if lowerType == "html" {
700688
result = ap.stripWrappingDiv(result)
701689
result, err = shared.DecodeEntities(result)
702690
if err == nil {
703-
result, _ = ap.base.ResolveHTML(result)
691+
result, _ = shared.ResolveHTML(p, result)
704692
}
705693
} else {
706694
decodedStr, err := base64.StdEncoding.DecodeString(result)
@@ -712,10 +700,10 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
712700

713701
// resolve relative URIs in URI-containing elements according to xml:base
714702
name := strings.ToLower(p.Name)
715-
if uriElements[name] {
716-
resolved, err := ap.base.ResolveURL(result)
717-
if err == nil {
718-
result = resolved
703+
if atomUriElements[name] {
704+
resolved, err := p.XmlBaseResolveUrl(result)
705+
if resolved != nil && err == nil {
706+
result = resolved.String()
719707
}
720708
}
721709

detector.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,7 @@ func DetectFeedType(feed io.Reader) FeedType {
5454
// Check if it's an XML based feed
5555
p := xpp.NewXMLPullParser(bytes.NewReader(buffer.Bytes()), false, shared.NewReaderLabel)
5656

57-
xmlBase := shared.XMLBase{}
58-
_, err := xmlBase.FindRoot(p)
57+
_, err := shared.FindRoot(p)
5958
if err != nil {
6059
return FeedTypeUnknown
6160
}

go.mod

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ go 1.19
55
require (
66
github.com/PuerkitoBio/goquery v1.8.0
77
github.com/json-iterator/go v1.1.12
8-
github.com/mmcdole/goxpp v0.0.0-20200921145534-2f3784f67354
9-
github.com/stretchr/testify v1.3.0
8+
github.com/mmcdole/goxpp v1.1.0
9+
github.com/stretchr/testify v1.8.1
1010
github.com/urfave/cli v1.22.3
1111
golang.org/x/net v0.4.0
1212
golang.org/x/text v0.5.0
@@ -21,4 +21,5 @@ require (
2121
github.com/pmezard/go-difflib v1.0.0 // indirect
2222
github.com/russross/blackfriday/v2 v2.0.1 // indirect
2323
github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect
24+
gopkg.in/yaml.v3 v3.0.1 // indirect
2425
)

go.sum

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs
1111
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
1212
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
1313
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
14-
github.com/mmcdole/goxpp v0.0.0-20200921145534-2f3784f67354 h1:Z6i7ND25ixRtXFBylIUggqpvLMV1I15yprcqMVB7WZA=
15-
github.com/mmcdole/goxpp v0.0.0-20200921145534-2f3784f67354/go.mod h1:pasqhqstspkosTneA62Nc+2p9SOBBYAPbnmRRWPQ0V8=
14+
github.com/mmcdole/goxpp v1.1.0 h1:WwslZNF7KNAXTFuzRtn/OKZxFLJAAyOA9w82mDz2ZGI=
15+
github.com/mmcdole/goxpp v1.1.0/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8=
1616
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
1717
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
1818
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@@ -25,8 +25,13 @@ github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQD
2525
github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=
2626
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
2727
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
28-
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
28+
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
29+
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
2930
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
31+
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
32+
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
33+
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
34+
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
3035
github.com/urfave/cli v1.22.3 h1:FpNT6zq26xNpHZy08emi755QwzLPs6Pukqjlc7RfOMU=
3136
github.com/urfave/cli v1.22.3/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
3237
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
@@ -39,5 +44,9 @@ golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
3944
golang.org/x/text v0.5.0 h1:OLmvp0KP+FVG99Ct/qFiL/Fhk4zp4QQnZ7b2U+5piUM=
4045
golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
4146
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
47+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
4248
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
4349
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
50+
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
51+
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
52+
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

internal/shared/parseutils.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package shared
33
import (
44
"bytes"
55
"errors"
6+
"fmt"
67
"html"
78
"regexp"
89
"strings"
@@ -23,6 +24,26 @@ var (
2324
const CDATA_START = "<![CDATA["
2425
const CDATA_END = "]]>"
2526

27+
// FindRoot iterates through the tokens of an xml document until
28+
// it encounters its first StartTag event. It returns an error
29+
// if it reaches EndDocument before finding a tag.
30+
func FindRoot(p *xpp.XMLPullParser) (event xpp.XMLEventType, err error) {
31+
for {
32+
event, err = p.Next()
33+
if err != nil {
34+
return event, err
35+
}
36+
if event == xpp.StartTag {
37+
break
38+
}
39+
40+
if event == xpp.EndDocument {
41+
return event, fmt.Errorf("Failed to find root node before document end.")
42+
}
43+
}
44+
return
45+
}
46+
2647
// ParseText is a helper function for parsing the text
2748
// from the current element of the XMLPullParser.
2849
// This function can handle parsing naked XML text from

0 commit comments

Comments
 (0)