diff --git a/internal/pkg/preprocessor/exclusion.go b/internal/pkg/preprocessor/exclusion.go
index 7356995f..86d9bc26 100644
--- a/internal/pkg/preprocessor/exclusion.go
+++ b/internal/pkg/preprocessor/exclusion.go
@@ -1,12 +1,15 @@
 package preprocessor
 
 import (
-	"github.com/internetarchive/Zeno/internal/pkg/config"
+	"regexp"
+
 	"github.com/internetarchive/Zeno/pkg/models"
 )
 
-func matchRegexExclusion(item *models.Item) bool {
-	for _, exclusion := range config.Get().ExclusionRegexes {
+// matchRegexExclusion reports whether the item's URL string matches at least
+// one of the given exclusion regexes.
+func matchRegexExclusion(exclusionRegexes []*regexp.Regexp, item *models.Item) bool {
+	for _, exclusion := range exclusionRegexes {
 		if exclusion.MatchString(item.GetURL().String()) {
 			return true
 		}
diff --git a/internal/pkg/preprocessor/exclusion_test.go b/internal/pkg/preprocessor/exclusion_test.go
new file mode 100644
index 00000000..93cda19f
--- /dev/null
+++ b/internal/pkg/preprocessor/exclusion_test.go
@@ -0,0 +1,86 @@
+package preprocessor
+
+import (
+	"regexp"
+	"testing"
+
+	"github.com/google/uuid"
+	"github.com/internetarchive/Zeno/pkg/models"
+)
+
+// TestMatchRegexExclusion checks matchRegexExclusion against a fixed set of
+// exclusion patterns, covering URLs that must and must not be excluded.
+func TestMatchRegexExclusion(t *testing.T) {
+	exclusionRegex := []string{
+		`(?i)^https?://(www\.)?archive-it\.org.*`,
+		`(?i)^https?://(www\.)?x\.com.*`,
+		`^https?://127\.0\.`,
+		`^https?://192\.168\.`,
+		`(?i)https?://[^/]+/wp-admin/`,
+		`(?i)^(mailto|sms|tel|data|javascript):`,
+	}
+	regexps := make([]*regexp.Regexp, 0, len(exclusionRegex))
+	for _, r := range exclusionRegex {
+		re, err := regexp.Compile(r)
+		if err != nil {
+			t.Fatalf("Failed to compile regex %q: %v", r, err)
+		}
+		regexps = append(regexps, re)
+	}
+
+	tests := []struct {
+		name            string
+		itemURL         string
+		expectedMatched bool
+	}{
+		{
+			name:            "Match localhost IP",
+			itemURL:         "http://127.0.0.1/details/testitem",
+			expectedMatched: true,
+		},
+		{
+			name:            "Match x.com post with uppercase HTTPS scheme",
+			itemURL:         "HTTPS://x.com:/loukoumi07/status/1922747849671934061",
+			expectedMatched: true,
+		},
+		{
+			name:            "Match foo.com wp-admin",
+			itemURL:         "https://foo.com/wp-admin/something",
+			expectedMatched: true,
+		},
+		{
+			name:            "Match mailto: uppercase link",
+			itemURL:         "MAILTO:someone@foo.com",
+			expectedMatched: true,
+		},
+		{
+			name:            "Match tel: link",
+			itemURL:         "tel:0090567854",
+			expectedMatched: true,
+		},
+		{
+			name:            "No match for unlisted host",
+			itemURL:         "https://archive.org/details/testitem",
+			expectedMatched: false,
+		},
+		{
+			name:            "No match for wp-admin without trailing slash",
+			itemURL:         "https://something.org/details/wp-admintestitem",
+			expectedMatched: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			parsedURL := &models.URL{Raw: tt.itemURL}
+			// NOTE(review): the result of Parse is discarded here; confirm
+			// whether it reports an error that the test should fail on.
+			parsedURL.Parse()
+			item := models.NewItem(uuid.New().String(), parsedURL, "")
+			got := matchRegexExclusion(regexps, item)
+			if got != tt.expectedMatched {
+				t.Errorf("Expected match: %v, got: %v", tt.expectedMatched, got)
+			}
+		})
+	}
+}
diff --git a/internal/pkg/preprocessor/preprocessor.go b/internal/pkg/preprocessor/preprocessor.go
index 8e6cb718..68c38db4 100644
--- a/internal/pkg/preprocessor/preprocessor.go
+++ b/internal/pkg/preprocessor/preprocessor.go
@@ -200,7 +200,7 @@ func preprocess(workerID string, seed *models.Item) {
 			// Apply exclusion filters even if it passed inclusion
 			if utils.StringContainsSliceElements(items[i].GetURL().GetParsed().Host, config.Get().ExcludeHosts) ||
 				utils.StringContainsSliceElements(items[i].GetURL().String(), config.Get().ExcludeString) ||
-				matchRegexExclusion(items[i]) {
+				matchRegexExclusion(config.Get().ExclusionRegexes, items[i]) {
 				logger.Debug("URL excluded (matches exclusion filters)",
 					"item_id", items[i].GetShortID(),