Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
71d0d56
add AssetExtractor interface as comment
AltayAkkus Jan 30, 2026
c61d204
refactor: INA AssetExtractor
AltayAkkus Jan 30, 2026
6043bc8
feat: INA extractor tests
AltayAkkus Jan 30, 2026
9a6a684
refactor: TruthSocial AssetExtractor
AltayAkkus Jan 30, 2026
2792def
feat: failing tests for TruthSocial extractor
AltayAkkus Jan 30, 2026
e1525f2
refactor: M3U8 AssetExtractor
AltayAkkus Jan 30, 2026
3f9d789
feat: failing tests for M3U8
AltayAkkus Jan 30, 2026
ad5fc09
fix: application/x-mpegURL content-type lookup
yzqzss Jan 31, 2026
4a9a1c5
fix: m3u8 extract signature
AltayAkkus Feb 2, 2026
dd404f1
refactor: JSON AssetExtractor
AltayAkkus Feb 2, 2026
82ea270
refactor: XML AssetExtractor
AltayAkkus Feb 2, 2026
b570518
refactor: HTML AssetExtractor
AltayAkkus Feb 2, 2026
0cba45f
feat: HTML extractor tests
AltayAkkus Feb 2, 2026
5b71f63
fix: add missing outlinks
AltayAkkus Feb 2, 2026
43a6589
feat: call asset extractors via unified interface
AltayAkkus Feb 22, 2026
cc5557c
fix: not used or referenced anywhere
AltayAkkus Feb 22, 2026
3ff149d
feat: add ItemFixture and HydrateItem for testing models.Item
AltayAkkus Feb 22, 2026
ee377a1
feat: add example dehydrated Item test for INA
AltayAkkus Feb 22, 2026
345e840
fix test
yzqzss Feb 27, 2026
b7ab022
Merge branch 'internetarchive:main' into refactor/asset-extractors
AltayAkkus Jun 10, 2026
6de5fee
Fix upgrade path from implicit v1 to explicit v2
AltayAkkus Jun 10, 2026
7f514e4
Merge branch 'main' into feat-item-test-fixture
AltayAkkus Jun 10, 2026
7f4dddc
Merge item test fixture into asset extractors branch
AltayAkkus Jun 10, 2026
c0d7b81
fix implicit v1 explicit v2 yada yada
AltayAkkus Jun 10, 2026
b8e9915
fix: disable truthsocial extractors & tests
AltayAkkus Jun 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 37 additions & 54 deletions internal/pkg/postprocessor/assets.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package postprocessor

import (
"fmt"
"net/url"
"slices"
"strings"
Expand All @@ -10,7 +11,8 @@ import (
"github.com/internetarchive/Zeno/v2/internal/pkg/postprocessor/extractor"
"github.com/internetarchive/Zeno/v2/internal/pkg/postprocessor/sitespecific/ina"
"github.com/internetarchive/Zeno/v2/internal/pkg/postprocessor/sitespecific/reddit"
"github.com/internetarchive/Zeno/v2/internal/pkg/postprocessor/sitespecific/truthsocial"

//"github.com/internetarchive/Zeno/v2/internal/pkg/postprocessor/sitespecific/truthsocial"
"github.com/internetarchive/Zeno/v2/pkg/models"
)

Expand All @@ -21,62 +23,44 @@ func ExtractAssetsOutlinks(item *models.Item) (assets, outlinks []*models.URL, e
return SanitizeAssetsOutlinks(item, assets, outlinks, err)
}

// AssetExtractor is the interface through which assets and outlinks are
// extracted from an item's body using the appropriate extractor.
// Order is important when several implementations are tried in turn: we want
// to check for more specific things first, as they may trigger more general
// extractors (e.g. HTML).
type AssetExtractor interface {
// Match reports whether this extractor should handle the given URL.
Match(*models.URL) bool
// Extract returns the assets and outlinks found for the item, together
// with any error encountered during extraction.
Extract(*models.Item) (assets, outlinks []*models.URL, err error)
}

// assetExtractors lists the available extractors in the order they are tried.
// Order matters: site-specific extractors are checked first, then
// general-purpose ones. The first match wins, so more specific
// extractors must precede broader ones (e.g. HTML).
var assetExtractors = []AssetExtractor{
ina.INAExtractor{},
// The TruthSocial extractor is currently disabled; its package import is
// commented out as well.
//truthsocial.TruthsocialExtractor{},
extractor.M3U8Extractor{},
extractor.JSONExtractor{},
extractor.XMLExtractor{},
extractor.HTMLAssetsExtractor{},
}

func Extractors(item *models.Item) (assets, outlinks []*models.URL, err error) {
logger := log.NewFieldedLogger(&log.Fields{
"component": "postprocessor.Extractors",
"item": item.GetShortID(),
})

switch {
case ina.IsAPIURL(item.GetURL()):
INAAssets, err := ina.ExtractMedias(item.GetURL())
if err != nil {
logger.Error("unable to extract medias from INA", "err", err.Error())
return assets, outlinks, err
}

HTMLAssets, err := extractor.HTMLAssets(item)
if err != nil {
logger.Error("unable to extract assets", "err", err.Error())
for _, ext := range assetExtractors {
// heavy debug log calls, can be omitted when merged
logger.Debug("AssetExtractor Match call", "url", item.GetURL())
if ext.Match(item.GetURL()) {
logger.Debug("matched extractor", "extractor", fmt.Sprintf("%T", ext))
assets, outlinks, err = ext.Extract(item)
logger.Debug("extraction result", "assets", len(assets), "outlinks", len(outlinks), "err", err)
if err != nil {
logger.Error("unable to extract assets", "err", err.Error())
}
return assets, outlinks, err
}
}

assets = append(INAAssets, HTMLAssets...)
case truthsocial.NeedExtraction(item.GetURL()):
assets, outlinks, err = truthsocial.ExtractAssets(item)
if err != nil {
logger.Error("unable to extract assets from TruthSocial", "err", err.Error())
return assets, outlinks, err
}
case extractor.IsM3U8(item.GetURL()):
assets, err = extractor.M3U8(item.GetURL())
if err != nil {
logger.Error("unable to extract assets", "err", err.Error())
return assets, outlinks, err
}
case extractor.IsJSON(item.GetURL()):
assets, outlinks, err = extractor.JSON(item.GetURL())
if err != nil {
logger.Error("unable to extract assets", "err", err.Error())
return assets, outlinks, err
}
case extractor.IsXML(item.GetURL()):
assets, outlinks, err = extractor.XML(item.GetURL())
if err != nil {
logger.Error("unable to extract assets", "err", err.Error())
return assets, outlinks, err
}
case extractor.IsHTML(item.GetURL()):
assets, err = extractor.HTMLAssets(item)
if err != nil {
logger.Error("unable to extract assets", "err", err.Error())
return assets, outlinks, err
}
case extractor.IsEmbeddedCSS(item):
// Embedded CSS is handled separately see PR discussion
if extractor.IsEmbeddedCSS(item) {
var atImportLinks []*models.URL
assets, atImportLinks, err = extractor.ExtractFromURLCSS(item.GetURL())

Expand All @@ -88,13 +72,12 @@ func Extractors(item *models.Item) (assets, outlinks []*models.URL, err error) {
logger.Debug("extracted assets from CSS", logArgs...)
}
extractor.AddAtImportLinksToItemChild(item, atImportLinks)
default:
contentType := item.GetURL().GetResponse().Header.Get("Content-Type")
logger.Debug("no extractor used for page", "content-type", contentType, "mime", item.GetURL().GetMIMEType().String())
return assets, outlinks, nil
return assets, outlinks, err
}

return assets, outlinks, err
contentType := item.GetURL().GetResponse().Header.Get("Content-Type")
logger.Debug("no extractor used for page", "content-type", contentType, "mime", item.GetURL().GetMIMEType().String())
return assets, outlinks, nil
}

func SanitizeAssetsOutlinks(item *models.Item, assets []*models.URL, outlinks []*models.URL, err error) ([]*models.URL, []*models.URL, error) {
Expand Down
31 changes: 30 additions & 1 deletion internal/pkg/postprocessor/assets_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,23 @@ package postprocessor

import (
"bytes"
_ "embed"
"io"
"net/http"
"os"
"testing"

"github.com/internetarchive/Zeno/v2/internal/pkg/config"
testutil "github.com/internetarchive/Zeno/v2/internal/pkg/postprocessor/testutil"
"github.com/internetarchive/Zeno/v2/pkg/models"
"github.com/internetarchive/gowarc/pkg/spooledtempfile"
)

//go:embed testdata/ina_api_response.json
var inaFixture []byte

func TestExtractAssets_HTML(t *testing.T) {
config.Set(&config.Config{})
config.InitConfig()
config.Get().DisableHTMLTag = []string{} // initialize as empty slice

// Create a mock response with a minimal HTML body
Expand Down Expand Up @@ -58,6 +63,30 @@ func TestExtractAssets_HTML(t *testing.T) {
}
}

// TestExtractAssets_HydrateItemFixture hydrates an item from the embedded INA
// API fixture, runs ExtractAssetsOutlinks on it, and checks that the resource
// URL contained in the fixture body shows up among the extracted assets.
func TestExtractAssets_HydrateItemFixture(t *testing.T) {
	const wantRaw = "https://example.com/video.mp4"

	config.InitConfig()
	item := testutil.HydrateItem(t, inaFixture)

	extracted, _, extractErr := ExtractAssetsOutlinks(item)
	if extractErr != nil {
		t.Fatalf("extract assets from fixture: %v", extractErr)
	}

	// The INA API fixture should yield at least resourceUrl,
	// resourceThumbnail, embed URL and uri; only emptiness is enforced here.
	if count := len(extracted); count < 1 {
		t.Errorf("expected at least one asset from INA fixture, got %d", count)
	}

	// Sanity check: the resource URL from the fixture body must be present.
	// Nil entries are skipped rather than dereferenced.
	hasWanted := func() bool {
		for _, candidate := range extracted {
			if candidate == nil {
				continue
			}
			if candidate.Raw == wantRaw {
				return true
			}
		}
		return false
	}
	if !hasWanted() {
		t.Errorf("expected asset https://example.com/video.mp4 in %v", extracted)
	}
}

func TestSanitizeAssetsOutlinks(t *testing.T) {
var err error
newURL, _ := models.NewURL("http://example.com")
Expand Down
11 changes: 11 additions & 0 deletions internal/pkg/postprocessor/extractor/html.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,17 @@ import (
"github.com/internetarchive/Zeno/v2/pkg/models"
)

// HTMLAssetsExtractor exposes the HTML asset extraction of this package
// through a Match/Extract method pair.
type HTMLAssetsExtractor struct{}

// Match reports whether IsHTML considers URL to be an HTML document.
func (HTMLAssetsExtractor) Match(URL *models.URL) bool {
	matched := IsHTML(URL)
	return matched
}

// Extract runs HTMLAssets on item. It only produces assets; the outlinks
// result is always nil.
func (HTMLAssetsExtractor) Extract(item *models.Item) (assets, outlinks []*models.URL, err error) {
	found, extractErr := HTMLAssets(item)
	return found, nil, extractErr
}

var (
onclickRegex = regexp.MustCompile(`window\.location(?:\.href)?\s*=\s*['"]([^'"]+)['"]`)
)
Expand Down
18 changes: 12 additions & 6 deletions internal/pkg/postprocessor/extractor/html_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ func TestHTMLAssetsAudioVideo(t *testing.T) {
</html>`
item := setupItem(html)

assets, err := HTMLAssets(item)
extractor := HTMLAssetsExtractor{}
assets, _, err := extractor.Extract(item)
if err != nil {
t.Errorf("HTMLAssets error = %v", err)
}
Expand All @@ -102,7 +103,8 @@ func TestHTMLAssetsAttributes(t *testing.T) {
</html>`
item := setupItem(html)

assets, err := HTMLAssets(item)
extractor := HTMLAssetsExtractor{}
assets, _, err := extractor.Extract(item)
if err != nil {
t.Errorf("HTMLAssets error = %v", err)
}
Expand Down Expand Up @@ -144,7 +146,8 @@ func TestHTMLAssetsMeta(t *testing.T) {
}
item := setupItem(html)

assets, err := HTMLAssets(item)
extractor := HTMLAssetsExtractor{}
assets, _, err := extractor.Extract(item)
if err != nil {
t.Errorf("HTMLAssets error = %v", err)
}
Expand All @@ -171,7 +174,8 @@ func TestSrcset(t *testing.T) {
</body>
</html>`
item := setupItem(html)
assets, err := HTMLAssets(item)
extractor := HTMLAssetsExtractor{}
assets, _, err := extractor.Extract(item)
if err != nil {
t.Errorf("Error extracting HTML assets %s", err)
}
Expand Down Expand Up @@ -225,7 +229,8 @@ func TestCSS(t *testing.T) {
</body>
</html>`
item := setupItem(html)
assets, err := HTMLAssets(item)
extractor := HTMLAssetsExtractor{}
assets, _, err := extractor.Extract(item)
if err != nil {
t.Errorf("Error extracting HTML assets %s", err)
}
Expand All @@ -247,7 +252,8 @@ func TestHTMLDataSrc(t *testing.T) {
</html>
`
item := setupItem(html)
assets, err := HTMLAssets(item)
extractor := HTMLAssetsExtractor{}
assets, _, err := extractor.Extract(item)
if err != nil {
t.Errorf("Error extracting HTML assets %s", err)
}
Expand Down
10 changes: 10 additions & 0 deletions internal/pkg/postprocessor/extractor/json.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,16 @@ import (
"github.com/internetarchive/Zeno/v2/pkg/models"
)

// JSONExtractor exposes the JSON extraction of this package through a
// Match/Extract method pair.
type JSONExtractor struct{}

// Match reports whether IsJSON considers URL to be a JSON document.
func (JSONExtractor) Match(URL *models.URL) bool {
	matched := IsJSON(URL)
	return matched
}

// Extract runs JSON on the item's URL and passes its assets, outlinks and
// error through unchanged.
func (JSONExtractor) Extract(item *models.Item) (assets, outlinks []*models.URL, err error) {
	assets, outlinks, err = JSON(item.GetURL())
	return assets, outlinks, err
}

// IsJSON reports whether URL has a MIME type whose string form contains
// "json". It returns false when no MIME type is available.
func IsJSON(URL *models.URL) bool {
	if URL.GetMIMEType() == nil {
		return false
	}
	mimeName := URL.GetMIMEType().String()
	return strings.Contains(mimeName, "json")
}
Expand Down
16 changes: 14 additions & 2 deletions internal/pkg/postprocessor/extractor/m3u8.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,21 @@ import (
"github.com/internetarchive/Zeno/v2/pkg/models"
)

// M3U8Extractor exposes the M3U8 playlist extraction of this package through
// a Match/Extract method pair.
type M3U8Extractor struct{}

// Match reports whether IsM3U8 considers URL to be an M3U8 playlist.
func (M3U8Extractor) Match(URL *models.URL) bool {
	matched := IsM3U8(URL)
	return matched
}

// Extract runs M3U8 on the item's URL. It only produces assets; the outlinks
// result is always nil.
func (M3U8Extractor) Extract(item *models.Item) (assets, outlinks []*models.URL, err error) {
	found, extractErr := M3U8(item.GetURL())
	return found, nil, extractErr
}

// IsM3U8 reports whether URL's MIME type is one of the M3U8 playlist types
// ("application/vnd.apple.mpegurl" or "application/x-mpegURL").
// It returns false when no MIME type is available.
func IsM3U8(URL *models.URL) bool {
	mt := URL.GetMIMEType()
	if mt == nil {
		// Guard first: without it, `nil-check && A || B` parses as
		// `(nil-check && A) || B` and B would be evaluated on a nil MIME type.
		return false
	}
	// TODO: https://github.com/gabriel-vasile/mimetype/pull/755 remove "application/x-mpegURL" when merged and released
	return mt.Is("application/vnd.apple.mpegurl") || mt.Is("application/x-mpegURL")
}

func M3U8(URL *models.URL) (assets []*models.URL, err error) {
Expand Down
48 changes: 48 additions & 0 deletions internal/pkg/postprocessor/extractor/m3u8_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package extractor

import (
"net/http"
"testing"

"github.com/internetarchive/Zeno/v2/pkg/models"
)

// TestShouldMatchM3U8URL checks M3U8Extractor.Match against URLs whose
// responses carry different Content-Type headers.
func TestShouldMatchM3U8URL(t *testing.T) {
	type matchCase struct {
		rawURL      string
		contentType string
		want        bool
	}

	table := []matchCase{
		{"https://sub.example.com/test.m3u8", "application/vnd.apple.mpegurl", true},
		// will be fixed by PR https://github.com/gabriel-vasile/mimetype/pull/755
		{"https://sub.example.com/test2.m3u8", "application/x-mpegURL", true},
		{"https://sub.example.com/test3.m3u8", "application/json", false},
		{"https://sub.example.com/example.html", "text/html", false},
		{"https://sub.example.com/m3u8.txt", "text/plain", false},
		{"https://sub.example.com/example.mp4", "application/octet-stream", false},
		{"https://sub.example.com/example.form", "application/x-www-form-urlencoded", false},
	}

	for _, tc := range table {
		t.Run(tc.rawURL, func(t *testing.T) {
			target, err := models.NewURL(tc.rawURL)
			if err != nil {
				t.Fatalf("failed to create URL: %v", err)
			}

			// Attach a body-less 200 response that only carries the
			// Content-Type under test.
			header := http.Header{}
			header.Set("Content-Type", tc.contentType)
			target.SetResponse(&http.Response{
				StatusCode: http.StatusOK,
				Header:     header,
			})

			got := M3U8Extractor{}.Match(&target)
			if got == tc.want {
				return
			}
			t.Errorf("M3U8Extractor.Match(%q) = %v, want %v: mimetype=%q", tc.rawURL, got, tc.want, target.GetMIMEType())
		})
	}
}

// TODO: Add test for Extract()
11 changes: 10 additions & 1 deletion internal/pkg/postprocessor/extractor/xml.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,16 @@ import (
"github.com/internetarchive/Zeno/v2/pkg/models"
)

// XMLExtractor exposes the XML extraction of this package through a
// Match/Extract method pair.
type XMLExtractor struct{}

// Match reports whether IsXML considers URL to be an XML document.
func (XMLExtractor) Match(URL *models.URL) bool {
	matched := IsXML(URL)
	return matched
}

// Extract runs XML on the item's URL and passes its assets, outlinks and
// error through unchanged.
func (XMLExtractor) Extract(item *models.Item) (assets, outlinks []*models.URL, err error) {
	assets, outlinks, err = XML(item.GetURL())
	return assets, outlinks, err
}

// xmlBufioReaderPool pools bufio.Reader instances for XML parsing to reduce allocations when processing many XML documents.
var xmlBufioReaderPool = sync.Pool{
New: func() any {
Expand Down Expand Up @@ -110,7 +120,6 @@ func XML(URL *models.URL) (assets, outlinks []*models.URL, err error) {
body.Reset(URL.GetBody())
defer xmlBufioReaderPool.Put(body)


// Peek to check if body has any non-whitespace content
peek, err := body.Peek(512) // peek up to 512 bytes
if err != nil && err != io.EOF {
Expand Down
1 change: 0 additions & 1 deletion internal/pkg/postprocessor/item.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ func postprocessItem(item *models.Item) []*models.Item {

if (item.GetURL().GetResponse() != nil && item.GetURL().GetResponse().StatusCode == 200) || // standard item
(item.GetURL().GetResponse() == nil && item.GetURL().GetBody() != nil) { // headless item
logger.Debug("item is a success")

var outlinksFromAssets []*models.URL

Expand Down
Loading
Loading