Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions internal/pkg/postprocessor/extractor/object_storage.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package extractor

import (
"strings"

"github.com/internetarchive/Zeno/internal/pkg/utils"
"github.com/internetarchive/Zeno/pkg/models"
)

// All the supported object storage servers
var ObjectStorageServers = func() (s []string) {
s = append(s, s3CompatibleServers...)
s = append(s, azureServers...)
return s
}()

// IsObjectStorage checks if the response is from an object storage server
func IsObjectStorage(URL *models.URL) bool {
return utils.StringContainsSliceElements(URL.GetResponse().Header.Get("Server"), ObjectStorageServers) &&
strings.Contains(URL.GetResponse().Header.Get("Content-Type"), "/xml") // tricky match both application/xml and text/xml
Comment thread
yzqzss marked this conversation as resolved.
Outdated
Comment thread
yzqzss marked this conversation as resolved.
Outdated
}

// ObjectStorage decides which helper to call based on the object storage server
func ObjectStorage(URL *models.URL) ([]*models.URL, error) {
defer URL.RewindBody()

server := URL.GetResponse().Header.Get("Server")
if utils.StringContainsSliceElements(server, s3CompatibleServers) {
return s3Compatible(URL)
} else if utils.StringContainsSliceElements(server, azureServers) {
return azure(URL)
} else {
return nil, nil
}
}
80 changes: 80 additions & 0 deletions internal/pkg/postprocessor/extractor/object_storage_azure.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package extractor

import (
"encoding/xml"
"fmt"
"strings"

"github.com/internetarchive/Zeno/pkg/models"
)

// Azure Blob Storage
var azureServers = []string{
// Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0
"Windows-Azure-Blob",
// Blob Service Version 1.0 Microsoft-HTTPAPI/2.0
"Blob Service Version",
// emulator, https://github.com/Azure/Azurite
"Azurite-Blob",
}
Comment on lines +12 to +20

@yzqzss yzqzss May 15, 2025

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have tested Zeno with the azure extracor locally with Azurite and it works. But I haven't run it on a real Azure Blob yet.

Also, I found that some Azure Blobs use the Server: Blob Service Version XXX... header, and I don't know what the difference is between them and Server: Windows-Azure-Blob....

https://en.fofa.info/result?qbase64=aGVhZGVyPSJXaW5kb3dzLUF6dXJlLUJsb2Ii
https://en.fofa.info/result?qbase64=aGVhZGVyPSJCbG9iIFNlcnZpY2UgVmVyc2lvbiI%3D


// AZureBlobEnumerationResults represents the XML structure of an AZure Blob Storage listing
// <https://learn.microsoft.com/en-us/rest/api/storageservices/enumerating-blob-resources>
type AZureBlobEnumerationResults struct {
XMLName xml.Name `xml:"EnumerationResults"`
Prefix string `xml:"Prefix"`
Marker string `xml:"Marker"`
Blobs []AZureBlob `xml:"Blobs>Blob"`
NextMarker string `xml:"NextMarker"`
}

type AZureBlob struct {
Name string `xml:"Name"` // path/to/file.txt, no leading slash
LastModified string `xml:"Properties>Last-Modified"`
Size int64 `xml:"Properties>Content-Length"`
}

func azure(URL *models.URL) ([]*models.URL, error) {
defer URL.RewindBody()

// Decode XML result
var result AZureBlobEnumerationResults
if err := xml.NewDecoder(URL.GetBody()).Decode(&result); err != nil {
return nil, fmt.Errorf("error decoding AZureBlobEnumerationResults XML: %w", err)
}

reqURL := URL.GetRequest().URL

var outlinks []string

if result.NextMarker != "" {
nextURL := *reqURL
q := nextURL.Query()
q.Set("marker", result.NextMarker)
nextURL.RawQuery = q.Encode()
outlinks = append(outlinks, nextURL.String())
}

// Build base url for files
//
// reqURL: "https://{endpoint_host}/{account}/{bucket}?..."
// ->
// baseURL: "https://{endpoint_host}/{account}/{bucket}/
baseURL := *reqURL
baseURL.RawQuery = ""
baseURL.ForceQuery = false
if !strings.HasSuffix(baseURL.Path, "/") {
baseURL.Path = baseURL.Path + "/"
}

for _, blob := range result.Blobs {
fileURL := baseURL
if strings.HasPrefix(blob.Name, "/") {
panic("TODO")
Comment thread
yzqzss marked this conversation as resolved.
Outdated
Comment thread
yzqzss marked this conversation as resolved.
Outdated
}
fileURL.Path = baseURL.Path + blob.Name
Comment thread
yzqzss marked this conversation as resolved.
Outdated
outlinks = append(outlinks, fileURL.String())
Comment thread
yzqzss marked this conversation as resolved.
Outdated
}

return toURLs(outlinks), nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,15 @@ import (
"encoding/xml"
"fmt"
"net/url"
"strings"

"github.com/internetarchive/Zeno/internal/pkg/utils"
"github.com/internetarchive/Zeno/pkg/models"
)

var validS3Servers = []string{
var s3CompatibleServers = []string{
"AmazonS3",
"WasabiS3",
"UploadServer", // Google Cloud Storage
"Windows-Azure-Blob",
"AliyunOSS", // Alibaba Object Storage Service
"UploadServer", // Google Cloud Storage, https://cloud.google.com/storage/docs/listing-objects#list-objects-xml
"AliyunOSS", // Alibaba Object Storage Service
}

// S3ListBucketResult represents the XML structure of an S3 bucket listing
Expand All @@ -40,53 +37,8 @@ type CommonPrefix struct {
Prefix []string `xml:"Prefix"`
}

// IsS3 checks if the response is from an S3 server
func IsS3(URL *models.URL) bool {
return utils.StringContainsSliceElements(URL.GetResponse().Header.Get("Server"), validS3Servers) &&
strings.Contains(URL.GetResponse().Header.Get("Content-Type"), "xml")
}

// S3 decides which helper to call based on the query param: old style (no list-type=2) vs. new style (list-type=2)
func S3(URL *models.URL) ([]*models.URL, error) {
defer URL.RewindBody()

// Decode XML result
var result S3ListBucketResult
if err := xml.NewDecoder(URL.GetBody()).Decode(&result); err != nil {
return nil, fmt.Errorf("error decoding S3 XML: %v", err)
}

// Prepare base data
reqURL := URL.GetRequest().URL
listType := reqURL.Query().Get("list-type")

// Build https://<host> as the base for direct file links
baseStr := fmt.Sprintf("https://%s", reqURL.Host)
parsedBase, err := url.Parse(baseStr)
if err != nil {
return nil, fmt.Errorf("invalid base URL: %v", err)
}

var outlinkStrings []string

// Delegate to old style or new style
if listType != "2" {
// Old style S3 listing, uses marker
outlinkStrings = s3Legacy(reqURL, parsedBase, result)
} else {
// New style listing (list-type=2), uses continuation token and/or CommonPrefixes
outlinkStrings = s3V2(reqURL, parsedBase, result)
}

// Convert from []string -> []*models.URL
var outlinks []*models.URL
for _, link := range outlinkStrings {
outlinks = append(outlinks, &models.URL{Raw: link})
}
return outlinks, nil
}

// s3Legacy handles the old ListObjects style, which uses `marker` for pagination.
// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjects.html
func s3Legacy(reqURL *url.URL, parsedBase *url.URL, result S3ListBucketResult) []string {
var outlinks []string

Expand All @@ -113,6 +65,7 @@ func s3Legacy(reqURL *url.URL, parsedBase *url.URL, result S3ListBucketResult) [
}

// s3V2 handles the new ListObjectsV2 style, which uses `continuation-token` and can return CommonPrefixes.
// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html
func s3V2(reqURL *url.URL, parsedBase *url.URL, result S3ListBucketResult) []string {
var outlinks []string

Expand Down Expand Up @@ -150,3 +103,38 @@ func s3V2(reqURL *url.URL, parsedBase *url.URL, result S3ListBucketResult) []str

return outlinks
}

// s3Compatible decides which helper to call based on the query param: old style (no list-type=2) vs. new style (list-type=2)
func s3Compatible(URL *models.URL) ([]*models.URL, error) {
defer URL.RewindBody()

// Decode XML result
var result S3ListBucketResult
if err := xml.NewDecoder(URL.GetBody()).Decode(&result); err != nil {
return nil, fmt.Errorf("error decoding S3ListBucketResult XML: %w", err)
}

// Prepare base data
reqURL := URL.GetRequest().URL
listType := reqURL.Query().Get("list-type")

// Build https://<host> as the base for direct file links
baseStr := fmt.Sprintf("https://%s", reqURL.Host)
parsedBase, err := url.Parse(baseStr)
if err != nil {
return nil, fmt.Errorf("invalid base URL: %v", err)
}

var outlinks []string

// Delegate to old style or new style
if listType != "2" {
// Old style S3 listing, uses marker
outlinks = s3Legacy(reqURL, parsedBase, result)
} else {
// New style listing (list-type=2), uses continuation token and/or CommonPrefixes
outlinks = s3V2(reqURL, parsedBase, result)
}

return toURLs(outlinks), nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,40 +11,6 @@ import (
"github.com/internetarchive/gowarc/pkg/spooledtempfile"
)

// TestIsS3 checks the Server header for known S3 strings.
func TestIsS3(t *testing.T) {
tests := []struct {
name string
server string
want bool
}{
{"AmazonS3", "AmazonS3", true},
{"WasabiS3", "WasabiS3", true},
{"AliyunOSS", "AliyunOSS", true},
{"No match", "Apache", false},
{"Partial match", "Amazon", false},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Create a *models.URL with the response Server header set
URLObj := &models.URL{}

URLObj.SetResponse(&http.Response{
Header: http.Header{
"Server": []string{tt.server},
"Content-Type": []string{"text/xml"},
},
})

got := IsS3(URLObj)
if got != tt.want {
t.Errorf("IsS3(server=%q) = %v, want %v", tt.server, got, tt.want)
}
})
}
}

func TestS3(t *testing.T) {
// This subtest shows a scenario of a valid XML with a single object,
// and list-type != 2 => "marker" logic should be used.
Expand Down Expand Up @@ -79,7 +45,7 @@ func TestS3(t *testing.T) {

URLObj.SetBody(spooledTempFile)

outlinks, err := S3(URLObj)
outlinks, err := s3Compatible(URLObj)
if err != nil {
t.Fatalf("S3() returned unexpected error: %v", err)
}
Expand Down Expand Up @@ -124,7 +90,7 @@ func TestS3(t *testing.T) {

URLObj.SetBody(spooledTempFile)

outlinks, err := S3(URLObj)
outlinks, err := s3Compatible(URLObj)
if err != nil {
t.Fatalf("S3() returned unexpected error: %v", err)
}
Expand Down Expand Up @@ -159,7 +125,7 @@ func TestS3(t *testing.T) {

URLObj.SetBody(spooledTempFile)

outlinks, err := S3(URLObj)
outlinks, err := s3Compatible(URLObj)
if err == nil {
t.Fatalf("expected error for invalid XML, got none")
}
Expand Down
43 changes: 43 additions & 0 deletions internal/pkg/postprocessor/extractor/object_storage_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package extractor

import (
"net/http"
"testing"

"github.com/internetarchive/Zeno/pkg/models"
)

// TestIsObjectStorage checks the Server header for known OSS Server strings.
func TestIsObjectStorage(t *testing.T) {
tests := []struct {
name string
server string
want bool
}{
{"AmazonS3", "AmazonS3", true},
{"WasabiS3", "WasabiS3", true},
{"Azurite", "Azurite-Blob/3.34.0", true},
{"AliyunOSS", "AliyunOSS", true},
{"No match", "Apache", false},
{"Partial match", "Amazon", false},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Create a *models.URL with the response Server header set
URLObj := &models.URL{}

URLObj.SetResponse(&http.Response{
Header: http.Header{
"Server": []string{tt.server},
"Content-Type": []string{"text/xml"},
},
})

got := IsObjectStorage(URLObj)
if got != tt.want {
t.Errorf("IsObjectStorage(server=%q) = %v, want %v", tt.server, got, tt.want)
}
})
}
}
9 changes: 9 additions & 0 deletions internal/pkg/postprocessor/extractor/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,12 @@ func sortURLs(urls []*models.URL) {
return urls[i].Raw < urls[j].Raw
})
}

// Convert from []string -> []*models.URL
func toURLs(s []string) []*models.URL {
var outlinks []*models.URL
Comment thread
yzqzss marked this conversation as resolved.
Outdated
for _, link := range s {
outlinks = append(outlinks, &models.URL{Raw: link})
}
return outlinks
}
6 changes: 3 additions & 3 deletions internal/pkg/postprocessor/outlinks.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ func extractOutlinks(item *models.Item) (outlinks []*models.URL, err error) {
logger.Error("unable to extract outlinks", "extractor", "truthsocial.GenerateOutlinksURLsFromLookup", "err", err.Error(), "item", item.GetShortID(), "url", item.GetURL().String())
return outlinks, err
}
case extractor.IsS3(item.GetURL()):
outlinks, err = extractor.S3(item.GetURL())
case extractor.IsObjectStorage(item.GetURL()):
outlinks, err = extractor.ObjectStorage(item.GetURL())
if err != nil {
logger.Error("unable to extract outlinks from S3", "extractor", "S3", "err", err.Error(), "item", item.GetShortID(), "url", item.GetURL().String())
logger.Error("unable to extract outlinks from ObjectStorage", "extractor", "ObjectStorage", "err", err.Error(), "item", item.GetShortID(), "url", item.GetURL().String())
return outlinks, err
}
case extractor.IsSitemapXML(item.GetURL()):
Expand Down
Loading