Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions bin/expire_backups.pl
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,20 @@

my $dry_run = 0; # -d
my $storage_name = undef; # -s
my $workers = 1;
my $help = 0;

GetOptions(
'dry-run|d' => \$dry_run,
'storage|s=s' => \$storage_name,
'workers|w=i' => \$workers,
'help|?' => \$help
) or pod2usage(2);
pod2usage(1) if $help;

my $exp = HTFeed::BackupExpiration->new(storage_name => $storage_name, dry_run => $dry_run);
$workers = 1 if $workers < 1;

my $exp = HTFeed::BackupExpiration->new(storage_name => $storage_name, dry_run => $dry_run, max_workers => $workers);
$exp->run();

__END__
Expand All @@ -32,8 +36,8 @@ =head1 NAME

=head1 SYNOPSIS

expire_backups.pl [--dry-run] -s STORAGE_NAME
expire_backups.pl [--dry-run] [--workers WORKER_COUNT] -s STORAGE_NAME

STORAGE_NAME - storage class name matched against feed_backups.storage_name

WORKER_COUNT - maximum number of subprocesses to spawn
=cut
58 changes: 58 additions & 0 deletions bin/expire_versions.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/perl

use warnings;
use strict;

use FindBin;
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;
use YAML::XS;

use lib "$FindBin::Bin/../lib";
use HTFeed::BackupExpirationBatch;
use HTFeed::Log { root_logger => 'INFO, screen' };

my $dry_run = 0; # -d
my $storage_config_file = undef; # -c
my $storage_name = undef; # -s
my $help = 0;

GetOptions(
'config|c=s' => \$storage_config_file,
'dry-run|d' => \$dry_run,
'storage|s=s' => \$storage_name,
'help|?' => \$help
) or pod2usage(2);
pod2usage(1) if $help;

if (scalar @ARGV != 1) {
die "path to job file is required";
}

my $storage_config = undef;
if ($storage_config_file) {
$storage_config = YAML::XS::LoadFile($storage_config_file);
}

my $exp = HTFeed::BackupExpirationBatch->new(
dry_run => $dry_run,
job_file => $ARGV[0],
storage_config => $storage_config,
storage_name => $storage_name
);
$exp->run();

__END__

=head1 NAME

expire_versions.pl - remove a batch of superseded material from backup storage.

=head1 SYNOPSIS

expire_versions.pl [--config STORAGE_CONFIG_FILE] [--dry-run] -s STORAGE_NAME PATH_TO_JOB_FILE

STORAGE_CONFIG_FILE - path to a YAML file with config hash for STORAGE_NAME
STORAGE_NAME - storage class name matched against feed_backups.storage_name
PATH_TO_JOB_FILE - path to a TSV file with format namespace<TAB>id<TAB>version
=cut
196 changes: 136 additions & 60 deletions lib/HTFeed/BackupExpiration.pm
Original file line number Diff line number Diff line change
Expand Up @@ -3,100 +3,176 @@ package HTFeed::BackupExpiration;

use strict;
use warnings;

use HTFeed::Config qw(get_config);
use HTFeed::DBTools qw(get_dbh);
use HTFeed::Volume;
use Carp;

use File::Spec ();
use File::Temp ();
use Log::Log4perl qw(get_logger);
use File::Temp;
use YAML::XS ();

my $select_expired_sql = <<~'SQL';
SELECT namespace,id
FROM feed_backups
WHERE deleted IS NULL
AND storage_name=?
AND version < DATE_FORMAT(DATE_SUB(NOW(), INTERVAL 180 DAY),"%Y%m%d%H%i%S")
GROUP BY namespace,id
HAVING COUNT(*) > 1
SQL

my $select_versions_sql = <<~'SQL';
SELECT version
FROM feed_backups
WHERE deleted IS NULL
AND storage_name=?
AND namespace=?
AND id=?
AND version < DATE_FORMAT(DATE_SUB(NOW(), INTERVAL 180 DAY),"%Y%m%d%H%i%S")
ORDER BY version DESC
SQL

use HTFeed::Storage::PrefixedVersions;
use HTFeed::Storage::ObjectStore;

sub new {
my $class = shift;

my $self = {
storage_name => undef,
custom_storage_config => 0,
max_workers => 1,
@_
};

unless ($self->{storage_name}) {
croak "$class cannot be constructed without a storage name";
use Carp;
Carp::croak "$class cannot be constructed without a storage name";
}

# Test can init with `storage_config` because it is transient and must be known to workers.
# Production just reads the config as normal
if ($self->{storage_config}) {
$self->{custom_storage_config} = 1;
} else {
my $config = get_config('storage_classes');
my $storage_config = $config->{$self->{storage_name}};
die("Can't find storage configuration for " . $self->{storage_name}) unless $storage_config;
$self->{storage_config} = $storage_config;
}

$self->{temp_directory} = File::Temp->newdir;
$self->{workers} = {};

bless($self, $class);
return $self;
}

my $JOB_SIZE = 10000;

sub run {
my $self = shift;

my $dry_run = $self->{dry_run};
my $dry_run_text = "";
$dry_run_text = " (DRY RUN)" if $dry_run;

my $config = get_config('storage_classes');
my $storage_config = $config->{$self->{storage_name}};
die("Can't find storage configuration for " . $self->{storage_name}) unless $storage_config;

# find everything with more than one version that is at least 6 months old
# delete all but the most recent > 6 months old version
my $sth = get_dbh()->prepare(<<'SQL');
SELECT namespace,id
FROM feed_backups
WHERE deleted IS NULL
AND storage_name=?
AND version < DATE_FORMAT(DATE_SUB(NOW(), INTERVAL 180 DAY),"%Y%m%d%H%i%S")
GROUP BY namespace,id
HAVING COUNT(*) > 1
SQL

my $versions_sth = get_dbh()->prepare(<<'SQL');
SELECT version
FROM feed_backups
WHERE deleted IS NULL
AND storage_name=?
AND namespace=?
AND id=?
AND version < DATE_FORMAT(DATE_SUB(NOW(), INTERVAL 180 DAY),"%Y%m%d%H%i%S")
ORDER BY version DESC
SQL
# Write storage config to the temp directory for child processes to get at it.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I see why this is needed in terms of injecting config to the child process if we are doing fork() and exec() rather than just fork() and retaining the parents' config. It may be possible to set an env var to use a custom config (HTFEED_CONFIG) and write that out in the test? But if we need to do this I think it's OK.

# Unnecessary for production, needed for testing because it is generated as part
# of the test suite.
if ($self->{custom_storage_config}) {
$self->{storage_config_file} = File::Spec->catfile($self->{temp_directory}, 'storage_config.yml');
my $yaml = YAML::XS::Dump($self->{storage_config});
open(my $fh, '>', $self->{storage_config_file}) or die "Could not open storage config YAML $!";
print $fh $yaml;
close $fh;
}

my $update_sth = get_dbh()->prepare(<<'SQL');
UPDATE feed_backups SET deleted=1
WHERE namespace=?
AND id=?
AND version=?
AND storage_name=?
SQL
my $sth = get_dbh()->prepare($select_expired_sql);
my $versions_sth = get_dbh()->prepare($select_versions_sql);

my $job = [];
# Iterate over the entirety of feed_backups
# Reaching the end and restarting the query must take place at a
# higher level, perhaps with $self->run called repeatedly.
$sth->execute($self->{storage_name});
while (my $row = $sth->fetchrow_hashref) {
$versions_sth->execute($self->{storage_name}, $row->{namespace}, $row->{id});
my @versions = map { $_->[0]; } @{$versions_sth->fetchall_arrayref};
shift @versions; # jettison the most recent
foreach my $version (@versions) {
my $volume = new HTFeed::Volume(namespace => $row->{namespace},
objid => $row->{id},
package_type => 'ht');
my $storage = $storage_config->{class}->new(volume => $volume,
config => $storage_config,
name => $self->{storage_name});
unless (defined $storage) {
die "Unable to get storage for $volume->{namespace}.$volume->{objid}";
}
$storage->{timestamp} = $version;
$storage->{zip_suffix} = '.gpg';
get_logger->trace("deleting archive for $volume->{namespace}.$volume->{objid} version $version" . $dry_run_text);
next if $dry_run;
unless ($storage->delete_objects) {
die "Unable to delete $volume->{namespace}.$volume->{objid}";
push(@$job, [$row->{namespace}, $row->{id}, $version]);
}
# Do we have enough to spawn a worker?
if (scalar @$job >= $JOB_SIZE) {
$self->wait_for_available_worker;
$self->spawn_worker($job);
$job = [];
}
}
# Submit the leftovers if any
if (scalar @$job > 0) {
Comment thread
moseshll marked this conversation as resolved.
$self->wait_for_available_worker;
$self->spawn_worker($job);
$job = [];
}
# Set max workers to 0 so we wait for all of them to finish.
$self->{max_workers} = 0;
# Wait for all the workers to finish.
while (scalar keys %{$self->{workers}} > 0) {
$self->wait_for_available_worker;
}
}

# waitpid on existing workers (if any) until one finishes up
# but only if we are at maximum capacity.
# Only waits for workers to finish if we already have the maximum number on the go,
# or if we are finished and have set the maximum to 0.
sub wait_for_available_worker {
my $self = shift;

if (scalar keys %{$self->{workers}} >= $self->{max_workers}) {
my $pid = 0;
do {
# Wait for any worker
$pid = waitpid(-1, 0);
if ($pid > 0) {
my $job_file = $self->{workers}->{$pid};
unlink $job_file->filename;
delete $self->{workers}->{$pid};
}
get_logger->trace("setting deleted=1 for $volume->{namespace}.$volume->{objid} version $version");
$update_sth->execute($row->{namespace}, $row->{id},
$version, $self->{storage_name});
} while ($pid > 0);
}
}

sub spawn_worker {
my $self = shift;
my $job = shift;

my $job_file = File::Temp->new(
DIR => $self->{temp_directory},
SUFFIX => '.tsv',
CLEANUP => 0
);
foreach my $version (@$job) {
print $job_file join("\t", @$version) . "\n";
}
$job_file->close;

my $pid = fork();
if (!defined $pid) {
die "Fork failed: $!";
} elsif ($pid == 0) {
# WORKER PROCESS
my $worker_script = File::Spec->catfile($ENV{FEED_HOME}, 'bin', 'expire_versions.pl');
my @cmd = ('perl', $worker_script, '-s', $self->{storage_name});
if ($self->{custom_storage_config}) {
push @cmd, '--config', $self->{storage_config_file};
}
if ($self->{dry_run}) {
push @cmd, '--dry-run';
}
push @cmd, $job_file->filename;
exec(@cmd) or die "worker [$$] exec failed to run: $!\n";
} else {
# PARENT PROCESS
$self->{workers}->{$pid} = $job_file;
}
}

Expand Down
Loading