diff --git a/bin/audit/crawl_repo_mets.pl b/bin/audit/crawl_repo_mets.pl
new file mode 100755
index 00000000..df9094b6
--- /dev/null
+++ b/bin/audit/crawl_repo_mets.pl
@@ -0,0 +1,608 @@
+#!/usr/bin/perl
+
+# Audit every item found under a repository base directory: record zip/METS
+# sizes and dates in feed_audit, log pairtree and file-count problems to
+# feed_audit_detail, and optionally (--mets / --src_mets) print facts extracted
+# from the repository METS and the source METS as tab-separated lines on stdout.
+#
+# Usage: crawl_repo_mets.pl [--mets] [--src_mets] [--checkpoint=TIMESTAMP] BASE_DIR
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::Bin/../../lib";
+
+use DBI;
+use HTFeed::Config qw(get_config);
+use HTFeed::DBTools qw(get_dbh);
+use HTFeed::Log {root_logger => 'INFO, screen'};
+use File::Basename;
+use File::Pairtree qw(ppath2id s2ppchars);
+use File::Temp;
+use HTFeed::Volume;
+use HTFeed::VolumeValidator;
+use HTFeed::Namespace;
+use HTFeed::PackageType;
+use HTFeed::METS;
+use POSIX qw(strftime);
+use Getopt::Long;
+use URI::Escape;
+use Date::Manip;
+
+my $tombstone_check = "select is_tombstoned from feed_audit where namespace = ? and id = ?";
+
+my $insert =
+"insert into feed_audit (namespace, id, sdr_partition, zip_size, zip_date, mets_size, mets_date, lastchecked) values(?,?,?,?,?,?,?,CURRENT_TIMESTAMP) \
+ON DUPLICATE KEY UPDATE sdr_partition = ?, zip_size=?, zip_date =?,mets_size=?,mets_date=?,lastchecked = CURRENT_TIMESTAMP";
+
+my $update_mets =
+"update feed_audit set page_count = ?, image_size = ?, first_ingest_date = ? where namespace = ? and id = ?";
+
+my $insert_detail =
+"insert into feed_audit_detail (namespace, id, path, status, detail, storage_name) values (?,?,?,?,?,'main_repo_audit')";
+
+my $checkpoint_sel =
+"select lastmd5check > ? from feed_audit where namespace = ? and id = ?";
+
+# Run state and command line options. reset_options() restores the defaults so
+# that main() can run more than once in a single process.
+my $filesProcessed;
+my $prevpath;
+my $do_mets;
+my $do_src_mets;
+my $checkpoint;
+
+sub reset_options {
+    $filesProcessed = 0;
+    $prevpath       = undef;
+    $do_mets        = 0;
+    $do_src_mets    = 0;
+    $checkpoint     = undef;
+}
+
+# Parse the command line, list every file under the base directory with
+# find(1), and audit each path it reports.
+sub main {
+    reset_options();
+
+    GetOptions(
+        'mets!'        => \$do_mets,
+        'src_mets!'    => \$do_src_mets,
+        'checkpoint=s' => \$checkpoint,
+    ) or die("Invalid command line options\n");
+
+    my $base = shift @ARGV or die("Missing base directory..");
+
+    # List-form pipe open: $base reaches find as one argument without passing
+    # through a shell, so spaces or metacharacters in it are harmless.
+    open( my $find, '-|', 'find', $base, '-follow', '-type', 'f' )
+      or die("Can't open pipe to find: $!");
+
+    while ( my $line = <$find> ) {
+        audit_path($line);
+    }
+
+    close($find);
+    get_dbh()->disconnect();
+}
+
+# Audit the directory that holds one file reported by find: check the pairtree
+# layout, upsert the zip/METS sizes and dates into feed_audit, check the
+# directory contents, and run the METS checks. A file is skipped when its
+# directory is the same as the previous file's directory. Errors are reported
+# with warn and never propagate to the caller.
+sub audit_path {
+    my $line = shift;
+    chomp($line);
+
+    my ($sdr_partition) = ( $line =~ qr#/?sdr(\d+)/?# );
+
+    # These skips use return rather than next: this sub has no loop of its
+    # own, so next would exit it through the caller's loop with a warning.
+    return if $line =~ /\Qpre_uplift.mets.xml\E/;
+
+    # ignore temporary location
+    return if $line =~ qr(obj/\.tmp);
+    return if $line =~ qr(obj/\w+/pairtree_version.*);
+    return if $line =~ qr(obj/\w+/pairtree_prefix.*);
+
+    # ignore ".old" files if they're recent
+    return if recent_previous_version($line);
+
+    eval {
+        $filesProcessed++;
+
+        my ( $pt_objid, $path ) =
+          fileparse( $line, qr/\.mets\.xml/, qr/\.zip/ );
+        $path =~ s/\/$//;    # remove trailing /
+        return if ( $prevpath and $path eq $prevpath );
+
+        $prevpath = $path;
+
+        # Capture in list context so that a failed match cannot leave stale
+        # $1/$2 values from an earlier match in use below.
+        my ( $namespace, $pairtree_path ) =
+          ( $path =~ qr(obj/(\w+)/pairtree_root/(.*)) );
+        unless ( defined $namespace ) {
+            warn("Can't parse path: $path");
+            return;
+        }
+        my @pathcomp  = split( "/", $pairtree_path );
+        my $last_path = pop(@pathcomp);
+        $last_path = '' if not defined $last_path;
+
+        my $objid = ppath2id( join( "/", @pathcomp ) );
+        if ( $pt_objid ne s2ppchars($objid) ) {
+            set_status( $namespace, $objid, $path, "BAD_PAIRTREE",
+                "$objid $pt_objid" );
+        }
+
+        if ( $last_path ne $pt_objid ) {
+            set_status( $namespace, $objid, $path, "BAD_PAIRTREE",
+                "$last_path $pt_objid" );
+        }
+
+        #get last modified date
+        my $zipfile = "$path/$pt_objid.zip";
+        my $zip_seconds;
+        my $zipdate;
+        my $zipsize;
+
+        if ( -e $zipfile ) {
+            $zip_seconds = ( stat($zipfile) )[9];
+            $zipdate = strftime( "%Y-%m-%d %H:%M:%S", localtime($zip_seconds) );
+            $zipsize = -s $zipfile;
+        }
+
+        my $metsfile = "$path/$pt_objid.mets.xml";
+
+        my $mets_seconds;
+        my $metsdate;
+        my $metssize;
+
+        if ( -e $metsfile ) {
+            $mets_seconds = ( stat($metsfile) )[9];
+            $metssize     = -s $metsfile;
+            $metsdate = strftime( "%Y-%m-%d %H:%M:%S", localtime($mets_seconds) );
+        }
+
+        #insert
+        execute_stmt(
+            $insert,
+
+            $namespace, $objid,
+
+            $sdr_partition, $zipsize, $zipdate, $metssize, $metsdate,
+
+            # duplicate parameters for duplicate key update
+            $sdr_partition, $zipsize, $zipdate, $metssize, $metsdate
+        );
+
+        # does barcode have a zip & xml, and do they match?
+        opendir( my $dh, $path )
+          or die("Can't open directory $path: $!\n");
+
+        my $filecount  = 0;
+        my $found_zip  = 0;
+        my $found_mets = 0;
+        while ( defined( my $file = readdir($dh) ) ) {
+            next
+              if $file eq '.'
+              or $file eq '..'
+              or recent_previous_version("$path/$file")
+              or $file =~ /pre_uplift\.mets\.xml$/;    # ignore backup mets
+            if ( $file !~ /^([^.]+)\.(zip|mets\.xml)$/ ) {
+                print("BAD_FILE $path $file\n");
+                next;
+            }
+            my $dir_barcode = $1;
+            my $ext         = $2;
+            $found_zip++  if $ext eq 'zip';
+            $found_mets++ if $ext eq 'mets.xml';
+            if ( $pt_objid ne $dir_barcode ) {
+                set_status( $namespace, $objid, $path, "BARCODE_MISMATCH",
+                    "$pt_objid $dir_barcode" );
+            }
+            $filecount++;
+        }
+
+        closedir($dh);
+
+        # check file count; do METS extraction stuff
+        if ( ( defined $zip_seconds )
+            or ( defined $mets_seconds ) )
+        {
+
+            if ( $filecount > 2 or $filecount < 1 or ($found_zip != 1 and not is_tombstoned($namespace,$objid) ) or $found_mets != 1 ) {
+                set_status( $namespace, $objid, $path, "BAD_FILECOUNT",
+                    "zip=$found_zip mets=$found_mets total=$filecount" );
+            }
+
+            eval {
+                check_metses( $namespace, $objid, $zipfile, $metsfile );
+            };
+            if ($@) {
+                set_status( $namespace, $objid, $path, "CANT_METS_CHECK", $@ );
+            }
+        }
+
+    };
+
+    if ($@) {
+        warn($@);
+    }
+}
+
+# Run the optional METS checks for one item. Does nothing for tombstoned
+# items, or for items whose lastmd5check is later than --checkpoint. Any
+# exception raised while parsing or checking propagates to the caller.
+sub check_metses {
+    my ( $namespace, $objid, $zipfile, $metsfile ) = @_;
+
+    return if is_tombstoned( $namespace, $objid );
+
+    # don't check this item if we just looked at it
+    if ( defined $checkpoint ) {
+        my $sth =
+          execute_stmt( $checkpoint_sel, $checkpoint, $namespace, $objid );
+        my @row = $sth->fetchrow_array();
+        $sth->finish();
+        return if @row and $row[0];
+    }
+
+    my $volume = HTFeed::Volume->new(
+        packagetype => "pkgtype",
+        namespace   => $namespace,
+        objid       => $objid
+    );
+    my $mets = $volume->_parse_xpc($metsfile);
+
+    check_mets( $volume, $metsfile, $mets ) if $do_mets;
+    extract_source_mets( $volume, $zipfile ) if $do_src_mets;
+
+    # No result is computed here; callers get undef (or an empty list).
+    return;
+}
+
+# Record a problem: echo it with warn (undef fields print as empty) and insert
+# it into feed_audit_detail. Arguments: namespace, id, path, status, detail.
+sub set_status {
+    warn( join( " ", map { defined $_ ? $_ : '' } @_ ), "\n" );
+    execute_stmt( $insert_detail, @_ );
+}
+
+# Prepare and execute a statement with the given bind values; returns the
+# executed statement handle.
+sub execute_stmt {
+    my $stmt = shift;
+    my $dbh  = get_dbh();
+    my $sth  = $dbh->prepare($stmt);
+    $sth->execute(@_);
+    return $sth;
+}
+
+# Log how often each distinct value occurs among the nodes matching $xpath.
+sub _log_value_counts {
+    my ( $mets, $namespace, $objid, $key, $xpath ) = @_;
+
+    my %count_of;
+    foreach my $node ( $mets->findnodes($xpath) ) {
+        $count_of{ $mets->findvalue( '.', $node ) }++;
+    }
+    foreach my $value ( sort keys %count_of ) {
+        mets_log( $namespace, $objid, $key, $value, $count_of{$value} );
+    }
+}
+
+# Log the executor and date of every PREMIS event of the given type.
+sub _log_event_executors {
+    my ( $mets, $namespace, $objid, $key, $event_type ) = @_;
+
+    foreach my $event (
+        $mets->findnodes(qq{//premis:event[premis:eventType="$event_type"]}) )
+    {
+        my $executor = $mets->findvalue(
+'./premis:linkingAgentIdentifier[premis:linkingAgentRole="Executor"]/premis:linkingAgentIdentifierValue',
+            $event
+        );
+        my $date = $mets->findvalue( './premis:eventDateTime', $event );
+        mets_log( $namespace, $objid, $key, $executor, $date );
+    }
+}
+
+# Print facts extracted from the repository METS (file types, PREMIS details,
+# MARC presence, schema validity, metadata sections, page and image totals)
+# and store the page count, image size and first ingest date in feed_audit.
+sub check_mets {
+    my ( $volume, $metsfile, $mets ) = @_;
+    my $namespace = $volume->get_namespace();
+    my $objid     = $volume->get_objid();
+
+    # extract other stuff from repo METS
+    {    # File types & count
+        my %filetypes;
+        foreach my $file (
+            $mets->findnodes('//mets:file/mets:FLocat/@xlink:href') )
+        {
+            my ($extension) = ( $file->value =~ /\.(\w+)$/ );
+
+            # hrefs without an extension are counted under the empty string
+            $extension = '' if not defined $extension;
+            $filetypes{$extension}++;
+        }
+        foreach my $ext ( sort keys %filetypes ) {
+            mets_log( $namespace, $objid, "FILETYPE", $ext, $filetypes{$ext} );
+        }
+    }
+
+    {    # PREMIS & premis ID version
+        my $premisversion = "none";
+        if ( $mets->findnodes('//mets:mdWrap[@MDTYPE="PREMIS"]') ) {
+            $premisversion = "unknown";
+        }
+        if ( $mets->findnodes('//mets:mdWrap//premis:premis') ) {
+            $premisversion = "premis2";
+        }
+
+        mets_log( $namespace, $objid, "PREMIS_VERSION", $premisversion );
+    }
+
+    # PREMIS event ID types and agent types
+    _log_value_counts( $mets, $namespace, $objid, "PREMIS_EVENT_TYPE",
+        '//premis:eventIdentifierType' );
+    _log_value_counts( $mets, $namespace, $objid, "PREMIS_AGENT_TYPE",
+        '//premis:linkingAgentIdentifierType' );
+
+    # Capturing agent and processing agent
+    _log_event_executors( $mets, $namespace, $objid, "CAPTURE", "capture" );
+    _log_event_executors( $mets, $namespace, $objid, "MD5SUM",
+        "message digest calculation" );
+
+    {    # Ingest date
+        foreach my $event (
+            $mets->findnodes('//premis:event[premis:eventType="ingestion"]') )
+        {
+            my $date = $mets->findvalue( './premis:eventDateTime', $event );
+            mets_log( $namespace, $objid, "INGEST", $date );
+        }
+    }
+
+    {    # MARC present
+        my $marc_present =
+          $mets->findvalue('count(//marc:record | //record)');
+        mets_log( $namespace, $objid, "MARC", $marc_present );
+    }
+
+    {    # METS valid
+        my ( $mets_valid, $error ) =
+          HTFeed::METS::validate_xml( { volume => $volume }, $metsfile );
+        if ( !$mets_valid and defined $error ) {
+            $error =~ s/\n/ /mg;
+        }
+
+        mets_log( $namespace, $objid, "METS_VALID", $mets_valid, $error );
+    }
+
+    {    # Metadata sections
+        eval {
+            foreach
+              my $mdsec ( $mets->findnodes('//mets:mdWrap | //mets:mdRef') )
+            {
+                my @mdbits = ();
+                push( @mdbits, $mdsec->nodeName );
+                foreach my $attr (qw(LABEL MDTYPE OTHERMDTYPE)) {
+                    my $attrval = $mdsec->getAttribute($attr);
+                    if ( $attrval and $attrval ne '' ) {
+                        push( @mdbits, "$attr=$attrval" );
+                    }
+                }
+                mets_log( $namespace, $objid, "METS_MDSEC",
+                    join( "; ", @mdbits ) );
+            }
+        };
+
+        # Best effort: a failure here must not stop the remaining checks,
+        # but it should not vanish silently either.
+        warn("Can't list METS metadata sections for $namespace.$objid: $@")
+          if $@;
+    }
+
+    {    # Page tagging, image size
+        my $has_pagetags = $mets->findvalue(
+            'count(//mets:div[@TYPE="page"]/@LABEL[string() != ""])');
+        mets_log( $namespace, $objid, "PAGETAGS", $has_pagetags );
+        my $pages = $mets->findvalue('count(//mets:div[@TYPE="page"])');
+        mets_log( $namespace, $objid, "PAGES", $pages );
+
+        my $image_size = $mets->findvalue(
+            'sum(//mets:fileGrp[@USE="image"]/mets:file/@SIZE)');
+        mets_log( $namespace, $objid, "IMAGE_SIZE", $image_size );
+
+        my $first_ingest = first_ingest_date($mets);
+        mets_log( $namespace, $objid, "FIRST_INGEST", $first_ingest );
+
+        execute_stmt( $update_mets, $pages, $image_size, $first_ingest,
+            $namespace, $objid );
+    }
+
+}
+
+# Return the date (YYYY-MM-DD) of the earliest PREMIS ingestion event in the
+# METS, or nothing (undef in scalar context) when there is no ingestion event
+# with a parseable date.
+sub first_ingest_date {
+    my $mets = shift;
+
+    my $earliest;
+    foreach my $event (
+        $mets->findnodes('//premis:event[premis:eventType="ingestion"]') )
+    {
+        my $date_string = $mets->findvalue( './premis:eventDateTime', $event );
+        next if not defined $date_string or $date_string eq '';
+
+        # parse() returns true on failure
+        my $date = Date::Manip::Date->new();
+        if ( $date->parse($date_string) ) {
+            warn("Can't parse ingest date '$date_string'\n");
+            next;
+        }
+
+        # Compare parsed dates rather than strings so that differing formats
+        # or time zones cannot change which event counts as the first.
+        $earliest = $date
+          if not defined $earliest or $date->cmp($earliest) < 0;
+    }
+
+    return unless defined $earliest;
+    return $earliest->printf("%Y-%m-%d");
+}
+
+# Find the source METS inside the item's zip, extract it to a private
+# temporary directory, and print its metadata sections, Google reading-order
+# tags, image methods and PREMIS events. Problems are recorded with
+# set_status.
+sub extract_source_mets {
+    my ( $volume, $zipfile ) = @_;
+    my $namespace = $volume->get_namespace();
+    my $objid     = $volume->get_objid();
+    my $pt_objid  = $volume->get_pt_objid();
+    my @srcmets   = ();
+
+    # List-form pipe open: the zip path is passed to unzip as-is, so quotes or
+    # shell metacharacters in it cannot break or alter the command.
+    open( my $zipinfo, '-|', 'unzip', '-l', $zipfile )
+      or die("Can't run unzip -l on $zipfile: $!\n");
+    while ( my $entry = <$zipinfo> ) {
+        chomp $entry;
+
+        # Assumes Info-ZIP's "unzip -l" layout, where splitting on whitespace
+        # leaves the member name in field 4 - TODO confirm for other unzip builds.
+        my @zipfields = split /\s+/, $entry;
+        if (    $zipfields[4]
+            and $zipfields[4] =~ /^\Q$pt_objid\E\/\w+_\Q$pt_objid\E\.xml/i )
+        {
+            push( @srcmets, $zipfields[4] );
+        }
+    }
+    close($zipinfo);
+
+    if ( !@srcmets ) {
+        set_status( $namespace, $objid, $zipfile, "NO_SOURCE_METS", undef );
+        return;
+    }
+    if ( @srcmets != 1 ) {
+        set_status( $namespace, $objid, $zipfile,
+            "MULTIPLE_SOURCE_METS_CANDIDATES", undef );
+        return;
+    }
+
+    # source METS found
+    my $srcmets_member = $srcmets[0];
+    mets_log( $namespace, $objid, "SOURCE_METS", $srcmets_member );
+    my ($smets_name) = ( $srcmets_member =~ /\/([^\/]+)$/ );
+
+    # A fresh directory per item (removed when $tmpdir goes out of scope), so
+    # concurrent crawls and same-named files cannot collide.
+    my $tmpdir        = File::Temp->newdir();
+    my $tmp_smets_loc = "$tmpdir/$smets_name";
+
+    eval {
+        # -q keeps unzip's progress messages out of the tab-separated stdout
+        system( 'unzip', '-q', '-j', $zipfile, $srcmets_member, '-d', "$tmpdir" ) == 0
+          or die("Can't extract $srcmets_member from $zipfile (unzip status $?)\n");
+
+        my %mdsecs = ();
+        my $xpc    = $volume->_parse_xpc($tmp_smets_loc);
+        $xpc->registerNs( 'gbs', "http://books.google.com/gbs" );
+        foreach my $mdsec ( $xpc->findnodes('//mets:mdWrap') ) {
+            my @mdbits = ();
+            foreach my $attr (qw(LABEL MDTYPE OTHERMDTYPE)) {
+                my $attrval = $mdsec->getAttribute($attr);
+                if ( $attrval and $attrval ne '' ) {
+                    push( @mdbits, "$attr=$attrval" );
+                }
+            }
+            $mdsecs{ join( '; ', @mdbits ) } = 1;
+        }
+        foreach my $mdsec ( sort( keys(%mdsecs) ) ) {
+            mets_log( $namespace, $objid, "SRC_METS_MDSEC", $mdsec );
+        }
+
+        # Try to get Google reading order
+        foreach my $tag (qw(gbs:pageOrder gbs:pageSequence gbs:coverTag)) {
+            my $val = $xpc->findvalue("//$tag");
+            mets_log( $namespace, $objid, "GBS_READING", $tag, $val );
+        }
+
+        foreach my $techmd ( $xpc->findnodes("//mets:techMD") ) {
+            my $imagemethod_id = $techmd->getAttribute("ID");
+            next
+              unless defined $imagemethod_id
+              and $imagemethod_id =~ /^IMAGE_METHOD/;
+
+            my $method = $xpc->findvalue( ".//gbs:imageMethod", $techmd );
+            my $count  = $xpc->findvalue(
+                "count(//mets:file[contains(\@ADMID,\"$imagemethod_id\")])"
+            );
+            mets_log( $namespace, $objid, "IMAGE_METHOD", $method, $count );
+        }
+
+        # source METS PREMIS events
+        foreach my $event ( $xpc->findnodes('//premis:event') ) {
+            my $eventtype = $xpc->findvalue( './premis:eventType', $event );
+            my $date = $xpc->findvalue( './premis:eventDateTime', $event );
+            mets_log( $namespace, $objid, "SRC_METS_PREMIS_EVENT",
+                $eventtype, $date );
+        }
+    };
+    if ($@) {
+        set_status( $namespace, $objid, $srcmets_member, "BAD_SOURCE_METS",
+            $@ );
+    }
+}
+
+# Print one tab-separated fact line: namespace, id, key and up to two values
+# (missing values are printed as empty strings).
+sub mets_log {
+    my $namespace = shift;
+    my $objid     = shift;
+    my $key       = shift;
+    my $val1      = shift;
+    my $val2      = shift;
+    $val1 = '' if not defined $val1;
+    $val2 = '' if not defined $val2;
+    print join( "\t", $namespace, $objid, $key, $val1, $val2 ), "\n";
+
+    #execute_stmt($fs_mets_data,$namespace,$objid,$key,$val1,$val2);
+}
+
+# Return the is_tombstoned value recorded in feed_audit for the item, or 0
+# when the item has no feed_audit row.
+sub is_tombstoned {
+    my $namespace = shift;
+    my $objid     = shift;
+    my $sth       = execute_stmt( $tombstone_check, $namespace, $objid );
+    my @row       = $sth->fetchrow_array();
+    $sth->finish();
+    return @row ? $row[0] : 0;
+}
+
+# True when $file is a ".old" file whose inode change time is less than two
+# days ago; false otherwise, including when the file cannot be stat'ed.
+sub recent_previous_version {
+    my $file = shift;
+
+    return 0 unless $file =~ /\.old$/;
+
+    my $ctime = ( stat($file) )[10];
+    return 0 unless defined $ctime;
+
+    my $ctime_age = time() - $ctime;
+
+    return $ctime_age < ( 86400 * 2 ) ? 1 : 0;
+}
+
+main() unless caller;
+
+__END__
diff --git a/bin/audit/main_repo_audit.pl b/bin/audit/main_repo_audit.pl deleted file mode 100755 index b795a51a..00000000 --- a/bin/audit/main_repo_audit.pl +++ /dev/null @@ -1,599 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; - -use FindBin; -use lib "$FindBin::Bin/../../lib"; - -use DBI; -use HTFeed::Config qw(get_config); -use HTFeed::DBTools qw(get_dbh); -use HTFeed::Log {root_logger => 'INFO, screen'}; -use File::Basename; -use File::Pairtree qw(ppath2id s2ppchars); -use HTFeed::Volume; -use HTFeed::VolumeValidator; -use HTFeed::Namespace; -use HTFeed::PackageType; -use HTFeed::METS; -use POSIX qw(strftime); -use Getopt::Long; -use URI::Escape; - -my $tombstone_check = "select is_tombstoned from feed_audit where namespace = ? and id = ?"; - -my $insert = -"insert into feed_audit (namespace, id, sdr_partition, zip_size, zip_date, mets_size, mets_date, lastchecked) values(?,?,?,?,?,?,?,CURRENT_TIMESTAMP) \ -ON DUPLICATE KEY UPDATE sdr_partition = ?, zip_size=?, zip_date =?,mets_size=?,mets_date=?,lastchecked = CURRENT_TIMESTAMP"; -my $update = -"update feed_audit set md5check_ok = ?, lastmd5check = CURRENT_TIMESTAMP where namespace = ? and id = ?"; - -my $update_mets = -"update feed_audit set page_count = ?, image_size = ? where namespace = ? 
and id = ?"; - -my $insert_detail = -"insert into feed_audit_detail (namespace, id, path, status, detail) values (?,?,?,?,?)"; - -my $checkpoint_sel = -"select lastmd5check > ? from feed_audit where namespace = ? and id = ?"; - -### set /sdr1 to /sdrX for test & parallelization -my $filesProcessed = 0; -my $prevpath; -my $do_md5 = 0; -my $do_mets = 0; -my $checkpoint = undef; -GetOptions( - 'md5!' => \$do_md5, -'mets!' => \$do_mets, - 'checkpoint=s' => \$checkpoint, -); - -my $base = shift @ARGV or die("Missing base directory.."); - -my ($sdr_partition) = ($base =~ qr#/?sdr(\d+)/?#); - -open( RUN, "find $base -follow -type f|" ) - or die("Can't open pipe to find: $!"); - -while ( my $line = ) { - chomp($line); - - my @newList = (); #initialize array - next if $line =~ /\Qpre_uplift.mets.xml\E/; - # ignore temporary location - next if $line =~ qr(obj/\.tmp); - - # ignore ".old" files if they're recent - next if recent_previous_version($line); - - eval { - $filesProcessed++; - - # if($filesProcessed % 10000== 0) { - # print "$filesProcessed files processed\n"; - # } - - - # strip trailing / from path - my ( $pt_objid, $path, $type ) = - fileparse( $line, qr/\.mets\.xml/, qr/\.zip/ ); - $path =~ s/\/$//; # remove trailing / - return if ( $prevpath and $path eq $prevpath ); - - # check mtime on directory - do not check if mtime is in the past two days - # to let synciq catch up - - return if recently_modified_path($path); - - $prevpath = $path; - - my @pathcomp = split( "/", $path ); - - # remove base & any empty components - @pathcomp = grep { $_ ne '' } @pathcomp; - my $first_path = shift @pathcomp; - my $last_path = pop @pathcomp; - my $namespace = $pathcomp[1]; - - my $objid = ppath2id( join( "/", @pathcomp ) ); - if ( $pt_objid ne s2ppchars($objid) ) { - set_status( $namespace, $objid, $path, "BAD_PAIRTREE", - "$objid $pt_objid" ); - } - - if ( $last_path ne $pt_objid ) { - set_status( $namespace, $objid, $path, "BAD_PAIRTREE", - "$last_path $pt_objid" ); - } - - 
#get last modified date - my $zipfile = "$path/$pt_objid.zip"; - my $zip_seconds; - my $zipdate; - my $zipsize; - - if ( -e $zipfile ) { - $zip_seconds = ( stat($zipfile) )[9]; - $zipdate = strftime( "%Y-%m-%d %H:%M:%S", localtime($zip_seconds) ); - $zipsize = -s $zipfile; - } - - my $metsfile = "$path/$pt_objid.mets.xml"; - - my $mets_seconds; - my $metsdate; - my $metssize; - - if ( -e $metsfile ) { - $mets_seconds = ( stat($metsfile) )[9]; - $metssize = -s $metsfile; - $metsdate = strftime( "%Y-%m-%d %H:%M:%S", - localtime( ( stat($metsfile) )[9] ) ); - } - - my $last_touched = $zip_seconds; - $last_touched = $mets_seconds if defined $mets_seconds and (not defined $zip_seconds or $mets_seconds > $zip_seconds); - - #test symlinks unless we're traversing sdr1 or the file is too new - if ( $first_path ne 'sdr1' and (defined $last_touched and time - $last_touched >= 86400) ) { - my $link_path = join( "/", "/sdr1", @pathcomp, $last_path ); - my $link_target = readlink $link_path - or set_status( $namespace, $objid, $path, "CANT_LSTAT", - "$link_path $!" ); - - if ( defined $link_target and $link_target ne $path ) { - set_status( $namespace, $objid, $path, "SYMLINK_INVALID", - $link_target ); - } - - } - - #insert - execute_stmt( - $insert, - - $namespace, $objid, - - $sdr_partition, $zipsize, $zipdate, $metssize, $metsdate, - - # duplicate parameters for duplicate key update - $sdr_partition, $zipsize, $zipdate, $metssize, $metsdate - ); - - # does barcode have a zip & xml, and do they match? - opendir( my $dh, $path ); - - my $filecount = 0; - my $found_zip = 0; - my $found_mets = 0; - while ( my $file = readdir($dh) ) { - next - if $file eq '.' - or $file eq '..' 
- or recent_previous_version("$path/$file") - or $file =~ /pre_uplift.mets.xml$/; # ignore backup mets - if ( $file !~ /^([^.]+)\.(zip|mets.xml)$/ ) { - print("BAD_FILE $path $file\n"); - next; - } - my $dir_barcode = $1; - my $ext = $2; - $found_zip++ if $ext eq 'zip'; - $found_mets++ if $ext eq 'mets.xml'; - if ( $pt_objid ne $dir_barcode ) { - set_status( $namespace, $objid, $path, "BARCODE_MISMATCH", - "$pt_objid $dir_barcode" ); - } - $filecount++; - } - - closedir($dh); - -# check file count; do md5 check and METS extraction stuff, but only if it's fully replicated - if ( ( defined $zip_seconds and time - $zip_seconds > 86400 ) - or ( defined $mets_seconds and time - $mets_seconds > 86400 ) ) - { - - if ( $filecount > 2 or $filecount < 1 or ($found_zip != 1 and not is_tombstoned($namespace,$objid) ) or $found_mets != 1 ) { - set_status( $namespace, $objid, $path, "BAD_FILECOUNT", - "zip=$found_zip mets=$found_mets total=$filecount" ); - } - - eval { - my $rval = zipcheck( $namespace, $objid ); - if ($rval) { - execute_stmt( $update, "1", $namespace, $objid ); - } - elsif ( defined $rval ) { - execute_stmt( $update, "0", $namespace, $objid ); - } - }; - if ($@) { - set_status( $namespace, $objid, $path, "CANT_ZIPCHECK", $@ ); - } - } - - }; - - if ($@) { - warn($@); - } -} - -sub zipcheck { - my ( $namespace, $objid ) = @_; - - return unless $do_md5 or $do_mets; - - return if is_tombstoned($namespace, $objid); - - # don't check this item if we just looked at it - if(defined $checkpoint) { - my $sth = execute_stmt($checkpoint_sel,$checkpoint,$namespace,$objid); - if(my @row = $sth->fetchrow_array()) { - return if @row and $row[0]; - } - } - - # use google as a 'default' namespace for now - my $volume = new HTFeed::Volume( - packagetype => "pkgtype", - namespace => $namespace, - objid => $objid - ); - my $mets = $volume->get_repository_mets_xpc(); - my $rval = undef; - -# Extract the checksum for the zip file that looks kind of like this: -# -# -# -# -# - - if 
($do_md5) { - my $zipname = $volume->get_zip_filename(); - my $mets_zipsum = $mets->findvalue( - "//mets:file[mets:FLocat/\@xlink:href='$zipname']/\@CHECKSUM"); - - if(not defined $mets_zipsum or length($mets_zipsum) ne 32) { - # zip name may be uri-escaped in some cases - $zipname = uri_escape($zipname); - $mets_zipsum = $mets->findvalue( - "//mets:file[mets:FLocat/\@xlink:href='$zipname']/\@CHECKSUM"); - } - - if ( not defined $mets_zipsum or length($mets_zipsum) ne 32 ) { - set_status( $namespace, $objid, $volume->get_repository_mets_path(), - "MISSING_METS_CHECKSUM", undef ); - } - else { - my $realsum = HTFeed::VolumeValidator::md5sum( - $volume->get_repository_zip_path() ); - if ( $mets_zipsum eq $realsum ) { - print "$zipname OK\n"; - $rval = 1; - } - else { - set_status( $namespace, $objid, - $volume->get_repository_zip_path(), - "BAD_CHECKSUM", "expected=$mets_zipsum actual=$realsum" ); - $rval = 0; - } - } - } - - if ($do_mets) { - - # extract other stuff from repo METS - { # File types & count - my %filetypes; - foreach my $file ( - $mets->findnodes('//mets:file/mets:FLocat/@xlink:href') ) - { - my ($extension) = ( $file->value =~ /\.(\w+)$/ ); - $filetypes{$extension}++; - } - while ( my ( $ext, $count ) = each(%filetypes) ) { - mets_log( $namespace, $objid, "FILETYPE", $ext, $count ); - } - } - - { # PREMIS & premis ID version - my $premisversion = "none"; - if ( $mets->findnodes('//mets:mdWrap[@MDTYPE="PREMIS"]') ) { - $premisversion = "unknown"; - } - if ( $mets->findnodes('//mets:mdWrap//premis:premis') ) { - $premisversion = "premis2"; - } - - mets_log( $namespace, $objid, "PREMIS_VERSION", $premisversion ); - } - - { # PREMIS event ID types - - my %event_id_types = (); - foreach my $eventtype ( - $mets->findnodes( - '//premis:eventIdentifierType' - ) - ) - { - $event_id_types{ $mets->findvalue( '.', $eventtype ) }++; - } - foreach my $event_id_type ( keys(%event_id_types) ) { - mets_log( $namespace, $objid, "PREMIS_EVENT_TYPE", - $event_id_type, 
$event_id_types{$event_id_type} ); - } - } - - { # PREMIS agent types - my %agent_id_types = (); - foreach my $agenttype ( - $mets->findnodes( - '//premis:linkingAgentIdentifierType' - ) - ) - { - $agent_id_types{ $mets->findvalue( '.', $agenttype ) }++; - } - foreach my $agent_id_type ( keys(%agent_id_types) ) { - mets_log( $namespace, $objid, "PREMIS_AGENT_TYPE", - $agent_id_type, $agent_id_types{$agent_id_type} ); - } - - } - - { # Capturing agent - foreach my $event ( - $mets->findnodes( - '//premis:event[premis:eventType="capture"]' - ) - ) - { - my $executor = $mets->findvalue( - './premis:linkingAgentIdentifier[premis:linkingAgentRole="Executor"]/premis:linkingAgentIdentifierValue', - $event - ); - my $date = $mets->findvalue( - './premis:eventDateTime', - $event ); - mets_log( $namespace, $objid, "CAPTURE", $executor, $date ); - } - } - { # Processing agent - foreach my $event ( - $mets->findnodes( - '//premis:event[premis:eventType="message digest calculation"]' - ) - ) - { - my $executor = $mets->findvalue( - './premis:linkingAgentIdentifier[premis:linkingAgentRole="Executor"]/premis:linkingAgentIdentifierValue', - $event - ); - my $date = $mets->findvalue( - './premis:eventDateTime', - $event ); - mets_log( $namespace, $objid, "MD5SUM", $executor, $date ); - } - } - - { # Ingest date - foreach my $event ( - $mets->findnodes( - '//premis:event[premis:eventType="ingestion"]' - ) - ) - { - my $date = $mets->findvalue( - './premis:eventDateTime', - $event ); - mets_log( $namespace, $objid, "INGEST", $date ); - } - } - - { # MARC present - my $marc_present = - $mets->findvalue('count(//marc:record | //record)'); - mets_log( $namespace, $objid, "MARC", $marc_present ); - } - - { # METS valid - my ( $mets_valid, $error ) = - HTFeed::METS::validate_xml( { volume => $volume }, - $volume->get_repository_mets_path() ); - if ( !$mets_valid ) { - $error =~ s/\n/ /mg; - } - - mets_log( $namespace, $objid, "METS_VALID", $mets_valid, $error ); - } - - { - eval { - my 
%mdsecs = (); - foreach - my $mdsec ( $mets->findnodes('//mets:mdWrap | //mets:mdRef') ) - { - my @mdbits = (); - push( @mdbits, $mdsec->nodeName ); - foreach my $attr (qw(LABEL MDTYPE OTHERMDTYPE)) { - my $attrval = $mdsec->getAttribute($attr); - if ( $attrval and $attrval ne '' ) { - push( @mdbits, "$attr=$attrval" ); - } - } - mets_log( $namespace, $objid, "METS_MDSEC", - join( "; ", @mdbits ) ); - } - } - } - - { # Page tagging, image size - my $has_pagetags = $mets->findvalue( - 'count(//mets:div[@TYPE="page"]/@LABEL[string() != ""])'); - mets_log( $namespace, $objid, "PAGETAGS", $has_pagetags ); - my $pages = $mets->findvalue('count(//mets:div[@TYPE="page"])'); - mets_log( $namespace, $objid, "PAGES", $pages ); - - - my $image_size = $mets->findvalue('sum(//mets:fileGrp[@USE="image"]/mets:file/@SIZE)'); - mets_log( $namespace, $objid, "IMAGE_SIZE", $image_size); - - execute_stmt($update_mets,$pages,$image_size,$namespace,$objid); - - - } - - extract_source_mets($volume); - } - return $rval; -} - -sub set_status { - warn( join( " ", @_ ), "\n" ); - execute_stmt( $insert_detail, @_ ); -} - -sub execute_stmt { - my $stmt = shift; - my $dbh = get_dbh(); - my $sth = $dbh->prepare($stmt); - $sth->execute(@_); - return $sth; -} - -sub extract_source_mets { - my $volume = shift; - my $namespace = $volume->get_namespace(); - my $objid = $volume->get_objid(); - my $zipfile = $volume->get_repository_zip_path(); - my $pt_objid = $volume->get_pt_objid(); - my @srcmets = (); - - open( my $zipinfo, "unzip -l '$zipfile'|" ); - while (<$zipinfo>) { - chomp; - my @zipfields = split /\s+/; - if ( $zipfields[4] - and $zipfields[4] =~ /^\Q$pt_objid\E\/\w+_\Q$pt_objid\E.xml/i ) - { - push( @srcmets, $zipfields[4] ); - } - } - if ( !@srcmets ) { - set_status( $namespace, $objid, $zipfile, "NO_SOURCE_METS", undef ); - } - elsif ( @srcmets != 1 ) { - set_status( $namespace, $objid, $zipfile, - "MULTIPLE_SOURCE_METS_CANDIDATES", undef ); - } - else { - - # source METS found - 
mets_log( $namespace, $objid, "SOURCE_METS", $srcmets[0] ); - system("cd /tmp; unzip -j '$zipfile' '$srcmets[0]'"); - my ($smets_name) = ( $srcmets[0] =~ /\/([^\/]+)$/ ); - my $tmp_smets_loc = "/tmp/$smets_name"; - - eval { - my %mdsecs = (); - my $xpc = $volume->_parse_xpc($tmp_smets_loc); - $xpc->registerNs( 'gbs', "http://books.google.com/gbs" ); - foreach my $mdsec ( $xpc->findnodes('//mets:mdWrap') ) { - my @mdbits = (); - foreach my $attr (qw(LABEL MDTYPE OTHERMDTYPE)) { - my $attrval = $mdsec->getAttribute($attr); - if ( $attrval and $attrval ne '' ) { - push( @mdbits, "$attr=$attrval" ); - } - } - $mdsecs{ join( '; ', @mdbits ) } = 1; - } - foreach my $mdsec ( sort( keys(%mdsecs) ) ) { - mets_log( $namespace, $objid, "SRC_METS_MDSEC", $mdsec ); - } - - # Try to get Google reading order - foreach my $tag (qw(gbs:pageOrder gbs:pageSequence gbs:coverTag)) { - my $val = $xpc->findvalue("//$tag"); - mets_log( $namespace, $objid, "GBS_READING", $tag, $val ); - } - - foreach my $techmd ( $xpc->findnodes("//mets:techMD") ) { - if ( $techmd->getAttribute("ID") =~ /^IMAGE_METHOD/ ) { - my $imagemethod_id = $techmd->getAttribute("ID"); - my $method = - $xpc->findvalue( ".//gbs:imageMethod", $techmd ); - my $count = $xpc->findvalue( - "count(//mets:file[contains(\@ADMID,\"$imagemethod_id\")])" - ); - mets_log( $namespace, $objid, "IMAGE_METHOD", $method, - $count ); - } - } - - }; - if ($@) { - set_status( $namespace, $objid, $srcmets[0], "BAD_SOURCE_METS", - $@ ); - } - - unlink($tmp_smets_loc); - - } -} - -sub mets_log { - my $namespace = shift; - my $objid = shift; - my $key = shift; - my $val1 = shift; - my $val2 = shift; - $val1 = '' if not defined $val1; - $val2 = '' if not defined $val2; - print join( "\t", $namespace, $objid, $key, $val1, $val2 ), "\n"; - - #execute_stmt($fs_mets_data,$namespace,$objid,$key,$val1,$val2); -} - -sub is_tombstoned { - my $namespace = shift; - my $objid = shift; - my $sth = execute_stmt($tombstone_check,$namespace,$objid); - if(my 
@row = $sth->fetchrow_array()) { - return $row[0]; - } else { - return 0; - } -} - -sub recently_modified_path { - my $path = shift; - - my $mtime = ( stat($path) )[9]; - my $mtime_age = time() - $mtime; - - return 1 if $mtime_age < (86400 * 2); -} - -sub recent_previous_version { - my $file = shift; - - return unless $file =~ /.old$/; - - my $ctime = ( stat($file) )[10]; - my $ctime_age = time() - $ctime; - - return 1 if $ctime_age < (86400 * 2); - -} - -get_dbh()->disconnect(); -close(RUN); - -__END__
diff --git a/t/crawl_repo_mets.t b/t/crawl_repo_mets.t
new file mode 100644
index 00000000..59897d18
--- /dev/null
+++ b/t/crawl_repo_mets.t
@@ -0,0 +1,87 @@
+use strict;
+use warnings;
+
+require "$ENV{FEED_HOME}/bin/audit/crawl_repo_mets.pl";
+
+use Data::Dumper;
+use File::Copy;
+use File::Pairtree qw(id2ppath s2ppchars);
+use File::Path qw(make_path);
+use File::Spec;
+use Test::Spec;
+
+use HTFeed::Config qw(get_config);
+use HTFeed::DBTools qw(get_dbh);
+use HTFeed::Storage::LocalPairtree;
+
+
+describe "bin/audit/crawl_repo_mets.pl" => sub {
+
+  # crawl_repo_mets.pl reconfigures this in a way that breaks the test logger;
+  # set it back
+  use HTFeed::Log {root_logger => 'TRACE, string, screen'};
+  spec_helper 'storage_helper.pl';
+  local our ($tmpdirs, $testlog);
+
+  # Run the crawler's main() with the given command line arguments.
+  sub run_audit {
+    @ARGV = @_;
+    main();
+  }
+
+  # Copy a zip and a METS fixture from the fetch directory into the pairtree
+  # location of item test.test, dying if the item cannot be staged.
+  sub stage_item {
+    my $zip = shift;
+    my $mets = shift;
+
+    my $repo_root = $tmpdirs->{obj_dir};
+    my $item_dir = "$repo_root/sdr1/obj/test/pairtree_root/te/st/test";
+
+    my $fetch_dir = get_config('staging','fetch');
+
+    make_path($item_dir);
+    copy("$fetch_dir/$zip", "$item_dir/test.zip")
+      or die("Can't stage $zip: $!");
+    copy("$fetch_dir/$mets", "$item_dir/test.mets.xml")
+      or die("Can't stage $mets: $!");
+  }
+
+  # The date part of first_ingest_date recorded in feed_audit for test.test.
+  sub recorded_first_ingest_date {
+    return get_dbh()->selectrow_arrayref("SELECT date(first_ingest_date) FROM feed_audit WHERE namespace = 'test' and id = 'test'")->[0];
+  }
+
+  it "records a first ingest date in feed_audit" => sub {
+    stage_item('test.zip','test-oneingest.mets.xml');
+
+    run_audit("--mets",$tmpdirs->{obj_dir});
+
+    is(recorded_first_ingest_date(), '2012-10-09');
+  };
+
+  it "records the earlier ingest date if there are two in the mets" => sub {
+    stage_item('test.zip','test-twoingests.mets.xml');
+    run_audit("--mets",$tmpdirs->{obj_dir});
+
+    is(recorded_first_ingest_date(), '2012-10-09');
+  };
+
+  it "outputs OCR date" => sub {
+    stage_item('test-googlemets.zip','test-googlemets.mets.xml');
+    my $tmp_str = "";
+
+    {
+      # temporarily reopen stdout to output to $tmp_str
+      open(my $tmp_out, ">", \$tmp_str)
+        or die("Can't open in-memory output handle: $!");
+      local *STDOUT = $tmp_out;
+      run_audit("--src_mets",$tmpdirs->{obj_dir});
+    }
+
+    # like() reports the captured output on failure, unlike ok() on a match
+    like($tmp_str, qr/SRC_METS_PREMIS_EVENT\tOCR\t2025-09-10/m);
+  };
+};
+
+runtests unless caller;
diff --git a/t/fixtures/volumes/test-googlemets.mets.xml b/t/fixtures/volumes/test-googlemets.mets.xml new file mode 100644 index 00000000..16f1b6a5 --- /dev/null +++ b/t/fixtures/volumes/test-googlemets.mets.xml @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/t/fixtures/volumes/test-googlemets.zip b/t/fixtures/volumes/test-googlemets.zip new file mode 100644 index 00000000..a63dba4b Binary files /dev/null and b/t/fixtures/volumes/test-googlemets.zip differ diff --git a/t/fixtures/volumes/test-oneingest.mets.xml b/t/fixtures/volumes/test-oneingest.mets.xml new file mode 100644 index 00000000..0f0e0ff7 --- /dev/null +++ b/t/fixtures/volumes/test-oneingest.mets.xml @@ -0,0 +1,54 @@ + + + + + + + + + + + HathiTrust + test.test + + + + + test + test-ingestion + + ingestion + 2012-10-09T18:32:17Z + Ingestion of object package into repository + + HathiTrust Institution ID + umich + Executor + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/t/fixtures/volumes/test-twoingests.mets.xml b/t/fixtures/volumes/test-twoingests.mets.xml new file mode 100644 index 00000000..87be1260 --- /dev/null +++ b/t/fixtures/volumes/test-twoingests.mets.xml @@ -0,0 +1,68 @@ + + + + + + + 
+ + + + HathiTrust + test.test + + + + + test + test-ingestion1 + + ingestion + 2012-10-09T18:32:17Z + Ingestion of object package into repository + + HathiTrust Institution ID + umich + Executor + + + + + test + test-ingestion1 + + ingestion + 2024-01-01T11:00:00Z + Ingestion of object package into repository + + HathiTrust Institution ID + umich + Executor + + + + + + + + + + + + + + + + + + + + + + + + + + + +