Introduced temporary extraction
Change-Id: I05bbc04b3f17e9398ca31a977e591f5a24ce14df
diff --git a/Changes b/Changes
index bea5911..420f1df 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,6 @@
0.27 2017-04-07
- Support configuration files.
+ - Support temporary extraction.
0.26 2017-04-06
- Support wildcards on input.
diff --git a/Readme.pod b/Readme.pod
index b2cbd45..6366c81 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -215,10 +215,21 @@
Supported parameters are:
C<overwrite>, C<gzip>, C<jobs>,
C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
-C<output>, C<base-sentences>, C<base-paragraphs>,
+C<output>, C<base-sentences>, C<temporary-extract>, C<base-paragraphs>,
C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
(semicolon separated), C<anno> (semicolon separated).
+=item B<--temporary-extract|-te>
+
+Only valid for the C<archive> command.
+
+This will first extract all files into the
+given directory and then archive from there.
+If the directory is given as C<:temp:>,
+a temporary directory is used and removed on exit.
+This is especially useful to avoid
+massive unzipping and potential
+network latency.
=item B<--sigle|-sg>
@@ -320,6 +331,7 @@
Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Author: L<Nils Diewald|http://nils-diewald.de/>
+
Contributor: Eliza Margaretha
L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index 831d257..29b43a7 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -138,6 +138,76 @@
};
+sub extract_all {
+ my $self = shift;
+ my ($target_dir, $jobs) = @_;
+ # Unzip every attached archive completely into $target_dir
+ my @init_cmd = (
+ 'unzip', # Use unzip program
+ '-qo', # quietly overwrite all existing files
+ '-d', $target_dir # Extract into target directory
+ );
+
+ # Build one unzip call per attached archive
+ my @cmds;
+ foreach my $archive (@$self) {
+
+ # $archive->[0] is the zip file handed to unzip
+ my @cmd = @init_cmd;
+ push(@cmd, $archive->[0]); # Extract from zip
+
+ # Queue the call; _extract runs it (forked if $jobs > 1)
+ push @cmds, \@cmd;
+ };
+ # Returns whatever _extract returns (true on success)
+ $self->_extract($jobs, @cmds);
+};
+
+
+sub _extract {
+ my ($self, $jobs, @cmds) = @_;
+ # Run the given system calls (array refs), sequentially or forked
+ # No or one job: run all calls in this process
+ if (!$jobs || $jobs == 1) {
+ foreach (@cmds) {
+ system(@$_);
+
+ # Warn and abort with an empty return on the first failing call
+ if ($? != 0) {
+ carp("System call '" . join(' ', @$_) . "' errors " . $?);
+ return;
+ };
+ };
+ }
+
+ # Otherwise run the calls in up to $jobs forked children
+ else {
+ my $pool = Parallel::ForkManager->new($jobs);
+ $pool->run_on_finish(
+ sub {
+ my ($pid, $code) = @_;
+ my $data = pop; # Last argument is the reference given to finish()
+ print "Extract [\$$pid] " .
+ ($code ? " $code" : '') . " $$data\n";
+ }
+ );
+
+ ARCHIVE_LOOP:
+ foreach my $cmd (@cmds) {
+ my $pid = $pool->start and next ARCHIVE_LOOP; # Parent moves on, child runs below
+ system(@$cmd);
+ my $last = $cmd->[4]; # Fifth element labels the report (the zip for extract_all calls)
+ $pool->finish($?, \"$last");
+ };
+ $pool->wait_all_children;
+ };
+ # NOTE(review): exit codes of forked calls are only printed, never checked
+ # Fine
+ return 1;
+};
+
+
+
# Extract document files to a directory
sub extract_doc {
my $self = shift;
@@ -190,42 +260,7 @@
push @cmds, \@cmd;
};
- if (!$jobs || $jobs == 1) {
- foreach (@cmds) {
- system(@$_);
-
- # Check for return code
- if ($? != 0) {
- carp("System call '" . join(' ', @$_) . "' errors " . $?);
- return;
- };
- };
- }
-
- # Extract annotations in parallel
- else {
- my $pool = Parallel::ForkManager->new($jobs);
- $pool->run_on_finish(
- sub {
- my ($pid, $code) = @_;
- my $data = pop;
- print "Extract [\$$pid] " .
- ($code ? " $code" : '') . " $$data\n";
- }
- );
-
- ARCHIVE_LOOP:
- foreach my $cmd (@cmds) {
- my $pid = $pool->start and next ARCHIVE_LOOP;
- system(@$cmd);
- my $last = $cmd->[4];
- $pool->finish($?, \"$last");
- };
- $pool->wait_all_children;
- };
-
- # Fine
- return 1;
+ $self->_extract($jobs, @cmds);
};
diff --git a/script/korapxml2krill b/script/korapxml2krill
index a439fff..93e5eac 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -22,6 +22,8 @@
use Sys::Info;
use Sys::Info::Constants qw( :device_cpu );
use File::Glob ':bsd_glob';
+use File::Temp qw/tempdir/;
+
# use KorAP::XML::ForkPool;
# TODO: use Parallel::Loops
@@ -100,6 +102,7 @@
#
# 2017/04/07
# - support configuration option
+# - support for temporary extraction
#
# ----------------------------------------------------------
@@ -130,6 +133,7 @@
'base-paragraphs|bp=s' => \(my $base_paragraphs),
'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
'gzip|z' => \(my $gzip),
+ 'temporary-extract|te=s' => \(my $extract_dir),
'skip|s=s' => \@skip,
'sigle|sg=s' => \@sigle,
'cache|c=s' => \(my $cache_file),
@@ -183,6 +187,11 @@
$jobs = $config{jobs};
};
+ # temporary-extract
+ if (!defined($extract_dir) && defined $config{'temporary-extract'}) {
+ $extract_dir = $config{'temporary-extract'};
+ };
+
# Token base
if (!defined($token_base) && defined $config{token}) {
$token_base = $config{token};
@@ -451,7 +460,6 @@
};
};
-
# Glob files
if (@input) {
my @new_input = ();
@@ -497,10 +505,12 @@
unlink($cache_file) if $cache_delete;
stop_time;
-}
+ exit(1);
+};
+
# Extract XML files
-elsif ($cmd eq 'extract') {
+if ($cmd eq 'extract') {
# Create new archive object
if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
@@ -508,7 +518,7 @@
# Check zip capabilities
unless ($archive->test_unzip) {
print "Unzip is not installed or incompatible.\n\n";
- exit(1);
+ exit(0);
};
# Add further annotation archived
@@ -597,18 +607,59 @@
};
print "\n";
- exit(1);
+ # exit(1);
}
# Can't create archive object
else {
$log->error('Unable to extract from primary archive ' . $input[0]);
+ exit(1);
};
}
+
# Process an archive
elsif ($cmd eq 'archive') {
+ my $archive_output;
+
+ # First extract, then archive
+ if (defined $extract_dir) {
+
+ # Create new archive object
+ if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
+
+ # Check zip capabilities
+ unless ($archive->test_unzip) {
+ print "Unzip is not installed or incompatible.\n\n";
+ exit(0);
+ };
+
+ # Attach further annotation archives
+ $archive->attach($_) foreach @input[1..$#input];
+
+ # Create a temporary directory
+ if ($extract_dir eq ':temp:') {
+ $extract_dir = tempdir(CLEANUP => 1);
+ };
+
+ if ($archive->extract_all($extract_dir, $jobs)) {
+ @input = ($extract_dir);
+ }
+ else {
+ $log->error('Unable to extract from primary archive ' . $input[0] .
+ ' to ' . $extract_dir);
+ exit(1);
+ };
+ }
+
+ # Can't create archive object
+ else {
+ $log->error('Unable to extract from primary archive ' . $input[0]);
+ exit(1);
+ };
+ };
+
# TODO: Support sigles
# Zero means: everything runs in the parent process
@@ -767,13 +818,8 @@
print "Done.\n";
print timestr(timediff(Benchmark->new, $t))."\n\n";
-}
+};
-# Unknown command
-else {
- warn "Unknown command '$cmd'.\n\n";
- pod2usage(%ERROR_HASH);
-}
__END__
@@ -994,10 +1040,21 @@
Supported parameters are:
C<overwrite>, C<gzip>, C<jobs>,
C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
-C<output>, C<base-sentences>, C<base-paragraphs>,
+C<output>, C<base-sentences>, C<temporary-extract>, C<base-paragraphs>,
C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
(semicolon separated), C<anno> (semicolon separated).
+=item B<--temporary-extract|-te>
+
+Only valid for the C<archive> command.
+
+This will first extract all files into the
+given directory and then archive from there.
+If the directory is given as C<:temp:>,
+a temporary directory is used and removed on exit.
+This is especially useful to avoid
+massive unzipping and potential
+network latency.
=item B<--sigle|-sg>
@@ -1099,6 +1156,7 @@
Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Author: L<Nils Diewald|http://nils-diewald.de/>
+
Contributor: Eliza Margaretha
L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>