Fixed archive handling and support multiple jobs for extraction
Change-Id: I656cb0aa31c7139bf30b223928725ded195254a1
diff --git a/Changes b/Changes
index 7e4f606..93a1244 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,7 @@
-0.23 2016-10-27
+0.23 2016-10-28
- Added wildcard support for document extraction
+ - Fixed archive iteration to not duplicate the first archive
+ - Added parallel extraction for document sigles
0.22 2016-10-26
- Added support for document extraction
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index 714e876..37e09b5 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -140,8 +140,7 @@
# Extract document files to a directory
sub extract_doc {
my $self = shift;
- my $doc_path = shift;
- my $target_dir = shift;
+ my ($doc_path, $target_dir, $jobs) = @_;
my $first = 1;
@@ -152,6 +151,7 @@
);
my ($prefix, $corpus, $doc) = $self->split_path($doc_path . '/UNKNOWN' ) or return;
+ my @cmds;
# Iterate over all attached archives
foreach my $archive (@$self) {
@@ -170,7 +170,6 @@
if ($first) {
# Only extract from first file
push(@cmd, join('/', @breadcrumbs, 'header.xml'));
- # push(@cmd, join('/', @breadcrumbs, $doc, 'header.xml'));
$first = 0;
};
@@ -187,13 +186,41 @@
push(@cmd, join('/', @breadcrumbs));
# Run system call
- system(@cmd);
+ push @cmds, \@cmd;
+ };
- # Check for return code
- if ($? != 0) {
- carp("System call '" . join(' ', @cmd) . "' errors " . $?);
- return;
+ if (!$jobs || $jobs == 1) {
+ foreach (@cmds) {
+ system(@$_);
+
+ # Check for return code
+ if ($? != 0) {
+ carp("System call '" . join(' ', @$_) . "' errors " . $?);
+ return;
+ };
};
+ }
+
+ # Extract annotations in parallel
+ else {
+ my $pool = Parallel::ForkManager->new($jobs);
+ $pool->run_on_finish(
+ sub {
+ my ($pid, $code) = @_;
+ my $data = pop;
+ print "Extract [\$$pid] " .
+ ($code ? " $code" : '') . " $$data\n";
+ }
+ );
+
+ ARCHIVE_LOOP:
+ foreach my $cmd (@cmds) {
+ my $pid = $pool->start and next ARCHIVE_LOOP;
+ system(@$cmd);
+ my $last = $cmd->[4];
+ $pool->finish($?, \"$last");
+ };
+ $pool->wait_all_children;
};
# Fine
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 1a418af..cedd84e 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -75,6 +75,7 @@
#
# 1016/10/27
# - Added wildcard support for document extraction
+#
# ----------------------------------------------------------
our $LAST_CHANGE = '2016/10/27';
@@ -322,7 +323,7 @@
};
# Add further annotation archived
- $archive->attach($_) foreach @input;
+ $archive->attach($_) foreach @input[1..$#input];
my $prefix = 1;
@@ -351,8 +352,8 @@
# Sigle is a doc sigle
if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
- print "$_ ";
+ print "$_ ...\n";
# Check if a prefix is needed
unless ($prefix_check) {
$prefix = $archive->check_prefix;
@@ -360,9 +361,11 @@
};
# TODO: Make this OS independent
- print '' . (
+ my $path = ($prefix ? './' : '') . $_;
+
+ print '... ' . (
$archive->extract_doc(
- ($prefix ? './' : '') . $_, $output
+ $path, $output, $jobs
) ? '' : 'not '
);
print "extracted.\n";
@@ -376,10 +379,10 @@
# Iterate over all given sigles and extract
foreach (@sigle) {
- print "$_ ";
+ print "$_ ...\n";
# TODO: Make this OS independent
- print '' . (
+ print '... ' . (
$archive->extract_text(
($prefix ? './' : '') . $_, $output
) ? '' : 'not '
@@ -483,7 +486,7 @@
};
# Add further annotation archived
- $archive->attach($_) foreach @input;
+ $archive->attach($_) foreach @input[1..$#input];
print "Start processing ...\n";
$t = Benchmark->new;
diff --git a/t/script/extract.t b/t/script/extract.t
index 4e633e4..cc43bc8 100644
--- a/t/script/extract.t
+++ b/t/script/extract.t
@@ -49,12 +49,14 @@
'--output' => $output,
);
+my $sep = qr!\.\.\.[\n\r]+?\.\.\.!;
+
# Test without compression
stdout_like(
sub {
system($call);
},
- qr!TEST/BSP/1 extracted.!s,
+ qr!TEST/BSP/1 $sep extracted.!s,
$call
);
@@ -85,7 +87,7 @@
sub {
system($call);
},
- qr!TEST/BSP/4 extracted.!s,
+ qr!TEST/BSP/4 $sep extracted.!s,
$call
);
@@ -94,7 +96,7 @@
sub {
system($call);
},
- qr!TEST/BSP/5 extracted.!s,
+ qr!TEST/BSP/5 $sep extracted.!s,
$call
);
@@ -123,7 +125,7 @@
sub {
system($call);
},
- qr!REI/BNG extracted!s,
+ qr!REI/BNG $sep extracted!s,
$call
);
@@ -132,7 +134,7 @@
sub {
system($call);
},
- qr!REI/RBR extracted!s,
+ qr!REI/RBR $sep extracted!s,
$call
);
@@ -159,7 +161,7 @@
sub {
system($call);
},
- qr!REI/BN\* extracted!s,
+ qr!REI/BN\* $sep extracted!s,
$call
);
@@ -168,7 +170,7 @@
sub {
system($call);
},
- qr!REI/RBR extracted!s,
+ qr!REI/RBR $sep extracted!s,
$call
);
@@ -201,7 +203,7 @@
sub {
system($call);
},
- qr!WPD15/A00/00081 extracted.!s,
+ qr!WPD15/A00/00081 $sep extracted.!s,
$call
);