Fixed archive handling and support multiple jobs for extraction

Change-Id: I656cb0aa31c7139bf30b223928725ded195254a1
diff --git a/Changes b/Changes
index 7e4f606..93a1244 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,7 @@
-0.23 2016-10-27
+0.23 2016-10-28
         - Added wildcard support for document extraction
+        - Fixed archive iteration to not duplicate the first archive
+        - Added parallel extraction for document sigles
 
 0.22 2016-10-26
         - Added support for document extraction
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index 714e876..37e09b5 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -140,8 +140,7 @@
 # Extract document files to a directory
 sub extract_doc {
   my $self = shift;
-  my $doc_path = shift;
-  my $target_dir = shift;
+  my ($doc_path, $target_dir, $jobs) = @_;
 
   my $first = 1;
 
@@ -152,6 +151,7 @@
   );
 
   my ($prefix, $corpus, $doc) = $self->split_path($doc_path . '/UNKNOWN' ) or return;
+  my @cmds;
 
   # Iterate over all attached archives
   foreach my $archive (@$self) {
@@ -170,7 +170,6 @@
     if ($first) {
       # Only extract from first file
       push(@cmd, join('/', @breadcrumbs, 'header.xml'));
-      # push(@cmd, join('/', @breadcrumbs, $doc, 'header.xml'));
       $first = 0;
     };
 
@@ -187,13 +186,41 @@
     push(@cmd, join('/', @breadcrumbs));
 
     # Run system call
-    system(@cmd);
+    push @cmds, \@cmd;
+  };
 
-    # Check for return code
-    if ($? != 0) {
-      carp("System call '" . join(' ', @cmd) . "' errors " . $?);
-      return;
+  if (!$jobs || $jobs == 1) {
+    foreach (@cmds) {
+      system(@$_);
+
+      # Check for return code
+      if ($? != 0) {
+        carp("System call '" . join(' ', @$_) . "' errors " . $?);
+        return;
+      };
     };
+  }
+
+  # Extract annotations in parallel
+  else {
+    my $pool = Parallel::ForkManager->new($jobs);
+    $pool->run_on_finish(
+      sub {
+        my ($pid, $code) = @_;
+        my $data = pop;
+        print "Extract [\$$pid] " .
+          ($code ? " $code" : '') . " $$data\n";
+      }
+    );
+
+  ARCHIVE_LOOP:
+    foreach my $cmd (@cmds) {
+      my $pid = $pool->start and next ARCHIVE_LOOP;
+      system(@$cmd);
+      my $last = $cmd->[4];
+      $pool->finish($?, \"$last");
+    };
+    $pool->wait_all_children;
   };
 
   # Fine
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 1a418af..cedd84e 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -75,6 +75,7 @@
 #
 # 1016/10/27
 # - Added wildcard support for document extraction
+#
 # ----------------------------------------------------------
 
 our $LAST_CHANGE = '2016/10/27';
@@ -322,7 +323,7 @@
     };
 
     # Add further annotation archived
-    $archive->attach($_) foreach @input;
+    $archive->attach($_) foreach @input[1..$#input];
 
     my $prefix = 1;
 
@@ -351,8 +352,8 @@
 
         # Sigle is a doc sigle
         if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
-          print "$_ ";
 
+          print "$_ ...\n";
           # Check if a prefix is needed
           unless ($prefix_check) {
             $prefix = $archive->check_prefix;
@@ -360,9 +361,11 @@
           };
 
           # TODO: Make this OS independent
-          print '' . (
+          my $path = ($prefix ? './' : '') . $_;
+
+          print '... ' . (
             $archive->extract_doc(
-              ($prefix ? './' : '') . $_, $output
+              $path, $output, $jobs
             ) ? '' : 'not '
           );
           print "extracted.\n";
@@ -376,10 +379,10 @@
 
     # Iterate over all given sigles and extract
     foreach (@sigle) {
-      print "$_ ";
+      print "$_ ...\n";
 
       # TODO: Make this OS independent
-      print '' . (
+      print '... ' . (
         $archive->extract_text(
           ($prefix ? './' : '') . $_, $output
         ) ? '' : 'not '
@@ -483,7 +486,7 @@
     };
 
     # Add further annotation archived
-    $archive->attach($_) foreach @input;
+    $archive->attach($_) foreach @input[1..$#input];
 
     print "Start processing ...\n";
     $t = Benchmark->new;
diff --git a/t/script/extract.t b/t/script/extract.t
index 4e633e4..cc43bc8 100644
--- a/t/script/extract.t
+++ b/t/script/extract.t
@@ -49,12 +49,14 @@
   '--output' => $output,
 );
 
+my $sep = qr!\.\.\.[\n\r]+?\.\.\.!;
+
 # Test without compression
 stdout_like(
   sub {
     system($call);
   },
-  qr!TEST/BSP/1 extracted.!s,
+  qr!TEST/BSP/1 $sep extracted.!s,
   $call
 );
 
@@ -85,7 +87,7 @@
   sub {
     system($call);
   },
-  qr!TEST/BSP/4 extracted.!s,
+  qr!TEST/BSP/4 $sep extracted.!s,
   $call
 );
 
@@ -94,7 +96,7 @@
   sub {
     system($call);
   },
-  qr!TEST/BSP/5 extracted.!s,
+  qr!TEST/BSP/5 $sep extracted.!s,
   $call
 );
 
@@ -123,7 +125,7 @@
   sub {
     system($call);
   },
-  qr!REI/BNG extracted!s,
+  qr!REI/BNG $sep extracted!s,
   $call
 );
 
@@ -132,7 +134,7 @@
   sub {
     system($call);
   },
-  qr!REI/RBR extracted!s,
+  qr!REI/RBR $sep extracted!s,
   $call
 );
 
@@ -159,7 +161,7 @@
   sub {
     system($call);
   },
-  qr!REI/BN\* extracted!s,
+  qr!REI/BN\* $sep extracted!s,
   $call
 );
 
@@ -168,7 +170,7 @@
   sub {
     system($call);
   },
-  qr!REI/RBR extracted!s,
+  qr!REI/RBR $sep extracted!s,
   $call
 );
 
@@ -201,7 +203,7 @@
   sub {
     system($call);
   },
-  qr!WPD15/A00/00081 extracted.!s,
+  qr!WPD15/A00/00081 $sep extracted.!s,
   $call
 );