Introduced temporary extraction

Change-Id: I05bbc04b3f17e9398ca31a977e591f5a24ce14df
diff --git a/Changes b/Changes
index bea5911..420f1df 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,6 @@
 0.27 2017-04-07
         - Support configuration files.
+        - Support temporary extraction.
 
 0.26 2017-04-06
         - Support wildcards on input.
diff --git a/Readme.pod b/Readme.pod
index b2cbd45..6366c81 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -215,10 +215,21 @@
 Supported parameters are:
 C<overwrite>, C<gzip>, C<jobs>,
 C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
-C<output>, C<base-sentences>, C<base-paragraphs>,
+C<output>, C<base-sentences>, C<temporary-extract>, C<base-paragraphs>,
 C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
 (semicolon separated), C<anno> (semicolon separated).
 
+=item B<--temporary-extract|-te>
+
+Only valid for the C<archive> command.
+
+This will first extract all files into a
+directory and then archive from that directory.
+If the directory is given as C<:temp:>,
+a temporary directory is used.
+This is especially useful to avoid
+massive unzipping and potential
+network latency.
 
 =item B<--sigle|-sg>
 
@@ -320,6 +331,7 @@
 Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
 
 Author: L<Nils Diewald|http://nils-diewald.de/>
+
 Contributor: Eliza Margaretha
 
 L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index 831d257..29b43a7 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -138,6 +138,76 @@
 };
 
 
+sub extract_all { # Unzip every archive held in @$self completely into $target_dir
+  my $self = shift;
+  my ($target_dir, $jobs) = @_; # $jobs is handed on to _extract unchanged
+  # Command prefix shared by all archives; the zip path is appended per archive
+  my @init_cmd = (
+    'unzip',          # Use unzip program
+    '-qo',            # quietly overwrite all existing files
+    '-d', $target_dir # Extract into target directory
+  );
+
+  # Iterate over all attached archives
+  my @cmds;
+  foreach my $archive (@$self) {
+
+    # Element 0 of each entry is handed to unzip as the zip file
+    my @cmd = @init_cmd;
+    push(@cmd, $archive->[0]); # Extract from zip
+
+    # Queue the command; _extract performs the actual system calls
+    push @cmds, \@cmd;
+  };
+  # The value of _extract is the return value (last evaluated statement)
+  $self->_extract($jobs, @cmds);
+};
+
+
+sub _extract { # Run prepared extraction commands sequentially or in parallel
+  my ($self, $jobs, @cmds) = @_; # each element of @cmds is an array ref: program plus arguments
+  # Returns 1 at the end; returns nothing as soon as a sequential call fails
+  # No job count or exactly one job: run the commands one after another
+  if (!$jobs || $jobs == 1) {
+    foreach (@cmds) {
+      system(@$_);
+
+      # A non-zero $? means the call failed: warn and stop with an empty return
+      if ($? != 0) {
+        carp("System call '" . join(' ', @$_) . "' errors " . $?);
+        return;
+      };
+    };
+  }
+
+  # Otherwise run the commands in child processes managed by Parallel::ForkManager
+  else {
+    my $pool = Parallel::ForkManager->new($jobs);
+    $pool->run_on_finish(
+      sub {
+        my ($pid, $code) = @_;
+        my $data = pop; # last callback argument; presumably the reference given to finish()
+        print "Extract [\$$pid] " .
+          ($code ? " $code" : '') . " $$data\n";
+      }
+    );
+
+  ARCHIVE_LOOP:
+    foreach my $cmd (@cmds) {
+      my $pid = $pool->start and next ARCHIVE_LOOP; # true value: skip to the next command
+      system(@$cmd);
+      my $last = $cmd->[4]; # index 4 is the zip path for commands built by extract_all
+      $pool->finish($?, \"$last"); # NOTE(review): $? is the raw wait status, not $? >> 8 - confirm
+    };
+    $pool->wait_all_children;
+  };
+
+  # Fine - NOTE(review): the parallel branch only prints child codes and still ends here
+  return 1;
+};
+
+
+
 # Extract document files to a directory
 sub extract_doc {
   my $self = shift;
@@ -190,42 +260,7 @@
     push @cmds, \@cmd;
   };
 
-  if (!$jobs || $jobs == 1) {
-    foreach (@cmds) {
-      system(@$_);
-
-      # Check for return code
-      if ($? != 0) {
-        carp("System call '" . join(' ', @$_) . "' errors " . $?);
-        return;
-      };
-    };
-  }
-
-  # Extract annotations in parallel
-  else {
-    my $pool = Parallel::ForkManager->new($jobs);
-    $pool->run_on_finish(
-      sub {
-        my ($pid, $code) = @_;
-        my $data = pop;
-        print "Extract [\$$pid] " .
-          ($code ? " $code" : '') . " $$data\n";
-      }
-    );
-
-  ARCHIVE_LOOP:
-    foreach my $cmd (@cmds) {
-      my $pid = $pool->start and next ARCHIVE_LOOP;
-      system(@$cmd);
-      my $last = $cmd->[4];
-      $pool->finish($?, \"$last");
-    };
-    $pool->wait_all_children;
-  };
-
-  # Fine
-  return 1;
+  $self->_extract($jobs, @cmds);
 };
 
 
diff --git a/script/korapxml2krill b/script/korapxml2krill
index a439fff..93e5eac 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -22,6 +22,8 @@
 use Sys::Info;
 use Sys::Info::Constants qw( :device_cpu );
 use File::Glob ':bsd_glob';
+use File::Temp qw/tempdir/;
+
 
 # use KorAP::XML::ForkPool;
 # TODO: use Parallel::Loops
@@ -100,6 +102,7 @@
 #
 # 2017/04/07
 # - support configuration option
+# - support for temporary extraction
 #
 # ----------------------------------------------------------
 
@@ -130,6 +133,7 @@
   'base-paragraphs|bp=s'  => \(my $base_paragraphs),
   'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
   'gzip|z'      => \(my $gzip),
+  'temporary-extract|te=s' => \(my $extract_dir),
   'skip|s=s'    => \@skip,
   'sigle|sg=s'  => \@sigle,
   'cache|c=s'   => \(my $cache_file),
@@ -183,6 +187,11 @@
     $jobs = $config{jobs};
   };
 
+  # temporary-extract
+  if (!defined($extract_dir) && defined $config{'temporary-extract'}) {
+    $extract_dir = $config{'temporary-extract'};
+  };
+
   # Token base
   if (!defined($token_base) && defined $config{token}) {
     $token_base = $config{token};
@@ -451,7 +460,6 @@
   };
 };
 
-
 # Glob files
 if (@input) {
   my @new_input = ();
@@ -497,10 +505,12 @@
   unlink($cache_file) if $cache_delete;
 
   stop_time;
-}
+  exit(1);
+};
+
 
 # Extract XML files
-elsif ($cmd eq 'extract') {
+if ($cmd eq 'extract') {
 
   # Create new archive object
   if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
@@ -508,7 +518,7 @@
     # Check zip capabilities
     unless ($archive->test_unzip) {
       print "Unzip is not installed or incompatible.\n\n";
-      exit(1);
+      exit(0);
     };
 
     # Add further annotation archived
@@ -597,18 +607,59 @@
     };
 
     print "\n";
-    exit(1);
+    # exit(1);
   }
 
   # Can't create archive object
   else {
     $log->error('Unable to extract from primary archive ' . $input[0]);
+    exit(1);
   };
 }
 
+
 # Process an archive
 elsif ($cmd eq 'archive') {
 
+  my $archive_output;
+
+  # First extract, then archive
+  if (defined $extract_dir) {
+
+    # Create new archive object
+    if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
+
+      # Check zip capabilities
+      unless ($archive->test_unzip) {
+        print "Unzip is not installed or incompatible.\n\n";
+        exit(0);
+      };
+
+      # Add further annotation archives
+      $archive->attach($_) foreach @input[1..$#input];
+
+      # Create a temporary directory
+      if ($extract_dir eq ':temp:') {
+        $extract_dir = tempdir(CLEANUP => 1);
+      };
+
+      if ($archive->extract_all($extract_dir, $jobs)) {
+        @input = ($extract_dir);
+      }
+      else {
+        $log->error('Unable to extract from primary archive ' . $input[0] .
+                      ' to ' . $extract_dir);
+        exit(1);
+      };
+    }
+
+    # Can't create archive object
+    else {
+      $log->error('Unable to extract from primary archive ' . $input[0]);
+      exit(1);
+    };
+  };
+
   # TODO: Support sigles
 
   # Zero means: everything runs in the parent process
@@ -767,13 +818,8 @@
 
   print "Done.\n";
   print timestr(timediff(Benchmark->new, $t))."\n\n";
-}
+};
 
-# Unknown command
-else {
-  warn "Unknown command '$cmd'.\n\n";
-  pod2usage(%ERROR_HASH);
-}
 
 __END__
 
@@ -994,10 +1040,21 @@
 Supported parameters are:
 C<overwrite>, C<gzip>, C<jobs>,
 C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
-C<output>, C<base-sentences>, C<base-paragraphs>,
+C<output>, C<base-sentences>, C<temporary-extract>, C<base-paragraphs>,
 C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
 (semicolon separated), C<anno> (semicolon separated).
 
+=item B<--temporary-extract|-te>
+
+Only valid for the C<archive> command.
+
+This will first extract all files into a
+directory and then archive from that directory.
+If the directory is given as C<:temp:>,
+a temporary directory is used.
+This is especially useful to avoid
+massive unzipping and potential
+network latency.
 
 =item B<--sigle|-sg>
 
@@ -1099,6 +1156,7 @@
 Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
 
 Author: L<Nils Diewald|http://nils-diewald.de/>
+
 Contributor: Eliza Margaretha
 
 L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>