Introduced temporary extraction

Change-Id: I05bbc04b3f17e9398ca31a977e591f5a24ce14df
diff --git a/script/korapxml2krill b/script/korapxml2krill
index a439fff..93e5eac 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -22,6 +22,8 @@
 use Sys::Info;
 use Sys::Info::Constants qw( :device_cpu );
 use File::Glob ':bsd_glob';
+use File::Temp qw/tempdir/;
+
 
 # use KorAP::XML::ForkPool;
 # TODO: use Parallel::Loops
@@ -100,6 +102,7 @@
 #
 # 2017/04/07
 # - support configuration option
+# - support for temporary extraction
 #
 # ----------------------------------------------------------
 
@@ -130,6 +133,7 @@
   'base-paragraphs|bp=s'  => \(my $base_paragraphs),
   'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
   'gzip|z'      => \(my $gzip),
+  'temporary-extract|te=s' => \(my $extract_dir),
   'skip|s=s'    => \@skip,
   'sigle|sg=s'  => \@sigle,
   'cache|c=s'   => \(my $cache_file),
@@ -183,6 +187,11 @@
     $jobs = $config{jobs};
   };
 
+  # temporary-extract
+  if (!defined($extract_dir) && defined $config{'temporary-extract'}) {
+    $extract_dir = $config{'temporary-extract'};
+  };
+
   # Token base
   if (!defined($token_base) && defined $config{token}) {
     $token_base = $config{token};
@@ -451,7 +460,6 @@
   };
 };
 
-
 # Glob files
 if (@input) {
   my @new_input = ();
@@ -497,10 +505,12 @@
   unlink($cache_file) if $cache_delete;
 
   stop_time;
-}
+  exit(1);
+};
+
 
 # Extract XML files
-elsif ($cmd eq 'extract') {
+if ($cmd eq 'extract') {
 
   # Create new archive object
   if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
@@ -508,7 +518,7 @@
     # Check zip capabilities
     unless ($archive->test_unzip) {
       print "Unzip is not installed or incompatible.\n\n";
-      exit(1);
+      exit(0);
     };
 
     # Add further annotation archived
@@ -597,18 +607,59 @@
     };
 
     print "\n";
-    exit(1);
+    # exit(1);
   }
 
   # Can't create archive object
   else {
     $log->error('Unable to extract from primary archive ' . $input[0]);
+    exit(1);
   };
 }
 
+
 # Process an archive
 elsif ($cmd eq 'archive') {
 
+  my $archive_output;
+
+  # First extract, then archive
+  if (defined $extract_dir) {
+
+    # Create new archive object
+    if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
+
+      # Check zip capabilities
+      unless ($archive->test_unzip) {
+        print "Unzip is not installed or incompatible.\n\n";
+        exit(0);
+      };
+
+      # Add further annotation archives
+      $archive->attach($_) foreach @input[1..$#input];
+
+      # Create a temporary directory
+      if ($extract_dir eq ':temp:') {
+        $extract_dir = tempdir(CLEANUP => 1);
+      };
+
+      if ($archive->extract_all($extract_dir, $jobs)) {
+        @input = ($extract_dir);
+      }
+      else {
+        $log->error('Unable to extract from primary archive ' . $input[0] .
+                      ' to ' . $extract_dir);
+        exit(1);
+      };
+    }
+
+    # Can't create archive object
+    else {
+      $log->error('Unable to extract from primary archive ' . $input[0]);
+      exit(1);
+    };
+  };
+
   # TODO: Support sigles
 
   # Zero means: everything runs in the parent process
@@ -767,13 +818,8 @@
 
   print "Done.\n";
   print timestr(timediff(Benchmark->new, $t))."\n\n";
-}
+};
 
-# Unknown command
-else {
-  warn "Unknown command '$cmd'.\n\n";
-  pod2usage(%ERROR_HASH);
-}
 
 __END__
 
@@ -994,10 +1040,21 @@
 Supported parameters are:
 C<overwrite>, C<gzip>, C<jobs>,
 C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
-C<output>, C<base-sentences>, C<base-paragraphs>,
+C<output>, C<base-sentences>, C<temporary-extract>, C<base-paragraphs>,
 C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
 (semicolon separated), C<anno> (semicolon separated).
 
+=item B<--temporary-extract|-te>
+
+Only valid for the C<archive> command.
+
+This will first extract all files into the
+given directory and then run the C<archive> command on it.
+If the directory is given as C<:temp:>,
+a temporary directory is created (and removed on exit).
+This is especially useful to avoid
+massive unzipping and potential
+network latency.
 
 =item B<--sigle|-sg>
 
@@ -1099,6 +1156,7 @@
 Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
 
 Author: L<Nils Diewald|http://nils-diewald.de/>
+
 Contributor: Eliza Margaretha
 
 L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>