Improved tar support

Change-Id: I318b6f18e571c81a34752911bc9d009d726c7d14
diff --git a/MANIFEST b/MANIFEST
index 812c953..65f5f5a 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -102,6 +102,7 @@
 t/script/usage.t
 t/script/extract.t
 t/script/archive.t
+t/script/archive_tar.t
 t/script/config.t
 t/script/base.t
 t/corpus/archive.zip
diff --git a/Makefile.PL b/Makefile.PL
index ab696f0..7eec5c1 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -40,7 +40,8 @@
     'Sys::Info'       => 0.78,
     'Config::Simple'  => 4.58,
     'String::Random'  => 0.29,
-    'File::Path'      => 2.12
+    'File::Path'      => 2.12,
+    'Archive::Tar::Builder' => 2.5002
   },
   MIN_PERL_VERSION => '5.014',
   test => {
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 226c35a..a6aa95f 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -157,7 +157,7 @@
   'primary|p!'  => \(my $primary),
   'pretty|y'    => \(my $pretty),
   'jobs|j=i'    => \(my $jobs),
-  'to-tar=s'       => \(my $to_tar),
+  'to-tar'      => \(my $to_tar),
   'sequential-extraction|se' => \(my $sequential_extraction),
   'cache-size|cs=s'  => \(my $cache_size),
   'cache-delete|cd!' => \(my $cache_delete),
@@ -350,7 +350,7 @@
 # Start serial processing
 if ($cmd eq 'serial') {
 
-  if ($output && (!-e $output || !-d $output)) {
+  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
     print "Directory '$output' does not exist.\n\n";
     exit(0);
   };
@@ -383,7 +383,7 @@
     # This will create a directory
     my $new_out = catdir($output, get_file_name_from_glob($_));
 
-    # Create new path
+    # Create new path, in case the output is not meant to be tarred
     unless ($to_tar) {
       if (make_path($new_out) == 0 && !-d $new_out) {
         $log->error("Can\'t create path $new_out");
@@ -558,7 +558,7 @@
 s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
 
 if ($cmd) {
-  if ($output && (!-e $output || !-d $output)) {
+  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
     print "Directory '$output' does not exist.\n\n";
     exit(0);
   };
@@ -807,6 +807,34 @@
   #    exit(1);
   #  };
 
+  my $tar_archive;
+  my $output_dir = $output;
+
+  # Initialize tar archive
+  if ($to_tar) {
+    $tar_archive = Archive::Tar::Builder->new(
+      ignore_errors => 1
+    );
+
+    # Set output name
+    my $tar_file = $output;
+    unless ($tar_file =~ /\.tar$/) {
+      $tar_file .= '.tar';
+    };
+
+    # Initiate the tar file
+    print "Writing to file $tar_file\n";
+    my $fh = IO::File->new($tar_file, 'w');
+    $fh->binmode(1);
+
+    # Set handle
+    $tar_archive->set_handle($fh);
+
+    # Output to temporary directory
+    $output_dir = File::Temp->newdir;
+  };
+
+
   # Input is a directory
   if (-d $input[0]) {
     my $it = Directory::Iterator->new($input[0]);
@@ -826,26 +854,6 @@
     $t = Benchmark->new;
     $count = scalar @dirs;
 
-    my $tar_archive;
-    my $output_dir = $output;
-    if ($to_tar) {
-      $tar_archive = Archive::Tar::Builder->new(
-        ignore_errors => 1
-      );
-
-      # Set output name
-      my $tar_file = $output;
-      unless ($tar_file =~ /\.tar$/) {
-        $tar_file .= '.tar';
-      };
-      my $fh = IO::File->new($tar_file, 'w');
-      $fh->binmode(1);
-
-      # Set handle
-      $tar_archive->set_handle($fh);
-      $output_dir = File::Temp->newdir;
-    };
-
   DIRECTORY_LOOP:
     for (my $i = 0; $i < $count; $i++) {
 
@@ -858,16 +866,17 @@
       $pool->start and next DIRECTORY_LOOP;
 
       if (my $return = $batch_file->process($dirs[$i] => $filename)) {
-        $pool->finish(
-          0,
-          ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
-        );
 
         # Add to tar archive
         if ($to_tar) {
           $tar_archive->archive($filename);
           unlink $filename;
         };
+
+        $pool->finish(
+          0,
+          ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
+        );
       }
       else {
         $pool->finish(1, ["Unable to process " . $dirs[$i]]);
@@ -898,7 +907,7 @@
       my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
 
       my $filename = catfile(
-        $output,
+        $output_dir,
         get_file_name(
           catfile($corpus, $doc, $text)
             . '.json' . ($gzip ? '.gz' : '')
@@ -925,6 +934,13 @@
 
         # Write file
         if (my $return = $batch_file->process($dir => $filename)) {
+
+          # Add to tar archive
+          if ($to_tar) {
+            $tar_archive->archive($filename);
+            unlink $filename;
+          };
+
           # Delete temporary file
           $pool->finish(
             0,
diff --git a/t/script/archive_tar.t b/t/script/archive_tar.t
new file mode 100644
index 0000000..aee6542
--- /dev/null
+++ b/t/script/archive_tar.t
@@ -0,0 +1,68 @@
+#/usr/bin/env perl
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/tempdir/;
+use Mojo::File;
+use Mojo::Util qw/quote/;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+use Test::Output qw/:stdout :stderr :functions/;
+use Data::Dumper;
+use KorAP::XML::Archive;
+use utf8;
+
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
+
+my $call = join(
+  ' ',
+  'perl', $script,
+  'archive'
+);
+
+unless (KorAP::XML::Archive::test_unzip) {
+  plan skip_all => 'unzip not found';
+};
+
+# Test without parameters
+stdout_like(
+  sub {
+    system($call);
+  },
+  qr!archive.+?\$ korapxml2krill!s,
+  $call
+);
+
+my $input = catfile($f, '..', 'corpus', 'archive.zip');
+ok(-f $input, 'Input archive found');
+
+my $output = File::Temp->new;
+
+ok(-f $output, 'Output directory exists');
+
+my $input_quotes = "'".catfile($f, '..', 'corpus', 'archives', 'wpd15*.zip') . "'";
+
+$call = join(
+  ' ',
+  'perl', $script,
+  'archive',
+  '--input' => $input_quotes,
+  '--output' => $output . '.tar',
+  '-t' => 'Base#tokens_aggr',
+  '--to-tar'
+);
+
+# Test without parameters
+my $combined = combined_from( sub { system($call) });
+
+diag $combined;
+
+#qr!Input is .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is,
+
+
+
+done_testing;
+__END__