Finished implementation of the --to-tar flag

Change-Id: I54f6fddcc8392c51eab59e0e84a60fe2455bccd4
diff --git a/Changes b/Changes
index 06eeff2..28b58c3 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.29 2017-04-23
+        - Added support for --to-tar flag.
+
 0.28 2017-04-12
         - Improved overwriting behaviour for unzip.
         - Introduced --sequential-extraction flag.
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index e94baaa..ddc859d 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -16,7 +16,7 @@
 use Data::Dumper;
 use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
 
-our $VERSION = '0.28';
+our $VERSION = '0.29';
 
 has 'path';
 has [qw/text_sigle doc_sigle corpus_sigle/];
diff --git a/script/korapxml2krill b/script/korapxml2krill
index a6aa95f..7a7b8f7 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -28,6 +28,7 @@
 use String::Random qw(random_string);
 use IO::File;
 use Archive::Tar::Builder;
+use Fcntl qw(:flock SEEK_END);
 
 # use KorAP::XML::ForkPool;
 # TODO: use Parallel::Loops
@@ -780,6 +781,34 @@
   my $count = 0;  # Texts to process
   my $iter  = 1;  # Current text in process
 
+  my $tar_archive;
+  my $output_dir = $output;
+  my $tar_fh;
+
+  # Initialize tar archive
+  if ($to_tar) {
+    $tar_archive = Archive::Tar::Builder->new(
+      ignore_errors => 1
+    );
+
+    # Set output name
+    my $tar_file = $output;
+    unless ($tar_file =~ /\.tar$/) {
+      $tar_file .= '.tar';
+    };
+
+    # Initiate the tar file
+    print "Writing to file $tar_file\n";
+    $tar_fh = IO::File->new($tar_file, 'w');
+    $tar_fh->binmode(1);
+
+    # Set handle
+    $tar_archive->set_handle($tar_fh);
+
+    # Output to temporary directory
+    $output_dir = File::Temp->newdir;
+  };
+
   # Report on fork message
   $pool->run_on_finish (
     sub {
@@ -790,6 +819,25 @@
         ($iter++) . "/$count]" .
         ($code ? " $code" : '') .
         ' ' . $data->[0] . "\n";
+
+      if (!$code && $to_tar && $data->[2]) {
+        my $filename = $data->[2];
+
+        # Lock filehandle
+        if (flock($tar_fh, LOCK_EX)) {
+
+          # Archive and remove file
+          $tar_archive->archive($filename);
+          unlink $filename;
+
+          # Unlock filehandle
+          flock($tar_fh, LOCK_UN);
+        }
+        else {
+          $log->warn("Unable to add $filename to archive");
+        };
+      };
+
       $data->[1] = undef if $data->[1];
     }
   );
@@ -807,33 +855,6 @@
   #    exit(1);
   #  };
 
-  my $tar_archive;
-  my $output_dir = $output;
-
-  # Initialize tar archive
-  if ($to_tar) {
-    $tar_archive = Archive::Tar::Builder->new(
-      ignore_errors => 1
-    );
-
-    # Set output name
-    my $tar_file = $output;
-    unless ($tar_file =~ /\.tar$/) {
-      $tar_file .= '.tar';
-    };
-
-    # Initiate the tar file
-    print "Writing to file $tar_file\n";
-    my $fh = IO::File->new($tar_file, 'w');
-    $fh->binmode(1);
-
-    # Set handle
-    $tar_archive->set_handle($fh);
-
-    # Output to temporary directory
-    $output_dir = File::Temp->newdir;
-  };
-
 
   # Input is a directory
   if (-d $input[0]) {
@@ -866,16 +887,13 @@
       $pool->start and next DIRECTORY_LOOP;
 
       if (my $return = $batch_file->process($dirs[$i] => $filename)) {
-
-        # Add to tar archive
-        if ($to_tar) {
-          $tar_archive->archive($filename);
-          unlink $filename;
-        };
-
         $pool->finish(
           0,
-          ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
+          [
+            "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
+            undef,
+            $filename
+          ]
         );
       }
       else {
@@ -935,16 +953,14 @@
         # Write file
         if (my $return = $batch_file->process($dir => $filename)) {
 
-          # Add to tar archive
-          if ($to_tar) {
-            $tar_archive->archive($filename);
-            unlink $filename;
-          };
-
           # Delete temporary file
           $pool->finish(
             0,
-            ["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
+            [
+              "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
+              $temp,
+              $filename
+            ]
           );
           #$pool->finish(0, ["Processed " . $filename, $temp]);
         }
@@ -970,6 +986,13 @@
   # Delete cache file
   unlink($cache_file) if $cache_delete;
 
+  # Close tar filehandle
+  if ($to_tar && $tar_fh) {
+    $tar_archive->finish;
+    $tar_fh->close;
+    print "Wrote to tar archive.\n";
+  };
+
   print timestr(timediff(Benchmark->new, $t))."\n";
   print "Done.\n";
 };
diff --git a/t/script/archive_tar.t b/t/script/archive_tar.t
index aee6542..666e2c5 100644
--- a/t/script/archive_tar.t
+++ b/t/script/archive_tar.t
@@ -58,9 +58,10 @@
 # Test without parameters
 my $combined = combined_from( sub { system($call) });
 
-diag $combined;
+like($combined, qr!Input is .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is, 'Input is fine');
 
-#qr!Input is .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is,
+like($combined, qr!Writing to file .+?\.tar!, 'Write out');
+like($combined, qr!Wrote to tar archive!, 'Write out');