Added preliminary tar support

Change-Id: Id34f301b320e8bc5d4a34f07754f76d6c135bfd7
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 1b994c2..226c35a 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -26,6 +26,8 @@
 use File::Path qw(remove_tree make_path);
 use Mojo::Collection 'c';
 use String::Random qw(random_string);
+use IO::File;
+use Archive::Tar::Builder;
 
 # use KorAP::XML::ForkPool;
 # TODO: use Parallel::Loops
@@ -155,6 +157,7 @@
   'primary|p!'  => \(my $primary),
   'pretty|y'    => \(my $pretty),
   'jobs|j=i'    => \(my $jobs),
+  'to-tar'      => \(my $to_tar),  # Boolean switch; the tar name is derived from --output
   'sequential-extraction|se' => \(my $sequential_extraction),
   'cache-size|cs=s'  => \(my $cache_size),
   'cache-delete|cd!' => \(my $cache_delete),
@@ -263,6 +266,11 @@
     $base_pagebreaks = $config{'base-pagebreaks'} ;
   };
 
+  # Write to tar
+  if (!(defined $to_tar) && defined $config{'to-tar'}) {
+    $to_tar = $config{'to-tar'} ;
+  };
+
   # Log
   if (!(defined $log_level) && defined $config{'log'}) {
     $log_level = $config{'log'} ;
@@ -372,12 +380,15 @@
   # Iterate over all inputs
   foreach (@input) {
 
+    # Output path for this input (only created as a directory when not writing to tar)
     my $new_out = catdir($output, get_file_name_from_glob($_));
 
     # Create new path
-    if (make_path($new_out) == 0 && !-d $new_out) {
-      $log->error("Can\'t create path $new_out");
-      exit(0);
+    unless ($to_tar) {
+      if (make_path($new_out) == 0 && !-d $new_out) {
+        $log->error("Can\'t create path $new_out");
+        exit(0);
+      };
     };
 
     # Create archive command
@@ -815,11 +826,31 @@
     $t = Benchmark->new;
     $count = scalar @dirs;
 
+    my $tar_archive;
+    my $output_dir = $output;
+    if ($to_tar) {
+      $tar_archive = Archive::Tar::Builder->new(
+        ignore_errors => 1
+      );
+
+      # Set output name
+      my $tar_file = $output;
+      unless ($tar_file =~ /\.tar$/) {
+        $tar_file .= '.tar';
+      };
+      my $fh = IO::File->new($tar_file, 'w') or die "Unable to open $tar_file for writing: $!";
+      $fh->binmode; # Tar data is binary; binmode takes a PerlIO layer, not a boolean
+
+      # Set handle
+      $tar_archive->set_handle($fh);
+      $output_dir = File::Temp->newdir;
+    };
+
   DIRECTORY_LOOP:
     for (my $i = 0; $i < $count; $i++) {
 
       my $filename = catfile(
-        $output,
+        $output_dir,
         get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
       );
 
@@ -831,6 +862,12 @@
           0,
           ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
         );
+
+        # Add to tar archive
+        if ($to_tar) {
+          $tar_archive->archive($filename);
+          unlink $filename;
+        };
       }
       else {
         $pool->finish(1, ["Unable to process " . $dirs[$i]]);
@@ -976,7 +1013,7 @@
 
 =item B<archive>
 
-  $ korapxml2krill archive -z --input <directory|archive> --output <directory>
+  $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
 
 Converts an archive of KorAP-XML documents. It expects a directory
 (pointing to the corpus level folder) or one or more zip files as input.
@@ -994,7 +1031,8 @@
 Convert archives sequentially. The inputs are not merged but treated
 as they are (so they may be premerged or globs).
 the C<--out> directory is treated as the base directory where subdirectories
-are created based on the archive name.
+are created based on the archive name. If the C<--to-tar> flag is given,
+the output is written to a tar file.
 
 
 =back