Added archive test script

Change-Id: Iaa6e9dd9c8186fe02432c0c512c23db8a9275d8b
diff --git a/Changes b/Changes
index 6ea6c3d..8d1d990 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.18 2016-08-16
+0.18 2016-08-17
         - Added REI test.
 	      - Added multiple archive support to korapxml2krill.
         - Added support for prefix negation in korapxml2krill.
@@ -12,9 +12,10 @@
         - Fixed setting multiple annotations in
           script.
         - Fixed output of version and help messages.
-        - Added extraction test.
+        - Added script test for extraction.
         - Fixed extraction with multiple archives and prefix
           negation support.
+        - Added script test for archives.
 
 0.17 2016-03-22
         - Rewrite siglen to use slashes as separators.
diff --git a/MANIFEST b/MANIFEST
index 856b8f2..098ee93 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -100,6 +100,7 @@
 t/script/single.t
 t/script/usage.t
 t/script/extract.t
+t/script/archive.t
 t/corpus/archive.zip
 t/corpus/BZK/header.xml
 t/corpus/GOE/header.xml
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index ea376f4..fe57d78 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -5,7 +5,7 @@
 use Scalar::Util qw/weaken/;
 use XML::Fast;
 use Try::Tiny;
-use Carp qw/croak/;
+use Carp qw/croak carp/;
 use KorAP::XML::Document::Primary;
 use KorAP::XML::Tokenizer;
 use Log::Log4perl;
@@ -73,11 +73,11 @@
 
     # Load file
     $file = b($data_xml)->slurp;
-
     try {
       local $SIG{__WARN__} = sub {
-	$error = 1;
+        $error = 1;
       };
+
       $rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
     } catch  {
       $self->log->warn($unable);
@@ -97,17 +97,22 @@
       $self->corpus_sigle($1);
     }
     else {
-      croak $unable . ': ID not parseable';
+      $self->log->warn($unable . ': ID not parseable');
+      return;
     };
   }
   else {
-    croak $unable . ': No raw_text found or no ID';
+    $self->log->warn($unable . ': No raw_text found or no ID');
+    return;
   };
 
   # Get primary data
   my $pd = $rt->{text};
 
-  croak $unable unless $pd;
+  unless ($pd) {
+    $self->log->warn($unable . ': No primary data found');
+    return;
+  };
 
   # Associate primary data
   $self->{pd} = KorAP::XML::Document::Primary->new($pd);
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 65bc89a..939dcd4 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -92,37 +92,37 @@
   'skip|s=s'    => \@skip,
   'sigle|sg=s'  => \@sigle,
   'cache|c=s'   => \(my $cache_file = 'korapxml2krill.cache'),
-  'cache-size|cs=s'   => \(my $cache_size = '50m'),
-  'cache-delete|cd!' => \(my $cache_delete = 1),
-  'cache-init|ci!'   => \(my $cache_init = 1),
   'log|l=s'     => \(my $log_level = 'ERROR'),
   'anno|a=s'    => \@anno,
   'primary|p!'  => \(my $primary),
   'pretty|y'    => \(my $pretty),
   'jobs|j=i'    => \(my $jobs = 0),
+  'cache-size|cs=s'  => \(my $cache_size = '50m'),
+  'cache-delete|cd!' => \(my $cache_delete = 1),
+  'cache-init|ci!'   => \(my $cache_init = 1),
   'help|h'      => sub {
     pod2usage(
       -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
-      -verbose => 99,
-      -msg => $VERSION_MSG,
-      -output => '-'
+      -verbose  => 99,
+      -msg      => $VERSION_MSG,
+      -output   => '-'
     );
   },
   'version|v'   => sub {
     pod2usage(
-      -verbose => 0,
-      -msg => $VERSION_MSG,
-      -output => '-'
+      -verbose  => 0,
+      -msg      => $VERSION_MSG,
+      -output   => '-'
     )
   }
 );
 
 my %ERROR_HASH = (
   -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
-  -verbose => 99,
-  -msg => $VERSION_MSG,
-  -output => '-',
-  -exit => 1
+  -verbose  => 99,
+  -msg      => $VERSION_MSG,
+  -output   => '-',
+  -exit     => 1
 );
 
 # Input has to be defined
@@ -281,6 +281,14 @@
 # Convert sigle to path construct
 s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
 
+if ($cmd) {
+  if ($output && (!-e $output || !-d $output)) {
+    print "Directory '$output' does not exist.\n\n";
+    exit(0);
+  };
+};
+
+
 # Process a single file
 unless ($cmd) {
   my $input = $input[0];
@@ -303,6 +311,7 @@
   # Create and parse new document
   $input =~ s{([^/])$}{$1/};
 
+  # Process file
   $batch_file->process($input, $output);
 
   # Delete cache file
@@ -314,14 +323,10 @@
 # Extract XML files
 elsif ($cmd eq 'extract') {
 
-  if ($output && (!-e $output || !-d $output)) {
-    print "Directory '$output' does not exist.\n\n";
-    exit(0);
-  };
-
-  # TODO: Support sigles and full archives
+  # Create new archive object
   if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
 
+    # Check zip capabilities
     unless ($archive->test_unzip) {
       print "Unzip is not installed or incompatible.\n\n";
       exit(1);
@@ -349,6 +354,7 @@
     # Iterate over all given sigles and extract
     foreach (@sigle) {
       print "$_ ";
+
       # TODO: Make this OS independent
       print '' . (
         $archive->extract(
@@ -361,6 +367,8 @@
     print "\n";
     exit(1);
   }
+
+  # Can't create archive object
   else {
     $log->error('Unable to extract from primary archive ' . $input[0]);
   };
@@ -369,32 +377,20 @@
 # Process an archive
 elsif ($cmd eq 'archive') {
 
-warn '!!!!!!!!!!!!!------------> ';
-
-if ($output && (!-e $output || !-d $output)) {
-  print "Directory '$output' does not exist.\n\n";
-  exit(0);
-};
-
-
   # TODO: Support sigles
 
-  if ($output && (!-e $output || !-d $output)) {
-    print "Directory '$output' does not exist.\n\n";
-    exit(0);
-  };
-
-# Zero means: everything runs in the parent process
+  # Zero means: everything runs in the parent process
   my $pool = Parallel::ForkManager->new($jobs);
 
-  my $count = 0; # Texts to process
+  my $count = 0;  # Texts to process
   my $iter  = 1;  # Current text in process
 
   # Report on fork message
   $pool->run_on_finish (
     sub {
-      my ($pid, $code) = shift;
+      my ($pid, $code) = @_;
       my $data = pop;
+
       print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
         ($iter++) . "/$count]" .
         ($code ? " $code" : '') .
@@ -403,16 +399,17 @@
   );
 
   my $t;
+  my $temp;
   print "Reading data ...\n";
 
-#  unless (Cache::FastMmap->new(
-#    share_file => $cache_file,
-#    cache_size => $cache_size,
-#    init_file => $cache_init
-#  )) {
-#    print "Unable to intialize cache '$cache_file'\n\n";
-#    exit(1);
-#  };
+  #  unless (Cache::FastMmap->new(
+  #    share_file => $cache_file,
+  #    cache_size => $cache_size,
+  #    init_file => $cache_init
+  #  )) {
+  #    print "Unable to intialize cache '$cache_file'\n\n";
+  #    exit(1);
+  #  };
 
   # Input is a directory
   if (-d $input[0]) {
@@ -420,10 +417,11 @@
     my @dirs;
     my $dir;
 
+    # Todo: Make a DO WHILE
     while (1) {
       if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
-	push @dirs, $dir;
-	$it->prune;
+        push @dirs, $dir;
+        $it->prune;
       };
       last unless $it->next;
     };
@@ -436,15 +434,13 @@
     for (my $i = 0; $i < $count; $i++) {
 
       my $filename = catfile(
-	$output,
-	get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
+        $output,
+        get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
       );
 
       # Get the next fork
-      my $pid = $pool->start and next DIRECTORY_LOOP;
-      my $msg;
-
-      $msg = $batch_file->process($dirs[$i] => $filename);
+      $pool->start and next DIRECTORY_LOOP;
+      my $msg = $batch_file->process($dirs[$i] => $filename);
       $pool->finish(0, \$msg);
     };
   }
@@ -465,6 +461,9 @@
     my @dirs = $archive->list_texts;
     $count = scalar @dirs;
 
+    # Create temporary file
+    $temp = File::Temp->newdir;
+
   ARCHIVE_LOOP:
     for (my $i = 0; $i < $count; $i++) {
 
@@ -472,41 +471,41 @@
       my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
 
       my $filename = catfile(
-	$output,
-	get_file_name(
-	  catfile($corpus, $doc, $text)
-	    . '.json' . ($gzip ? '.gz' : '')
-	  )
+        $output,
+        get_file_name(
+          catfile($corpus, $doc, $text)
+            . '.json' . ($gzip ? '.gz' : '')
+          )
       );
 
       # Get the next fork
-      my $pid = $pool->start and next ARCHIVE_LOOP;
-
-      # Create temporary file
-      my $temp = File::Temp->newdir;
+      $pool->start and next ARCHIVE_LOOP;
 
       my $msg;
 
       # Extract from archive
       if ($archive->extract($dirs[$i], $temp)) {
 
-	# Create corpus directory
-	my $input = catdir("$temp", $corpus);
+        # Create corpus directory
+        my $input = catdir("$temp", $corpus);
 
-	# Temporary directory
-	my $dir = catdir($input, $doc, $text);
+        # Temporary directory
+        my $dir = catdir($input, $doc, $text);
 
-	# Write file
-	$msg = $batch_file->process($dir => $output);
-
-	$temp = undef;
-	$pool->finish(0, \$msg);
+        # Write file
+        if ($batch_file->process($dir => $filename)) {
+          $pool->finish(0, \("Processed " . $filename));
+        }
+        else {
+          $pool->finish(1, \("Unable to process " . $dir));
+        };
       }
+
+      # Unable to extract
       else {
 
-	$temp = undef;
-	$msg = "Unable to extract " . $dirs[$i] . "\n";
-	$pool->finish(1, \$msg);
+        $msg = "Unable to extract " . $dirs[$i] . "\n";
+        $pool->finish(1, \$msg);
       };
     };
   }
@@ -517,6 +516,9 @@
 
   $pool->wait_all_children;
 
+  # Delete temporary file
+  $temp = undef;
+
   # Delete cache file
   unlink($cache_file) if $cache_delete;
 
diff --git a/t/script/archive.t b/t/script/archive.t
new file mode 100644
index 0000000..2032180
--- /dev/null
+++ b/t/script/archive.t
@@ -0,0 +1,70 @@
+#/usr/bin/env perl
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/tempdir/;
+use Mojo::Util qw/slurp/;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+use Test::Output qw/:stdout :stderr :functions/;
+use Data::Dumper;
+use utf8;
+
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
+
+my $call = join(
+  ' ',
+  'perl', $script,
+  'archive'
+);
+
+# Test without parameters
+stdout_like(
+  sub {
+    system($call);
+  },
+  qr!archive.+?Process an!s,
+  $call
+);
+
+my $input = catfile($f, '..', 'corpus', 'archive.zip');
+ok(-f $input, 'Input archive found');
+
+my $output = tempdir(CLEANUP => 1);
+ok(-d $output, 'Output directory exists');
+
+$call = join(
+  ' ',
+  'perl', $script,
+  'archive',
+  '--input' => $input,
+  '--output' => $output,
+  '-t' => 'Base#tokens_aggr',
+  '-m' => 'Sgbr'
+);
+
+# Test without compression
+my $json;
+{
+  local $SIG{__WARN__} = sub {};
+  my $out = stdout_from(sub { system($call); });
+
+  like($out, qr!TEST-BSP-1\.json!s, $call);
+
+  $out =~ m!Processed (.+?\.json)!;
+  $json = $1;
+};
+
+ok(-f $json, 'Json file exists');
+ok((my $file = slurp $json), 'Slurp data');
+ok(($json = decode_json $file), 'decode json');
+
+is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'Title');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences dereko dereko/structure sgbr sgbr/lemma sgbr/morpho', 'Foundries');
+is($json->{sgbrKodex}, 'M', 'Kodex meta data');
+
+done_testing;
+__END__