Added archive test for directories and parallel processing

Change-Id: Iaee14a663786cbbe7f018d42488dbb96ba3b4d15
diff --git a/MANIFEST b/MANIFEST
index 098ee93..f6e215b 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -650,6 +650,7 @@
 t/annotation/corpus/doc/0001/struct/structure.xml
 t/annotation/corpus/doc/0001/tree_tagger/morpho.xml
 t/annotation/corpus/doc/0001/tree_tagger/sentences.xml
+t/annotation/corpus/doc/0001/tree_tagger/tokens.xml
 t/annotation/corpus/doc/0001/xip/constituency.xml
 t/annotation/corpus/doc/0001/xip/dependency.xml
 t/annotation/corpus/doc/0001/xip/morpho.xml
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 939dcd4..1a64842 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -241,7 +241,12 @@
 # Get file name based on path information
 sub get_file_name ($) {
   my $i = $input[0];
+  if (-d $i) {
+    $i =~ s![^\/]+$!!;
+  };
   my $file = shift;
+
+  # Remove temp dir fragments
   $file =~ s!^/?tmp/[^/]+!!;
   $file =~ s/^?\/?$i//;
   $file =~ tr/\//-/;
@@ -249,35 +254,6 @@
   return $file;
 };
 
-
-# Write file
-#sub write_file {
-#  my $anno = shift;
-#  my $file = get_file_name $anno;
-#
-#  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap
-#
-#  my $call = 'perl ' . $LOCAL . '/korapxml2krill';
-#  $call .= ' -i ' . $anno;
-#  $call .= ' -o ' . $output . '/' . $file . '.json';
-#  $call .= '.gz -z' if $gzip;
-#  $call .= ' -m ' . $meta if $meta;
-#  $call .= ' -w' if $overwrite;
-#  $call .= ' -t ' . $token_base if $token_base;
-#  $call .= ' -l ' . $log_level if $log_level;
-#  $call .= ' -c ' . $cache_file;
-#  $call .= ' -cs ' . $cache_size;
-#  $call .= ' --no-cache-delete'; # Don't delete the cache
-#  $call .= ' --no-cache-init'; # Don't initialize the cache
-#  $call .= ' --no-primary ' if $primary;
-#  $call .= ' -y ' . $pretty if $pretty;
-#  $call .= ' -a ' . $_ foreach @anno;
-#  $call .= ' -s ' . $_ foreach @skip;
-#  system($call);
-#  return "$file";
-#};
-
-
 # Convert sigle to path construct
 s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
 
@@ -440,8 +416,13 @@
 
       # Get the next fork
       $pool->start and next DIRECTORY_LOOP;
-      my $msg = $batch_file->process($dirs[$i] => $filename);
-      $pool->finish(0, \$msg);
+
+      if ($batch_file->process($dirs[$i] => $filename)) {
+        $pool->finish(0, \("Processed " . $filename));
+      }
+      else {
+        $pool->finish(1, \("Unable to process " . $dirs[$i]));
+      };
     };
   }
 
@@ -481,8 +462,6 @@
       # Get the next fork
       $pool->start and next ARCHIVE_LOOP;
 
-      my $msg;
-
       # Extract from archive
       if ($archive->extract($dirs[$i], $temp)) {
 
@@ -503,9 +482,7 @@
 
       # Unable to extract
       else {
-
-        $msg = "Unable to extract " . $dirs[$i] . "\n";
-        $pool->finish(1, \$msg);
+        $pool->finish(1, \("Unable to extract " . $dirs[$i]));
       };
     };
   }
diff --git a/t/annotation/corpus/doc/0001/tree_tagger/tokens.xml b/t/annotation/corpus/doc/0001/tree_tagger/tokens.xml
new file mode 100644
index 0000000..cd56e1c
--- /dev/null
+++ b/t/annotation/corpus/doc/0001/tree_tagger/tokens.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?><?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?><layer xmlns="http://ids-mannheim.de/ns/KorAP" docid="Corpus_Doc.0001" VERSION="KorAP-0.4">
+<spanList>
+      <span id="s_7" from="0" to="3"/>
+      <span id="s_8" from="4" to="11"/>
+      <span id="s_9" from="12" to="23"/>
+      <span id="s_10" from="24" to="30"/>
+      <span id="s_11" from="31" to="35"/>
+      <span id="s_12" from="36" to="39"/>
+      <span id="s_13" from="40" to="47"/>
+      <span id="s_14" from="48" to="51"/>
+      <span id="s_15" from="52" to="63"/>
+      <span id="s_16" from="64" to="73"/>
+      <span id="s_17" from="74" to="77"/>
+      <span id="s_18" from="77" to="78"/>
+      <span id="s_19" from="79" to="84"/>
+      <span id="s_20" from="85" to="88"/>
+      <span id="s_21" from="89" to="96"/>
+      <span id="s_22" from="97" to="101"/>
+      <span id="s_23" from="102" to="111"/>
+      <span id="s_24" from="112" to="123"/>
+      <span id="s_25" from="124" to="128"/>
+      <span id="s_26" from="128" to="129"/>
+   </spanList>
+</layer>
diff --git a/t/script/archive.t b/t/script/archive.t
index 2032180..308414a 100644
--- a/t/script/archive.t
+++ b/t/script/archive.t
@@ -33,7 +33,9 @@
 my $input = catfile($f, '..', 'corpus', 'archive.zip');
 ok(-f $input, 'Input archive found');
 
-my $output = tempdir(CLEANUP => 1);
+my $output = File::Temp->newdir(CLEANUP => 0);
+$output->unlink_on_destroy(0);
+
 ok(-d $output, 'Output directory exists');
 
 $call = join(
@@ -66,5 +68,62 @@
 is($json->{data}->{foundries}, 'base base/paragraphs base/sentences dereko dereko/structure sgbr sgbr/lemma sgbr/morpho', 'Foundries');
 is($json->{sgbrKodex}, 'M', 'Kodex meta data');
 
+
+# Convert a corpus directory (instead of a zip archive) with parallel jobs
+$input = catdir($f, '..', 'annotation', 'corpus');
+
+$call = join(
+  ' ',
+  'perl', $script,
+  'archive',
+  '--input' => $input,
+  '--output' => $output,
+  '-t' => 'Tree_Tagger#Tokens',
+  '-j' => 4 # 4 jobs!
+);
+
+# Warnings are muted during the run; all checks are made against the
+# captured standard output and the files written to the output directory.
+{
+  local $SIG{__WARN__} = sub {};
+
+  # That's not really stable on slow machines!
+  my $out = stdout_from(sub { system($call); });
+
+  my ($pid1) = $out =~ m!\[\$(\d+?):1\/2\]!s;
+  ok(defined $pid1, $call . ' pid 1');
+  my ($pid2) = $out =~ m!\[\$(\d+?):2\/2\]!s;
+  ok(defined $pid2, $call . ' pid 2');
+
+  isnt($pid1, $pid2, 'No PID match');
+
+  like($out, qr!Processed .+?\/corpus-doc-0001\.json!s, $call);
+  like($out, qr!Processed .+?\/corpus-doc-0002\.json!s, $call);
+
+  ok(-d $output, 'Temporary directory still exists');
+  my $json_file_1 = catfile($output, 'corpus-doc-0001.json');
+  ok(-f $json_file_1, 'Json file exists 1');
+  my $json_file_2 = catfile($output, 'corpus-doc-0002.json');
+  ok(-f $json_file_2, 'Json file exists 2');
+
+  ok(($file = slurp $json_file_1), 'Slurp data');
+  ok((my $json_1 = decode_json $file), 'decode json');
+
+  is($json_1->{data}->{tokenSource}, 'tree_tagger#tokens', 'TokenSource');
+  is($json_1->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+  is($json_1->{textSigle}, 'Corpus/Doc/0001', 'Sigle');
+
+  ok(-f $json_file_2, 'Json file exists');
+  ok(($file = slurp $json_file_2), 'Slurp data');
+  ok((my $json_2 = decode_json $file), 'decode json');
+
+  is($json_2->{data}->{tokenSource}, 'tree_tagger#tokens', 'TokenSource');
+  is($json_2->{data}->{foundries}, 'base base/paragraphs base/sentences dereko dereko/structure malt malt/dependency treetagger treetagger/morpho treetagger/sentences', 'Foundries');
+  is($json_2->{textSigle}, 'Corpus/Doc/0002', 'Sigle');
+};
+
+ok(-d $output, 'Output directory exists');
+$output->unlink_on_destroy(1); # unlink() can't remove a directory; let File::Temp clean up
+
 done_testing;
 __END__