Added batch processing class for documents

Change-Id: I91ab289ffff525978fce3079a4dad71caddfe98a
diff --git a/Changes b/Changes
index cd3dbfe..ca1cec6 100644
--- a/Changes
+++ b/Changes
@@ -1,10 +1,11 @@
-0.18 2016-07-06
+0.18 2016-07-08
         - Added REI test.
 	- Added multiple archive support to korapxml2krill.
 	- Added support for prefix negation in korapxml2krill.
 	- Added support for Malt#Dependency.
 	- Improved test suite for caching and REI.
 	- Added support for MDParser annotation.
+	- Added batch processing class for documents.
 
 0.17 2016-03-22
         - Rewrite siglen to use slashes as separators.
diff --git a/MANIFEST b/MANIFEST
index c0ca61e..9911feb 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -1,6 +1,7 @@
 lib/KorAP/XML/Krill.pm
 lib/KorAP/XML/Log.pm
 lib/KorAP/XML/Archive.pm
+lib/KorAP/XML/Batch/File.pm
 lib/KorAP/XML/Tokenizer.pm
 lib/KorAP/XML/Tokenizer/Match.pm
 lib/KorAP/XML/Tokenizer/Range.pm
@@ -46,6 +47,7 @@
 lib/KorAP/XML/Annotation/XIP/Morpho.pm
 lib/KorAP/XML/Annotation/XIP/Sentences.pm
 t/archive.t
+t/batch_file.t
 t/meta.t
 t/meta_caching.t
 t/multiple_archives.t
diff --git a/Makefile.PL b/Makefile.PL
index b4bdacb..1ab8172 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -22,6 +22,7 @@
     'List::MoreUtils' => 0.33,
     'Parallel::ForkManager' => 1.17,
     'IO::Compress::Gzip' => 2.069,
+    'IO::Uncompress::Gunzip' => 2.069,
     'IO::Dir::Recursive' => 0.03,
     'File::Temp'      => 0,
     'Directory::Iterator' => 0,
diff --git a/lib/KorAP/XML/Batch/File.pm b/lib/KorAP/XML/Batch/File.pm
index de34e4f..d91074f 100644
--- a/lib/KorAP/XML/Batch/File.pm
+++ b/lib/KorAP/XML/Batch/File.pm
@@ -17,19 +17,24 @@
     foundry   => $param{foundry}   // 'Base',
     layer     => $param{layer}     // 'Tokens',
     anno      => $param{anno}      // [[]],
-    log       => $param{log}       // Mojo::Log->new,
+    log       => $param{log}       // Mojo::Log->new(level => 'fatal'),
     primary   => $param{primary},
     pretty    => $param{pretty},
     gzip      => $param{gzip} // 0
   }, $class;
 };
 
-
+# Process a file
 sub process {
   my $self = shift;
   my $input = shift;
   my $output = shift;
 
+  if (!$self->{overwrite} && $output && -e $output) {
+    $self->{log}->debug($output . ' already exists');
+    return;
+  };
+
   # Create and parse new document
   $input =~ s{([^/])$}{$1/};
   my $doc = KorAP::XML::Krill->new(
@@ -85,3 +90,144 @@
 };
 
 1;
+
+__END__
+
+=pod
+
+=encoding utf8
+
+=head1 NAME
+
+KorAP::XML::Batch::File - Process multiple files with identical setup
+
+
+=head1 SYNOPSIS
+
+
+  # Create Converter Object
+  my $converter = KorAP::XML::Batch::File->new(
+    overwrite => 1,
+    gzip => 1
+  );
+
+  $converter->process('/my/data' => 'my-output.gz');
+
+=head1 DESCRIPTION
+
+Set up the configuration for a corpus and process
+multiple texts with the same configuration.
+
+=head1 METHODS
+
+=head2 new
+
+  my $converter = KorAP::XML::Batch::File->new(
+    overwrite => 1,
+    gzip => 1
+  );
+
+Construct a new converter object.
+The constructor accepts the following parameters:
+
+=over 2
+
+=item cache
+
+A L<Cache::FastMmap> compatible cache object.
+
+=item meta_type
+
+Meta data type to be parsed. Defaults to C<I5>,
+also supports all classes in the C<KorAP::XML::Meta> namespace.
+
+=item overwrite
+
+Overwrite existing output files instead of skipping them.
+Defaults to C<false>.
+
+=item foundry
+
+The foundry to use for tokenization,
+defaults to C<Base>.
+
+=item layer
+
+The layer to use for tokenization,
+defaults to C<Tokens>.
+
+=item anno
+
+  my $converter = KorAP::XML::Batch::File->new(
+    anno => [
+      ['CoreNLP', 'Morpho'],
+      ['OpenNLP', 'Morpho']
+    ]
+  );
+
+An array reference of array references,
+containing annotation layers as foundry-layer
+pairs to parse.
+The list is empty by default.
+
+=item log
+
+A L<Mojo::Log> compatible log object.
+
+=item primary
+
+Export primary text associated with the document.
+Defaults to C<true>.
+
+=item pretty
+
+Pretty print the output JSON.
+Defaults to C<false>.
+
+=item gzip
+
+Compress the output using Gzip.
+This will be ignored, if the output is undefined
+(i.e. C<STDOUT>).
+Defaults to C<false>.
+
+=back
+
+=head2 process
+
+  $converter->process('/mydoc/');
+  $converter->process('/mydoc/', '/myoutput.gzip');
+
+Process a document and write the result to a chosen output.
+The first argument is mandatory and
+represents the path to the directory of the KorAP-XML text files.
+The second argument is optional and
+represents a file path to write.
+If the second argument is not given,
+the process will write to C<STDOUT>
+(in that case, the C<gzip> parameter is ignored).
+
+=head1 AVAILABILITY
+
+  https://github.com/KorAP/KorAP-XML-Krill
+
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
+Author: L<Nils Diewald|http://nils-diewald.de/>
+
+KorAP::XML::Krill is developed as part of the
+L<KorAP|http://korap.ids-mannheim.de/>
+Corpus Analysis Platform at the
+L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
+member of the
+L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>
+and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project,
+funded by the
+L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>.
+
+KorAP::XML::Krill is free software published under the
+L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
+
+=cut
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index affb633..5740492 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -517,7 +517,7 @@
 sub to_data {
   my $self = shift;
   my $primary = defined $_[0] ? $_[0] : 1;
-  my $legacy =  defined $_[1] ? $_[1] : 0;
+  my $legacy  = defined $_[1] ? $_[1] : 0;
 
   my %data = %{$self->doc->to_hash};
   my @fields;
@@ -538,6 +538,7 @@
 
   else {
     my $tokens = $self->to_hash;
+
     $tokens->{text} = $self->doc->primary->data if $primary;
     $data{data} = $tokens;
     $data{version} = '0.03';
diff --git a/script/korapxml2krill b/script/korapxml2krill
index af75538..8a56858 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -14,7 +14,6 @@
 use KorAP::XML::Krill;
 use KorAP::XML::Archive;
 use KorAP::XML::Tokenizer;
-use KorAP::XML::ProcessFile;
 use Parallel::ForkManager;
 # TODO: use Parallel::Loops
 # TODO: make output files
diff --git a/t/annotation/corpus/doc/0001/corenlp/tokens.xml b/t/annotation/corpus/doc/0001/corenlp/tokens.xml
index 78f562b..6ff5eae 100644
--- a/t/annotation/corpus/doc/0001/corenlp/tokens.xml
+++ b/t/annotation/corpus/doc/0001/corenlp/tokens.xml
@@ -23,204 +23,5 @@
       <span id="s2_n26" from="117" to="120"/>
       <span id="s2_n28" from="120" to="121"/>
       <span id="s2_n31" from="122" to="127"/>
-      <span id="s2_n34" from="128" to="131"/>
-      <span id="s2_n36" from="132" to="139"/>
-      <span id="s2_n40" from="140" to="144"/>
-      <span id="s2_n42" from="145" to="154"/>
-      <span id="s2_n44" from="155" to="166"/>
-      <span id="s2_n46" from="167" to="171"/>
-      <span id="s2_n48" from="171" to="172"/>
-      <span id="s3_n4" from="173" to="175"/>
-      <span id="s3_n6" from="176" to="185"/>
-      <span id="s3_n9" from="186" to="193"/>
-      <span id="s3_n11" from="193" to="194"/>
-      <span id="s3_n13" from="195" to="196"/>
-      <span id="s3_n15" from="196" to="197"/>
-      <span id="s4_n3" from="198" to="203"/>
-      <span id="s4_n5" from="203" to="204"/>
-      <span id="s4_n7" from="205" to="211"/>
-      <span id="s4_n13" from="212" to="214"/>
-      <span id="s4_n15" from="215" to="220"/>
-      <span id="s4_n17" from="221" to="224"/>
-      <span id="s4_n20" from="225" to="228"/>
-      <span id="s4_n22" from="229" to="239"/>
-      <span id="s4_n25" from="240" to="243"/>
-      <span id="s4_n27" from="244" to="255"/>
-      <span id="s4_n31" from="256" to="259"/>
-      <span id="s4_n34" from="260" to="266"/>
-      <span id="s4_n36" from="267" to="274"/>
-      <span id="s4_n41" from="275" to="276"/>
-      <span id="s4_n43" from="276" to="282"/>
-      <span id="s4_n47" from="282" to="283"/>
-      <span id="s4_n49" from="284" to="287"/>
-      <span id="s4_n52" from="288" to="293"/>
-      <span id="s4_n54" from="294" to="298"/>
-      <span id="s4_n57" from="299" to="300"/>
-      <span id="s4_n59" from="300" to="313"/>
-      <span id="s4_n61" from="314" to="317"/>
-      <span id="s4_n63" from="318" to="331"/>
-      <span id="s4_n65" from="331" to="332"/>
-      <span id="s4_n68" from="333" to="335"/>
-      <span id="s4_n70" from="336" to="345"/>
-      <span id="s4_n72" from="346" to="357"/>
-      <span id="s4_n74" from="358" to="363"/>
-      <span id="s4_n76" from="363" to="364"/>
-      <span id="s5_n5" from="365" to="368"/>
-      <span id="s5_n7" from="369" to="372"/>
-      <span id="s5_n9" from="373" to="385"/>
-      <span id="s5_n11" from="386" to="395"/>
-      <span id="s5_n13" from="396" to="402"/>
-      <span id="s5_n17" from="403" to="409"/>
-      <span id="s5_n19" from="410" to="416"/>
-      <span id="s5_n21" from="417" to="420"/>
-      <span id="s5_n24" from="421" to="425"/>
-      <span id="s5_n26" from="426" to="432"/>
-      <span id="s5_n28" from="432" to="433"/>
-      <span id="s5_n32" from="434" to="445"/>
-      <span id="s5_n34" from="446" to="451"/>
-      <span id="s5_n36" from="452" to="459"/>
-      <span id="s5_n39" from="460" to="465"/>
-      <span id="s5_n41" from="466" to="476"/>
-      <span id="s5_n43" from="476" to="477"/>
-      <span id="s6_n4" from="478" to="481"/>
-      <span id="s6_n6" from="482" to="493"/>
-      <span id="s6_n8" from="494" to="500"/>
-      <span id="s6_n12" from="501" to="504"/>
-      <span id="s6_n14" from="505" to="507"/>
-      <span id="s6_n16" from="508" to="510"/>
-      <span id="s6_n18" from="510" to="511"/>
-      <span id="s7_n5" from="512" to="517"/>
-      <span id="s7_n7" from="518" to="521"/>
-      <span id="s7_n10" from="522" to="525"/>
-      <span id="s7_n14" from="526" to="528"/>
-      <span id="s7_n16" from="529" to="536"/>
-      <span id="s7_n19" from="537" to="540"/>
-      <span id="s7_n22" from="541" to="548"/>
-      <span id="s7_n25" from="549" to="552"/>
-      <span id="s7_n28" from="553" to="558"/>
-      <span id="s7_n30" from="559" to="562"/>
-      <span id="s7_n32" from="563" to="568"/>
-      <span id="s7_n34" from="569" to="572"/>
-      <span id="s7_n36" from="573" to="578"/>
-      <span id="s7_n39" from="579" to="585"/>
-      <span id="s7_n42" from="586" to="589"/>
-      <span id="s7_n44" from="590" to="597"/>
-      <span id="s7_n47" from="598" to="601"/>
-      <span id="s7_n50" from="602" to="607"/>
-      <span id="s7_n52" from="608" to="611"/>
-      <span id="s7_n54" from="612" to="617"/>
-      <span id="s7_n56" from="617" to="620"/>
-      <span id="s7_n58" from="621" to="632"/>
-      <span id="s7_n60" from="632" to="633"/>
-      <span id="s8_n3" from="634" to="642"/>
-      <span id="s8_n6" from="643" to="658"/>
-      <span id="s8_n8" from="659" to="663"/>
-      <span id="s8_n10" from="663" to="664"/>
-      <span id="s9_n4" from="665" to="668"/>
-      <span id="s9_n6" from="669" to="689"/>
-      <span id="s9_n8" from="690" to="695"/>
-      <span id="s9_n10" from="696" to="699"/>
-      <span id="s9_n12" from="699" to="700"/>
-      <span id="s9_n15" from="701" to="705"/>
-      <span id="s9_n18" from="706" to="713"/>
-      <span id="s9_n20" from="714" to="724"/>
-      <span id="s9_n24" from="725" to="729"/>
-      <span id="s9_n26" from="730" to="747"/>
-      <span id="s9_n29" from="748" to="751"/>
-      <span id="s9_n31" from="752" to="755"/>
-      <span id="s9_n33" from="756" to="774"/>
-      <span id="s9_n35" from="775" to="784"/>
-      <span id="s9_n37" from="785" to="788"/>
-      <span id="s9_n39" from="788" to="789"/>
-      <span id="s10_n4" from="790" to="793"/>
-      <span id="s10_n6" from="794" to="797"/>
-      <span id="s10_n9" from="798" to="802"/>
-      <span id="s10_n11" from="803" to="806"/>
-      <span id="s10_n13" from="807" to="816"/>
-      <span id="s10_n15" from="817" to="826"/>
-      <span id="s10_n17" from="827" to="831"/>
-      <span id="s10_n19" from="832" to="835"/>
-      <span id="s10_n22" from="836" to="839"/>
-      <span id="s10_n24" from="840" to="845"/>
-      <span id="s10_n26" from="846" to="850"/>
-      <span id="s10_n29" from="851" to="864"/>
-      <span id="s10_n31" from="864" to="865"/>
-      <span id="s11_n4" from="866" to="869"/>
-      <span id="s11_n6" from="870" to="875"/>
-      <span id="s11_n8" from="876" to="892"/>
-      <span id="s11_n10" from="893" to="898"/>
-      <span id="s11_n13" from="899" to="902"/>
-      <span id="s11_n15" from="903" to="909"/>
-      <span id="s11_n18" from="910" to="913"/>
-      <span id="s11_n20" from="914" to="927"/>
-      <span id="s11_n24" from="928" to="935"/>
-      <span id="s11_n26" from="936" to="947"/>
-      <span id="s11_n28" from="947" to="948"/>
-      <span id="s11_n32" from="949" to="957"/>
-      <span id="s11_n34" from="958" to="962"/>
-      <span id="s11_n36" from="962" to="963"/>
-      <span id="s11_n38" from="964" to="986"/>
-      <span id="s11_n40" from="986" to="987"/>
-      <span id="s11_n42" from="988" to="995"/>
-      <span id="s11_n44" from="995" to="996"/>
-      <span id="s11_n47" from="997" to="1007"/>
-      <span id="s11_n49" from="1007" to="1008"/>
-      <span id="s12_n4" from="1009" to="1025"/>
-      <span id="s12_n7" from="1026" to="1029"/>
-      <span id="s12_n9" from="1030" to="1036"/>
-      <span id="s12_n11" from="1037" to="1049"/>
-      <span id="s12_n13" from="1049" to="1050"/>
-      <span id="s13_n4" from="1051" to="1054"/>
-      <span id="s13_n6" from="1055" to="1056"/>
-      <span id="s13_n9" from="1056" to="1064"/>
-      <span id="s13_n11" from="1065" to="1070"/>
-      <span id="s13_n13" from="1070" to="1071"/>
-      <span id="s13_n15" from="1072" to="1076"/>
-      <span id="s13_n18" from="1077" to="1079"/>
-      <span id="s13_n20" from="1080" to="1085"/>
-      <span id="s13_n22" from="1086" to="1090"/>
-      <span id="s13_n24" from="1091" to="1094"/>
-      <span id="s13_n26" from="1094" to="1095"/>
-      <span id="s13_n31" from="1096" to="1098"/>
-      <span id="s13_n33" from="1099" to="1102"/>
-      <span id="s13_n36" from="1103" to="1109"/>
-      <span id="s13_n38" from="1110" to="1122"/>
-      <span id="s13_n40" from="1123" to="1129"/>
-      <span id="s13_n42" from="1130" to="1134"/>
-      <span id="s13_n44" from="1134" to="1135"/>
-      <span id="s13_n47" from="1136" to="1139"/>
-      <span id="s13_n50" from="1140" to="1143"/>
-      <span id="s13_n54" from="1144" to="1152"/>
-      <span id="s13_n56" from="1153" to="1156"/>
-      <span id="s13_n58" from="1157" to="1168"/>
-      <span id="s13_n60" from="1169" to="1177"/>
-      <span id="s13_n65" from="1177" to="1178"/>
-      <span id="s13_n67" from="1179" to="1182"/>
-      <span id="s13_n69" from="1183" to="1195"/>
-      <span id="s13_n72" from="1196" to="1199"/>
-      <span id="s13_n74" from="1200" to="1207"/>
-      <span id="s13_n76" from="1208" to="1214"/>
-      <span id="s13_n78" from="1215" to="1219"/>
-      <span id="s13_n80" from="1219" to="1220"/>
-      <span id="s14_n4" from="1221" to="1224"/>
-      <span id="s14_n6" from="1225" to="1237"/>
-      <span id="s14_n8" from="1238" to="1243"/>
-      <span id="s14_n10" from="1244" to="1247"/>
-      <span id="s14_n12" from="1248" to="1252"/>
-      <span id="s14_n14" from="1253" to="1258"/>
-      <span id="s14_n17" from="1259" to="1262"/>
-      <span id="s14_n20" from="1263" to="1268"/>
-      <span id="s14_n22" from="1269" to="1274"/>
-      <span id="s14_n24" from="1274" to="1275"/>
-      <span id="s14_n27" from="1276" to="1280"/>
-      <span id="s14_n30" from="1281" to="1284"/>
-      <span id="s14_n32" from="1285" to="1287"/>
-      <span id="s14_n34" from="1288" to="1290"/>
-      <span id="s14_n36" from="1290" to="1291"/>
-      <span id="s15_n4" from="1292" to="1295"/>
-      <span id="s15_n6" from="1296" to="1310"/>
-      <span id="s15_n8" from="1311" to="1314"/>
-      <span id="s15_n10" from="1315" to="1326"/>
-      <span id="s15_n12" from="1326" to="1327"/>
    </spanList>
 </layer>
diff --git a/t/batch_file.t b/t/batch_file.t
index aa7993d..947e1ef 100644
--- a/t/batch_file.t
+++ b/t/batch_file.t
@@ -7,6 +7,8 @@
 use File::Temp qw/ :POSIX /;
 use Mojo::Util qw/slurp/;
 use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Data::Dumper;
 
 use_ok('KorAP::XML::Batch::File');
 
@@ -16,8 +18,6 @@
   layer => 'Tokens'
 ), 'Construct new batch file object');
 
-# gzip => 1,
-
 my $path = catdir(dirname(__FILE__), 'annotation', 'corpus', 'doc', '0001');
 
 my $output = tmpnam();
@@ -37,5 +37,91 @@
 is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
 is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Data');
 
+# Generate with Gzip
+$bf->{gzip} = 1;
+
+$path = catdir(dirname(__FILE__), 'annotation', 'corpus', 'doc', '0001');
+$output = tmpnam();
+ok($bf->process($path => $output), 'Process file');
+
+my $out;
+my $gz = IO::Uncompress::Gunzip->new($output);
+ok($gz->read($out), 'Uncompress');
+
+ok($json = decode_json $out, 'decode json');
+
+is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
+is($json->{title}, 'Beispiel Text', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Title');
+is($json->{data}->{foundries}, '', 'Foundries');
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
+is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
+is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Data');
+
+# Generate with annotations
+$bf->{gzip} = 0;
+$bf->{anno} = [
+  ['CoreNLP', 'Morpho'],
+  ['OpenNLP', 'Morpho']
+];
+$output = tmpnam();
+ok($bf->process($path => $output), 'Process file');
+ok($file = slurp $output, 'Slurp data');
+ok($json = decode_json $file, 'decode json');
+
+is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
+is($json->{title}, 'Beispiel Text', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Title');
+is($json->{data}->{foundries}, 'corenlp corenlp/morpho opennlp opennlp/morpho', 'Foundries');
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
+is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
+
+my $token = $json->{data}->{stream}->[0];
+
+like($json->{data}->{text}, qr/Ende Schuljahr eingestellt wird\.$/, 'Primary text');
+
+is($token->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'base/s');
+is($token->[2], '_0$<i>0<i>3', 'position');
+is($token->[3], 'corenlp/p:APPRART', 'corenlp');
+is($token->[5], 'opennlp/p:APPRART', 'opennlp');
+
+$token = $json->{data}->{stream}->[-1];
+
+is($token->[1], 'corenlp/p:VAFIN', 'corenlp');
+is($token->[3], 'opennlp/p:VAFIN', 'opennlp');
+
+# Check layer and foundry for base tokenization
+# No primary data
+$bf->{anno} = [[]];
+$bf->{primary} = 0;
+$bf->{foundry} = 'CoreNLP';
+$bf->{layer} = 'Tokens';
+
+ok($bf->process($path => $output), 'Process file');
+ok(-f $output, 'File exists');
+ok($file = slurp $output, 'Slurp data');
+ok($json = decode_json $file, 'decode json');
+
+ok(!$json->{data}->{text}, 'No Primary text');
+is($json->{data}->{tokenSource}, 'corenlp#tokens', 'Title');
+
+like($file, qr/^\{"/, 'No pretty printing');
+
+# Check pretty printing
+$bf->{pretty} = 1;
+ok($bf->process($path => $output), 'Process file');
+ok(-f $output, 'File exists');
+ok($file = slurp $output, 'Slurp data');
+like($file, qr/^\{\n\s+"/, 'No pretty printing');
+
+# Check overwriting
+$bf->{overwrite} = 0;
+
+ok(!$bf->process($path => $output), 'Process file');
+
 done_testing;
 __END__
+
+
+
+