Added batch processing class for documents

Change-Id: I91ab289ffff525978fce3079a4dad71caddfe98a
diff --git a/t/annotation/corpus/doc/0001/corenlp/tokens.xml b/t/annotation/corpus/doc/0001/corenlp/tokens.xml
index 78f562b..6ff5eae 100644
--- a/t/annotation/corpus/doc/0001/corenlp/tokens.xml
+++ b/t/annotation/corpus/doc/0001/corenlp/tokens.xml
@@ -23,204 +23,5 @@
       <span id="s2_n26" from="117" to="120"/>
       <span id="s2_n28" from="120" to="121"/>
       <span id="s2_n31" from="122" to="127"/>
-      <span id="s2_n34" from="128" to="131"/>
-      <span id="s2_n36" from="132" to="139"/>
-      <span id="s2_n40" from="140" to="144"/>
-      <span id="s2_n42" from="145" to="154"/>
-      <span id="s2_n44" from="155" to="166"/>
-      <span id="s2_n46" from="167" to="171"/>
-      <span id="s2_n48" from="171" to="172"/>
-      <span id="s3_n4" from="173" to="175"/>
-      <span id="s3_n6" from="176" to="185"/>
-      <span id="s3_n9" from="186" to="193"/>
-      <span id="s3_n11" from="193" to="194"/>
-      <span id="s3_n13" from="195" to="196"/>
-      <span id="s3_n15" from="196" to="197"/>
-      <span id="s4_n3" from="198" to="203"/>
-      <span id="s4_n5" from="203" to="204"/>
-      <span id="s4_n7" from="205" to="211"/>
-      <span id="s4_n13" from="212" to="214"/>
-      <span id="s4_n15" from="215" to="220"/>
-      <span id="s4_n17" from="221" to="224"/>
-      <span id="s4_n20" from="225" to="228"/>
-      <span id="s4_n22" from="229" to="239"/>
-      <span id="s4_n25" from="240" to="243"/>
-      <span id="s4_n27" from="244" to="255"/>
-      <span id="s4_n31" from="256" to="259"/>
-      <span id="s4_n34" from="260" to="266"/>
-      <span id="s4_n36" from="267" to="274"/>
-      <span id="s4_n41" from="275" to="276"/>
-      <span id="s4_n43" from="276" to="282"/>
-      <span id="s4_n47" from="282" to="283"/>
-      <span id="s4_n49" from="284" to="287"/>
-      <span id="s4_n52" from="288" to="293"/>
-      <span id="s4_n54" from="294" to="298"/>
-      <span id="s4_n57" from="299" to="300"/>
-      <span id="s4_n59" from="300" to="313"/>
-      <span id="s4_n61" from="314" to="317"/>
-      <span id="s4_n63" from="318" to="331"/>
-      <span id="s4_n65" from="331" to="332"/>
-      <span id="s4_n68" from="333" to="335"/>
-      <span id="s4_n70" from="336" to="345"/>
-      <span id="s4_n72" from="346" to="357"/>
-      <span id="s4_n74" from="358" to="363"/>
-      <span id="s4_n76" from="363" to="364"/>
-      <span id="s5_n5" from="365" to="368"/>
-      <span id="s5_n7" from="369" to="372"/>
-      <span id="s5_n9" from="373" to="385"/>
-      <span id="s5_n11" from="386" to="395"/>
-      <span id="s5_n13" from="396" to="402"/>
-      <span id="s5_n17" from="403" to="409"/>
-      <span id="s5_n19" from="410" to="416"/>
-      <span id="s5_n21" from="417" to="420"/>
-      <span id="s5_n24" from="421" to="425"/>
-      <span id="s5_n26" from="426" to="432"/>
-      <span id="s5_n28" from="432" to="433"/>
-      <span id="s5_n32" from="434" to="445"/>
-      <span id="s5_n34" from="446" to="451"/>
-      <span id="s5_n36" from="452" to="459"/>
-      <span id="s5_n39" from="460" to="465"/>
-      <span id="s5_n41" from="466" to="476"/>
-      <span id="s5_n43" from="476" to="477"/>
-      <span id="s6_n4" from="478" to="481"/>
-      <span id="s6_n6" from="482" to="493"/>
-      <span id="s6_n8" from="494" to="500"/>
-      <span id="s6_n12" from="501" to="504"/>
-      <span id="s6_n14" from="505" to="507"/>
-      <span id="s6_n16" from="508" to="510"/>
-      <span id="s6_n18" from="510" to="511"/>
-      <span id="s7_n5" from="512" to="517"/>
-      <span id="s7_n7" from="518" to="521"/>
-      <span id="s7_n10" from="522" to="525"/>
-      <span id="s7_n14" from="526" to="528"/>
-      <span id="s7_n16" from="529" to="536"/>
-      <span id="s7_n19" from="537" to="540"/>
-      <span id="s7_n22" from="541" to="548"/>
-      <span id="s7_n25" from="549" to="552"/>
-      <span id="s7_n28" from="553" to="558"/>
-      <span id="s7_n30" from="559" to="562"/>
-      <span id="s7_n32" from="563" to="568"/>
-      <span id="s7_n34" from="569" to="572"/>
-      <span id="s7_n36" from="573" to="578"/>
-      <span id="s7_n39" from="579" to="585"/>
-      <span id="s7_n42" from="586" to="589"/>
-      <span id="s7_n44" from="590" to="597"/>
-      <span id="s7_n47" from="598" to="601"/>
-      <span id="s7_n50" from="602" to="607"/>
-      <span id="s7_n52" from="608" to="611"/>
-      <span id="s7_n54" from="612" to="617"/>
-      <span id="s7_n56" from="617" to="620"/>
-      <span id="s7_n58" from="621" to="632"/>
-      <span id="s7_n60" from="632" to="633"/>
-      <span id="s8_n3" from="634" to="642"/>
-      <span id="s8_n6" from="643" to="658"/>
-      <span id="s8_n8" from="659" to="663"/>
-      <span id="s8_n10" from="663" to="664"/>
-      <span id="s9_n4" from="665" to="668"/>
-      <span id="s9_n6" from="669" to="689"/>
-      <span id="s9_n8" from="690" to="695"/>
-      <span id="s9_n10" from="696" to="699"/>
-      <span id="s9_n12" from="699" to="700"/>
-      <span id="s9_n15" from="701" to="705"/>
-      <span id="s9_n18" from="706" to="713"/>
-      <span id="s9_n20" from="714" to="724"/>
-      <span id="s9_n24" from="725" to="729"/>
-      <span id="s9_n26" from="730" to="747"/>
-      <span id="s9_n29" from="748" to="751"/>
-      <span id="s9_n31" from="752" to="755"/>
-      <span id="s9_n33" from="756" to="774"/>
-      <span id="s9_n35" from="775" to="784"/>
-      <span id="s9_n37" from="785" to="788"/>
-      <span id="s9_n39" from="788" to="789"/>
-      <span id="s10_n4" from="790" to="793"/>
-      <span id="s10_n6" from="794" to="797"/>
-      <span id="s10_n9" from="798" to="802"/>
-      <span id="s10_n11" from="803" to="806"/>
-      <span id="s10_n13" from="807" to="816"/>
-      <span id="s10_n15" from="817" to="826"/>
-      <span id="s10_n17" from="827" to="831"/>
-      <span id="s10_n19" from="832" to="835"/>
-      <span id="s10_n22" from="836" to="839"/>
-      <span id="s10_n24" from="840" to="845"/>
-      <span id="s10_n26" from="846" to="850"/>
-      <span id="s10_n29" from="851" to="864"/>
-      <span id="s10_n31" from="864" to="865"/>
-      <span id="s11_n4" from="866" to="869"/>
-      <span id="s11_n6" from="870" to="875"/>
-      <span id="s11_n8" from="876" to="892"/>
-      <span id="s11_n10" from="893" to="898"/>
-      <span id="s11_n13" from="899" to="902"/>
-      <span id="s11_n15" from="903" to="909"/>
-      <span id="s11_n18" from="910" to="913"/>
-      <span id="s11_n20" from="914" to="927"/>
-      <span id="s11_n24" from="928" to="935"/>
-      <span id="s11_n26" from="936" to="947"/>
-      <span id="s11_n28" from="947" to="948"/>
-      <span id="s11_n32" from="949" to="957"/>
-      <span id="s11_n34" from="958" to="962"/>
-      <span id="s11_n36" from="962" to="963"/>
-      <span id="s11_n38" from="964" to="986"/>
-      <span id="s11_n40" from="986" to="987"/>
-      <span id="s11_n42" from="988" to="995"/>
-      <span id="s11_n44" from="995" to="996"/>
-      <span id="s11_n47" from="997" to="1007"/>
-      <span id="s11_n49" from="1007" to="1008"/>
-      <span id="s12_n4" from="1009" to="1025"/>
-      <span id="s12_n7" from="1026" to="1029"/>
-      <span id="s12_n9" from="1030" to="1036"/>
-      <span id="s12_n11" from="1037" to="1049"/>
-      <span id="s12_n13" from="1049" to="1050"/>
-      <span id="s13_n4" from="1051" to="1054"/>
-      <span id="s13_n6" from="1055" to="1056"/>
-      <span id="s13_n9" from="1056" to="1064"/>
-      <span id="s13_n11" from="1065" to="1070"/>
-      <span id="s13_n13" from="1070" to="1071"/>
-      <span id="s13_n15" from="1072" to="1076"/>
-      <span id="s13_n18" from="1077" to="1079"/>
-      <span id="s13_n20" from="1080" to="1085"/>
-      <span id="s13_n22" from="1086" to="1090"/>
-      <span id="s13_n24" from="1091" to="1094"/>
-      <span id="s13_n26" from="1094" to="1095"/>
-      <span id="s13_n31" from="1096" to="1098"/>
-      <span id="s13_n33" from="1099" to="1102"/>
-      <span id="s13_n36" from="1103" to="1109"/>
-      <span id="s13_n38" from="1110" to="1122"/>
-      <span id="s13_n40" from="1123" to="1129"/>
-      <span id="s13_n42" from="1130" to="1134"/>
-      <span id="s13_n44" from="1134" to="1135"/>
-      <span id="s13_n47" from="1136" to="1139"/>
-      <span id="s13_n50" from="1140" to="1143"/>
-      <span id="s13_n54" from="1144" to="1152"/>
-      <span id="s13_n56" from="1153" to="1156"/>
-      <span id="s13_n58" from="1157" to="1168"/>
-      <span id="s13_n60" from="1169" to="1177"/>
-      <span id="s13_n65" from="1177" to="1178"/>
-      <span id="s13_n67" from="1179" to="1182"/>
-      <span id="s13_n69" from="1183" to="1195"/>
-      <span id="s13_n72" from="1196" to="1199"/>
-      <span id="s13_n74" from="1200" to="1207"/>
-      <span id="s13_n76" from="1208" to="1214"/>
-      <span id="s13_n78" from="1215" to="1219"/>
-      <span id="s13_n80" from="1219" to="1220"/>
-      <span id="s14_n4" from="1221" to="1224"/>
-      <span id="s14_n6" from="1225" to="1237"/>
-      <span id="s14_n8" from="1238" to="1243"/>
-      <span id="s14_n10" from="1244" to="1247"/>
-      <span id="s14_n12" from="1248" to="1252"/>
-      <span id="s14_n14" from="1253" to="1258"/>
-      <span id="s14_n17" from="1259" to="1262"/>
-      <span id="s14_n20" from="1263" to="1268"/>
-      <span id="s14_n22" from="1269" to="1274"/>
-      <span id="s14_n24" from="1274" to="1275"/>
-      <span id="s14_n27" from="1276" to="1280"/>
-      <span id="s14_n30" from="1281" to="1284"/>
-      <span id="s14_n32" from="1285" to="1287"/>
-      <span id="s14_n34" from="1288" to="1290"/>
-      <span id="s14_n36" from="1290" to="1291"/>
-      <span id="s15_n4" from="1292" to="1295"/>
-      <span id="s15_n6" from="1296" to="1310"/>
-      <span id="s15_n8" from="1311" to="1314"/>
-      <span id="s15_n10" from="1315" to="1326"/>
-      <span id="s15_n12" from="1326" to="1327"/>
    </spanList>
 </layer>
diff --git a/t/batch_file.t b/t/batch_file.t
index aa7993d..947e1ef 100644
--- a/t/batch_file.t
+++ b/t/batch_file.t
@@ -7,6 +7,8 @@
 use File::Temp qw/ :POSIX /;
 use Mojo::Util qw/slurp/;
 use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Data::Dumper;
 
 use_ok('KorAP::XML::Batch::File');
 
@@ -16,8 +18,6 @@
   layer => 'Tokens'
 ), 'Construct new batch file object');
 
-# gzip => 1,
-
 my $path = catdir(dirname(__FILE__), 'annotation', 'corpus', 'doc', '0001');
 
 my $output = tmpnam();
@@ -37,5 +37,91 @@
 is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
 is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Data');
 
+# Generate with Gzip: same document, but the output is a gzip stream
+$bf->{gzip} = 1;
+
+$path = catdir(dirname(__FILE__), 'annotation', 'corpus', 'doc', '0001');
+$output = tmpnam();
+ok($bf->process($path => $output), 'Process file');
+
+# Slurp the whole stream; a single read() call only yields one block
+ok(my $gz = IO::Uncompress::Gunzip->new($output), 'Open gzip stream');
+ok(defined(my $out = do { local $/; $gz->getline }), 'Uncompress');
+
+ok($json = decode_json($out), 'decode json');
+
+is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
+is($json->{title}, 'Beispiel Text', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
+is($json->{data}->{foundries}, '', 'Foundries');
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Primary text');
+is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
+is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Data');
+
+# Generate with annotations (uncompressed, CoreNLP and OpenNLP morphology)
+$bf->{gzip} = 0;
+$bf->{anno} = [
+  ['CoreNLP', 'Morpho'],
+  ['OpenNLP', 'Morpho']
+];
+$output = tmpnam();
+ok($bf->process($path => $output), 'Process file');
+ok($file = slurp($output), 'Slurp data');
+ok($json = decode_json($file), 'decode json');
+
+is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
+is($json->{title}, 'Beispiel Text', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
+is($json->{data}->{foundries}, 'corenlp corenlp/morpho opennlp opennlp/morpho', 'Foundries');
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Primary text start');
+is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
+
+my $token = $json->{data}->{stream}->[0];
+
+like($json->{data}->{text}, qr/Ende Schuljahr eingestellt wird\.$/, 'Primary text');
+
+is($token->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'base/s');
+is($token->[2], '_0$<i>0<i>3', 'position');
+is($token->[3], 'corenlp/p:APPRART', 'corenlp');
+is($token->[5], 'opennlp/p:APPRART', 'opennlp');
+
+$token = $json->{data}->{stream}->[-1];
+
+is($token->[1], 'corenlp/p:VAFIN', 'corenlp');
+is($token->[3], 'opennlp/p:VAFIN', 'opennlp');
+
+# Check layer and foundry for base tokenization
+# No primary data
+$bf->{anno} = [[]];
+$bf->{primary} = 0;
+$bf->{foundry} = 'CoreNLP';
+$bf->{layer} = 'Tokens';
+
+ok($bf->process($path => $output), 'Process file');
+ok(-f $output, 'File exists');
+ok($file = slurp($output), 'Slurp data');
+ok($json = decode_json($file), 'decode json');
+
+ok(!$json->{data}->{text}, 'No Primary text');
+is($json->{data}->{tokenSource}, 'corenlp#tokens', 'Token source');
+
+like($file, qr/^\{"/, 'No pretty printing');
+
+# Check pretty printing: output must start with a brace, newline and indent
+$bf->{pretty} = 1;
+ok($bf->process($path => $output), 'Process file');
+ok(-f $output, 'File exists');
+ok($file = slurp($output), 'Slurp data');
+like($file, qr/^\{\n\s+"/, 'Pretty printing');
+
+# Check overwriting: with the flag off, process() must return false here
+$bf->{overwrite} = 0;
+
+ok(!$bf->process($path => $output), 'Refuse to overwrite existing file');
+
 done_testing;
 __END__
+
+
+
+