Added batch processing class for documents

Change-Id: I91ab289ffff525978fce3079a4dad71caddfe98a

commit: cdf0e0017b93f25d4b3fb2768e7bea9e6fa32239 [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Jul 08 16:42:04 2016 +0200
committer: Akron <nils@diewald-online.de> Fri Jul 08 19:42:44 2016 +0200
tree: 1bedefc42b1403d4bcb7f937394a95e67758243b
parent: 405f0c5239b1056952cb436a4bc0ea8210f307d9 [diff] [blame]
diff --git a/t/batch_file.t b/t/batch_file.t
index aa7993d..947e1ef 100644
--- a/t/batch_file.t
+++ b/t/batch_file.t

@@ -7,6 +7,8 @@
 use File::Temp qw/ :POSIX /;
 use Mojo::Util qw/slurp/;
 use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Data::Dumper;
 
 use_ok('KorAP::XML::Batch::File');
 
@@ -16,8 +18,6 @@
   layer => 'Tokens'
 ), 'Construct new batch file object');
 
-# gzip => 1,
-
 my $path = catdir(dirname(__FILE__), 'annotation', 'corpus', 'doc', '0001');
 
 my $output = tmpnam();
@@ -37,5 +37,91 @@
 is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
 is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Data');
 
+# Generate with Gzip: same document, but the output file must be a gzip stream
+$bf->{gzip} = 1;
+
+$path = catdir(dirname(__FILE__), 'annotation', 'corpus', 'doc', '0001');
+$output = tmpnam();
+ok($bf->process($path => $output), 'Process file');
+
+# Inflate the complete file; a single ->read() call only yields one buffer
+my $out;
+ok(IO::Uncompress::Gunzip::gunzip($output => \$out), 'Uncompress');
+
+ok($json = decode_json $out, 'decode json');
+
+is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
+is($json->{title}, 'Beispiel Text', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
+is($json->{data}->{foundries}, '', 'Foundries');
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Primary text');
+is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
+is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Data');
+
+# Generate with annotations: add the Morpho layers of CoreNLP and OpenNLP
+$bf->{gzip} = 0; # plain output again, so the file can be slurped directly
+$bf->{anno} = [
+  ['CoreNLP', 'Morpho'],
+  ['OpenNLP', 'Morpho']
+];
+$output = tmpnam();
+ok($bf->process($path => $output), 'Process file');
+ok($file = slurp $output, 'Slurp data');
+ok($json = decode_json $file, 'decode json');
+
+is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
+is($json->{title}, 'Beispiel Text', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
+is($json->{data}->{foundries}, 'corenlp corenlp/morpho opennlp opennlp/morpho', 'Foundries');
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Primary text start');
+is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
+
+my $token = $json->{data}->{stream}->[0]; # first entry of the token stream
+
+like($json->{data}->{text}, qr/Ende Schuljahr eingestellt wird\.$/, 'Primary text end');
+
+is($token->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'base/s');
+is($token->[2], '_0$<i>0<i>3', 'position');
+is($token->[3], 'corenlp/p:APPRART', 'corenlp (first token)');
+is($token->[5], 'opennlp/p:APPRART', 'opennlp (first token)');
+
+$token = $json->{data}->{stream}->[-1]; # last entry of the token stream
+
+is($token->[1], 'corenlp/p:VAFIN', 'corenlp (last token)');
+is($token->[3], 'opennlp/p:VAFIN', 'opennlp (last token)');
+
+# Check layer and foundry for base tokenization (CoreNLP instead of OpenNLP)
+# No primary data: with primary disabled the text must be absent from the JSON
+$bf->{anno} = [[]];
+$bf->{primary} = 0;
+$bf->{foundry} = 'CoreNLP';
+$bf->{layer} = 'Tokens';
+
+ok($bf->process($path => $output), 'Process file');
+ok(-f $output, 'File exists');
+ok($file = slurp $output, 'Slurp data');
+ok($json = decode_json $file, 'decode json');
+
+ok(!$json->{data}->{text}, 'No Primary text');
+is($json->{data}->{tokenSource}, 'corenlp#tokens', 'Token source');
+
+like($file, qr/^\{"/, 'No pretty printing');
+
+# Check pretty printing: the opening brace is followed by a newline and indentation
+$bf->{pretty} = 1;
+ok($bf->process($path => $output), 'Process file');
+ok(-f $output, 'File exists');
+ok($file = slurp $output, 'Slurp data');
+like($file, qr/^\{\n\s+"/, 'Pretty printing');
+
+# Check overwriting: with overwrite disabled, processing onto the existing file must fail
+$bf->{overwrite} = 0;
+
+ok(!$bf->process($path => $output), 'Refuse to overwrite existing file');
+
 done_testing;
 __END__
+
+
+
+
commit	cdf0e0017b93f25d4b3fb2768e7bea9e6fa32239	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Jul 08 16:42:04 2016 +0200
committer	Akron <nils@diewald-online.de>	Fri Jul 08 19:42:44 2016 +0200
tree	1bedefc42b1403d4bcb7f937394a95e67758243b
parent	405f0c5239b1056952cb436a4bc0ea8210f307d9 [diff] [blame]