blob: ed32401054b0939aa2ed41b7e1836f65e7a1c2d2 [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions 'catdir';
use File::Temp qw/ :POSIX /;
use Mojo::File;
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip;
use Data::Dumper;
use_ok('KorAP::XML::Batch::File');
ok(my $bf = KorAP::XML::Batch::File->new(
overwrite => 1,
foundry => 'OpenNLP',
layer => 'Tokens'
), 'Construct new batch file object');
my $path = catdir(dirname(__FILE__), 'annotation', 'corpus', 'doc', '0001');
my $output = tmpnam();
ok($bf->process($path => $output), 'Process file');
ok(-f $output, 'File exists');
ok(my $file = Mojo::File->new($output)->slurp, 'Slurp data');
ok(my $json = decode_json $file, 'decode json');
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Title');
is($json->{data}->{foundries}, '', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Data');
# Generate with Gzip
$bf->{gzip} = 1;
$path = catdir(dirname(__FILE__), 'annotation', 'corpus', 'doc', '0001');
$output = tmpnam();
ok($bf->process($path => $output), 'Process file');
my $out;
my $gz = IO::Uncompress::Gunzip->new($output);
ok($gz->read($out), 'Uncompress');
ok($json = decode_json $out, 'decode json');
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Title');
is($json->{data}->{foundries}, '', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Data');
# Generate with annotations
$bf->{gzip} = 0;
$bf->{anno} = [
['CoreNLP', 'Morpho'],
['OpenNLP', 'Morpho']
];
$output = tmpnam();
ok($bf->process($path => $output), 'Process file');
ok($file = Mojo::File->new($output)->slurp, 'Slurp data');
ok($json = decode_json $file, 'decode json');
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Title');
is($json->{data}->{foundries}, 'corenlp corenlp/morpho opennlp opennlp/morpho', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
my $token = $json->{data}->{stream}->[0];
like($json->{data}->{text}, qr/Ende Schuljahr eingestellt wird\.$/, 'Primary text');
is($token->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'base/s');
is($token->[2], '_0$<i>0<i>3', 'position');
is($token->[3], 'corenlp/p:APPRART', 'corenlp');
is($token->[5], 'opennlp/p:APPRART', 'opennlp');
$token = $json->{data}->{stream}->[-1];
is($token->[1], 'corenlp/p:VAFIN', 'corenlp');
is($token->[3], 'opennlp/p:VAFIN', 'opennlp');
# Check layer and foundry for base tokenization
# No primary data
$bf->{anno} = [[]];
$bf->{primary} = 0;
$bf->{foundry} = 'CoreNLP';
$bf->{layer} = 'Tokens';
ok($bf->process($path => $output), 'Process file');
ok(-f $output, 'File exists');
ok($file = Mojo::File->new($output)->slurp, 'Slurp data');
ok($json = decode_json $file, 'decode json');
ok(!$json->{data}->{text}, 'No Primary text');
is($json->{data}->{tokenSource}, 'corenlp#tokens', 'Title');
like($file, qr/^\{"/, 'No pretty printing');
# Check pretty printing
$bf->{pretty} = 1;
ok($bf->process($path => $output), 'Process file');
ok(-f $output, 'File exists');
ok($file = Mojo::File->new($output)->slurp, 'Slurp data');
like($file, qr/^\{[\n\s]+"/, 'No pretty printing');
# Check overwriting
$bf->{overwrite} = 0;
is($bf->process($path => $output), -1, 'Process file');
done_testing;
__END__