blob: 20070a2fd841ce76b9719a9e0820c4fe675f52d8 [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
use File::Basename 'dirname';
use File::Spec::Functions qw/catdir catfile/;
use File::Temp qw/ :POSIX /;
use Mojo::File;
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip;
use Test::More;
use Test::Output;
use Data::Dumper;
use utf8;
# All paths are resolved relative to the directory holding this test file.
my $test_dir = dirname(__FILE__);

# The converter script that is invoked further below.
my $script = catfile($test_dir, '..', '..', 'script', 'korapxml2krill');

# Input directory handed to the script via --input.
my $input = catdir($test_dir, '..', 'corpus', 'GOE2', 'AGA', '03828');
ok(-d $input, 'Input directory found');

# tmpnam() only returns a file name; nothing must exist at that path yet.
my $output = tmpnam();
ok(!-f $output, 'Output does not exist');
# Build the converter invocation as an argument list. Passing the list to
# system() bypasses the shell, so every argument (including paths that
# contain whitespace or shell metacharacters) reaches the script verbatim.
# $^X is the interpreter running this test, which avoids accidentally
# picking up a different 'perl' from PATH.
my @command = (
  $^X, $script,
  '--input'  => $input,
  '--output' => $output,
  '-t'       => 'Base#tokens_aggr',
  '-bs'      => 'DeReKo#Structure',
  '-bp'      => 'DeReKo#Structure',
  '-l'       => 'INFO'
);

# Human-readable form of the command; only used as the test description.
my $call = join(' ', @command);

# Test without compression
stderr_like(
  sub {
    system(@command);
  },
  qr!The code took!,
  $call
);
# The run above must have created the output file; read and decode it.
ok(-f $output, 'Output does exist');
ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
ok((my $json = decode_json $file), 'decode json');

# Top-level metadata fields of the decoded document.
is($json->{textType}, 'Autobiographie', 'text type');
is($json->{title}, 'Autobiographische Einzelheiten', 'Title');

# Fields below the 'data' key.
is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'Token source');
is($json->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base-sentences-paragraphs', 'Foundries');

my $stream = $json->{data}->{stream};

# First entry of the stream.
my $token = $stream->[0];
is($token->[0], '-:base/paragraphs$<i>14', 'Paragraphs');
is($token->[1], '-:base/sentences$<i>215', 'Sentences');
is($token->[5], '<>:base/s:s$<b>64<i>0<i>30<i>2<b>2', 'struct');
is($token->[7], '<>:dereko/s:s$<b>64<i>0<i>30<i>2<b>4', 'struct');
is($token->[8], '<>:base/s:t$<b>64<i>0<i>35242<i>5238<b>0', 'struct');

# Fifth entry of the stream.
$token = $stream->[4];
is($token->[0], '<>:base/s:s$<b>64<i>53<i>254<i>32<b>2', 'struct');
is($token->[1], '<>:dereko/s:s$<b>64<i>53<i>254<i>32<b>5<s>1', 'struct');
is($token->[2], '<>:base/s:p$<b>64<i>53<i>3299<i>504<b>1', 'struct');
is($token->[3], '<>:dereko/s:p$<b>64<i>53<i>3299<i>504<b>4', 'struct');

# The content is already in memory; do not leave the temporary file behind.
unlink $output;

done_testing;
__END__