#!/usr/bin/env perl
# Integration test for the korapxml2krill command line script.
#
# Runs the script once (without compression) on the GOE2/AGA/03828 sample
# document, then checks the metadata and a few stream annotations of the
# JSON document written to the output file.
use strict;
use warnings;
use File::Basename 'dirname';
use File::Spec::Functions qw/catdir catfile/;
use File::Temp qw/ :POSIX /;
use Mojo::File;
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip;
use Test::More;
use Test::Output;
use Data::Dumper;
use utf8;

# Locate the script and the sample corpus relative to this test file
my $f = dirname(__FILE__);
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');

my $input = catdir($f, '..', 'corpus', 'GOE2', 'AGA', '03828');
ok(-d $input, 'Input directory found');

# tmpnam() only returns names - the files are expected
# to be created by the script itself
my $output = tmpnam();
my $cache = tmpnam();

ok(!-f $output, 'Output does not exist');

# Keep the command as a list, so that paths containing whitespace
# or shell metacharacters are passed to the script unmodified.
# $^X is the interpreter running this test, which guarantees that
# the script is executed by the same perl.
# NOTE(review): -t presumably selects the token source and -bs/-bp the
# base sentences/paragraphs - confirm against the script's documentation.
my @call = (
  $^X, $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'Base#tokens_aggr.xml',
  '-bs' => 'DeReKo#Structure',
  '-bp' => 'DeReKo#Structure',
  '-l' => 'INFO'
);

# Human readable form of the command, only used as test description
my $call = join(' ', @call);

# Test without compression
stderr_like(
  sub {
    # List form bypasses the shell
    system(@call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
ok((my $json = decode_json $file), 'decode json');

# Metadata
is($json->{textType}, 'Autobiographie', 'text type');
is($json->{title}, 'Autobiographische Einzelheiten', 'Title');
is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'Token source');
is($json->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base-sentences-paragraphs marmot marmot/morpho', 'Foundries');

# Annotations at the first position of the stream
my $stream = $json->{data}->{stream};
my $token = $stream->[0];
is($token->[0], '-:base/paragraphs$<i>14', 'Paragraphs');
is($token->[1], '-:base/sentences$<i>215', 'Sentences');

is($token->[5], '<>:base/s:s$<b>64<i>0<i>30<i>2<b>2', 'struct');
is($token->[7], '<>:dereko/s:s$<b>64<i>0<i>30<i>2<b>4', 'struct');
is($token->[8], '<>:base/s:t$<b>64<i>0<i>35242<i>5239<b>0', 'struct');

# Annotations at the fifth position of the stream
$token = $stream->[4];
is($token->[0], '<>:base/s:s$<b>64<i>53<i>254<i>32<b>2', 'struct');
is($token->[1], '<>:dereko/s:s$<b>64<i>53<i>254<i>32<b>5<s>1', 'struct');
is($token->[2], '<>:base/s:p$<b>64<i>53<i>3299<i>504<b>1', 'struct');
is($token->[3], '<>:dereko/s:p$<b>64<i>53<i>3299<i>504<b>4', 'struct');

# Do not leave the temporary files behind
unlink($output, $cache);

done_testing;

__END__