blob: 301357c6300d709027cf5b80d6dff2a6e2629248 [file] [log] [blame]
#!/usr/bin/env perl
2use strict;
3use warnings;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catdir catfile/;
Akrond5bb4342017-06-19 11:50:49 +02006use File::Temp qw/:POSIX/;
Akron3ec0a1c2017-01-18 14:41:55 +01007use Mojo::File;
Akrone1dbc382016-07-08 22:24:52 +02008use Mojo::JSON qw/decode_json/;
9use IO::Uncompress::Gunzip;
10use Test::More;
Akron5f51d422016-08-16 16:26:43 +020011use Test::Output;
12use Data::Dumper;
Akronf98b6692016-08-16 19:17:44 +020013use utf8;
Akrone1dbc382016-07-08 22:24:52 +020014
# The script tests can be switched off through the environment.
plan skip_all => 'Skip script tests' if $ENV{SKIP_SCRIPT};

# Locate the conversion script and the example document relative
# to the directory of this test file.
my $f      = dirname(__FILE__);
my $script = catfile($f, qw/.. .. script korapxml2krill/);

my $input = catdir($f, qw/.. annotation corpus doc 0001/);
ok(-d $input, 'Input directory found');

# Temporary names for the converted document and the cache.
my $output = tmpnam();
my $cache  = tmpnam();

# Nothing has been written to the output name yet.
ok(!-f $output, 'Output does not exist');

# Command line for converting the example document with the
# OpenNLP tokenization. The string is reused (and extended) below.
my $call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-k' => 0.03,
  '-t' => 'OpenNLP#Tokens',
  '-l' => 'INFO'
);

# Test without compression: the script reports its runtime on STDERR.
stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

# The output is plain JSON and can be decoded directly.
ok(-f $output, 'Output does exist');
ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
ok((my $json = decode_json $file), 'decode json');

# Each description names the value that is actually compared, so a
# failure report points at the right field.
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

# Repeat the previous call with the additional -z switch.
$call .= ' -z';

# Test with compression
stderr_like(
  sub { system($call); },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');

# Uncompress the data using a buffer. A failing constructor is reported
# with the module's own error message instead of a method call on undef.
my $gz = IO::Uncompress::Gunzip->new($output, Transparent => 0)
  or die 'Unable to gunzip ' . $output . ': '
  . $IO::Uncompress::Gunzip::GunzipError;

$file = '';
my $buffer;
my $status;

# read() returns a positive byte count while data is available, 0 at the
# end of the stream and a negative value on error - stop on both of the
# latter, so a broken stream cannot keep the loop running.
while (($status = $gz->read($buffer)) > 0) {
  $file .= $buffer;
}
is($status, 0, 'Compressed output read until end of stream');
$gz->close;

ok($json = decode_json($file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
is(unlink($output), 1, 'Unlink successful');
ok(!-e $output, 'Output does not exist');

# Use a different token source and skip all annotations,
# except for DeReKo#Structure and Mate#Dependency
$call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'CoreNLP#Tokens',
  '-s' => '#all',
  '-a' => 'DeReKo#Structure',
  '-a' => 'Mate#Dependency',
  '-l' => 'INFO'
);

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');

is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'corenlp#tokens', 'TokenSource');

# Only the two explicitly requested annotation layers are listed.
is($json->{data}->{foundries}, 'dereko dereko/structure mate mate/dependency', 'Foundries');

like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>20', 'Tokens');


# Check overwrite: the output written by the previous run is still
# in place when the script is started again.
my @overwrite_options = (
  '--input'  => $input,
  '--output' => $output,
  '--cache'  => $cache,
  '-t'       => 'CoreNLP#Tokens',
  '-s'       => '#all',
  '-a'       => 'DeReKo#Structure',
  '-a'       => 'Mate#Dependency',
  '-l'       => 'DEBUG'
);
$call = join(' ', 'perl', $script, @overwrite_options);

ok(-f $output, 'Output does exist');

# Without -w the script reports the existing output file on STDERR ...
stderr_like(sub { system($call) }, qr!already exists!, $call);

# ... and with -w that message is no longer emitted.
$call .= ' -w ';

stderr_unlike(sub { system($call) }, qr!already exists!, $call);

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

# Koral version: convert the same document again with -k 0.4 and
# check the field based metadata of the resulting JSON.
$input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
$call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'OpenNLP#Tokens',
  '-k' => 0.4,
  '-l' => 'INFO'
);

$call .= ' -w ';

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

# With this version the metadata is not available as top-level keys ...
ok(!$json->{textType}, 'No top-level text type');
ok(!$json->{title}, 'No top-level title');

# ... but as entries of the "fields" list. Each row holds the index in
# that list, followed by the expected key, type and value. A list value
# is compared element by element for the elements given here.
my @expected_fields = (
  [0,  'corpusSigle', 'type:string',      'Corpus'],
  [4,  'distributor', 'type:attachement', 'data:,Institut für Deutsche Sprache'],
  [9,  'textClass',   'type:keywords',    ['freizeit-unterhaltung', 'vereine-veranstaltungen']],
  [14, 'textType',    'type:string',      'Zeitung: Tageszeitung'],
  [22, 'title',       'type:text',        'Beispiel Text'],
);

for my $expected (@expected_fields) {
  my ($index, $key, $type, $value) = @$expected;
  my $field = $json->{fields}->[$index];

  is($field->{key}, $key, "Field $index has key $key");

  if (ref $value) {
    for my $pos (0 .. $#{$value}) {
      is($field->{value}->[$pos], $value->[$pos], "Value $pos of field $key");
    }
  }
  else {
    is($field->{value}, $value, "Value of field $key");
  }

  is($field->{type}, $type, "Type of field $key");
  is($field->{'@type'}, 'koral:field', "\@type of field $key");
}

is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');


done_testing;
__END__