blob: dddf7d884afc3bace7e7eee91aba90e3ac1e7d8a [file] [log] [blame]
#!/usr/bin/env perl
2use strict;
3use warnings;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catdir catfile/;
Akrond5bb4342017-06-19 11:50:49 +02006use File::Temp qw/:POSIX/;
Akron3ec0a1c2017-01-18 14:41:55 +01007use Mojo::File;
Akrone1dbc382016-07-08 22:24:52 +02008use Mojo::JSON qw/decode_json/;
9use IO::Uncompress::Gunzip;
10use Test::More;
Akron5f51d422016-08-16 16:26:43 +020011use Test::Output;
12use Data::Dumper;
Akronf98b6692016-08-16 19:17:44 +020013use utf8;
Akrone1dbc382016-07-08 22:24:52 +020014
# Skip every test in this file when SKIP_SCRIPT is set in the environment.
plan(skip_all => 'Skip script tests') if $ENV{SKIP_SCRIPT};
18
# Resolve the script under test and the sample input relative to this file.
my $f      = dirname(__FILE__);
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
my $input  = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
ok(-d $input, 'Input directory found');

# Temporary path names for the result file and the cache; the check below
# confirms that no output file exists at that path before the first run.
my $output = tmpnam();
my $cache  = tmpnam();
ok(!(-f $output), 'Output does not exist');
Akrone1dbc382016-07-08 22:24:52 +020030
# Base invocation on the sample document with OpenNLP tokens.
# $call is both the command line that is executed and the description
# attached to the stderr assertions, so a failing test shows the command.
my $call = join(
  ' ',
  'perl', $script,
  '--input'  => $input,
  '--output' => $output,
  '--cache'  => $cache,
  '-k'       => 0.03,
  '-t'       => 'OpenNLP#Tokens',
  '-l'       => 'INFO'
);

# Test without compression
stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

# $file and $json are declared here and reused by all following sections.
ok(-f $output, 'Output does exist');
ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
ok((my $json = decode_json $file), 'decode json');
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output (and verify that exactly one file was removed)
is(unlink($output), 1, 'Unlink successful');
ok(!-f $output, 'Output does not exist');
Akrone1dbc382016-07-08 22:24:52 +020065
# Repeat the previous invocation with '-z' appended.
$call .= ' -z';

# Test with compression
stderr_like(
  sub { system($call); },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');

# Uncompress the data using a buffer.
# Transparent => 0 makes the reader reject input that is not gzip instead
# of passing it through unchanged.
my $gz = IO::Uncompress::Gunzip->new($output, Transparent => 0)
  or die "Unable to gunzip $output: $IO::Uncompress::Gunzip::GunzipError";

$file = '';
my $buffer;
my $status;

# read() returns the number of bytes read, 0 at end of file and a negative
# value on error, so only continue while data was actually delivered.
while (($status = $gz->read($buffer)) > 0) {
  $file .= $buffer;
}
is($status, 0, 'Compressed output read completely');
$gz->close;

ok($json = decode_json($file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
is(unlink($output), 1, 'Unlink successful');
ok(!-e $output, 'Output does not exist');
Akron5f51d422016-08-16 16:26:43 +020097
# Use a different token source and skip all annotations,
# except for DeReKo#Structure and Mate#Dependency
$call = join(
  ' ',
  'perl', $script,
  '--input'  => $input,
  '--output' => $output,
  '--cache'  => $cache,
  '-t'       => 'CoreNLP#Tokens',
  '-s'       => '#all',
  '-a'       => 'DeReKo#Structure',
  '-a'       => 'Mate#Dependency',
  '-l'       => 'INFO'
);

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');

is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'corenlp#tokens', 'TokenSource');

# Only the two explicitly requested foundries are expected in the output.
is($json->{data}->{foundries}, 'dereko dereko/structure mate mate/dependency', 'Foundries');

like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>20', 'Tokens');
133
Akrone2b902d2016-08-16 16:50:11 +0200134
# Check overwrite: the output of the previous run is still on disk, so the
# same conversion is expected to complain unless '-w' is passed.
my @overwrite_args = (
  'perl', $script,
  '--input', $input,
  '--output', $output,
  '--cache', $cache,
  '-t', 'CoreNLP#Tokens',
  '-s', '#all',
  '-a', 'DeReKo#Structure',
  '-a', 'Mate#Dependency',
  '-l', 'DEBUG',
);
$call = join ' ', @overwrite_args;

ok(-f $output, 'Output does exist');

# Without '-w' the existing file is reported on STDERR.
stderr_like(sub { system($call) }, qr!already exists!, $call);

# With '-w' the same message must not appear.
$call = $call . ' -w ';
stderr_unlike(sub { system($call) }, qr!already exists!, $call);
167
# Check meta data switch

# Delete output
is(unlink($output), 1, 'Unlink successful');
ok(!-f $output, 'Output does not exist');

$input = catdir($f, '..', 'sgbr', 'PRO-DUD', 'BSP-2013-01', '32');
ok(-d $input, 'Input directory found');

# Convert an Sgbr sample document: '-m Sgbr' is the meta data switch under
# test, combined with the aggregated base tokens as token source.
$call = join(
  ' ',
  'perl', $script,
  '--input'  => $input,
  '--output' => $output,
  '--cache'  => $cache,
  '-m'       => 'Sgbr',
  '-t'       => 'Base#Tokens_aggr',
  '-l'       => 'INFO'
);

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

is($json->{data}->{text}, 'Selbst ist der Jeck', 'Text');
is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'TokenSource');
is($json->{pubPlace}, 'Stadtingen', 'pubPlace');
is($json->{textSigle}, 'PRO-DUD/BSP-2013-01/32', 'textSigle');
is($json->{docSigle}, 'PRO-DUD/BSP-2013-01', 'docSigle');
is($json->{corpusSigle}, 'PRO-DUD', 'corpusSigle');
is($json->{sgbrKodex}, 'T', 'sgbrKodex');
is($json->{author}, 'unbekannt', 'Author');
is($json->{language}, 'de', 'Language');
is($json->{docTitle}, 'Korpus zur Beobachtung des Schreibgebrauchs im Deutschen', 'docTitle');
is($json->{funder}, 'Bundesministerium für Bildung und Forschung', 'funder');
is($json->{title}, 'Nur Platt, kein Deutsch', 'title');
is($json->{pubDate}, '20130126', 'pubDate');
is($json->{docSubTitle}, 'Subkorpus Ortsblatt, Jahrgang 2013, Monat Januar', 'docSubTitle');
is($json->{keywords}, 'sgbrKodex:T', 'keywords');
is($json->{publisher}, 'Dorfblatt GmbH', 'publisher');
217
Akron636bd9c2017-02-09 17:13:00 +0100218
219
# AGA with base info
is(unlink($output), 1, 'Unlink successful');
ok(!-f $output, 'Output does not exist');

$input = catdir($f, '..', 'corpus', 'GOE2', 'AGA', '03828');
ok(-d $input, 'Input directory found');

# All three base options (-bs, -bp, -bpb) point at DeReKo#Structure.
$call = join(
  ' ',
  'perl', $script,
  '--input'  => $input,
  '--output' => $output,
  '--cache'  => $cache,
  '-t'       => 'base#tokens_aggr',
  '-bs'      => 'DeReKo#Structure',
  '-bp'      => 'DeReKo#Structure',
  '-bpb'     => 'DeReKo#Structure',
  '-l'       => 'INFO'
);

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);
ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

is($json->{title}, 'Autobiographische Einzelheiten', 'title');

# The last entry of the first token is expected to be the page break term.
is($json->{data}->{stream}->[0]->[-1], '~:base/s:pb$<i>529<i>0', 'Pagebreak annotation');
254
Akron263274c2019-02-07 09:48:30 +0100255
256
# Koral version: convert the annotated sample document again with '-k 0.4'.
# '-w' is passed because nothing has removed $output since the previous run.
$input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
$call = join(
  ' ',
  'perl', $script,
  '--input'  => $input,
  '--output' => $output,
  '--cache'  => $cache,
  '-t'       => 'OpenNLP#Tokens',
  '-k'       => 0.4,
  '-l'       => 'INFO'
) . ' -w ';

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

# In this output the meta data is not expected at the top level ...
ok(!$json->{textType}, 'text type');
ok(!$json->{title}, 'Title');

# ... but as entries of the 'fields' list. Check scalar-valued fields at
# their expected positions: [index, key, type, value].
for my $expected (
  [0,  'corpusSigle', 'type:string',      'Corpus'],
  [4,  'distributor', 'type:attachement', 'data:,Institut für Deutsche Sprache'],
  [14, 'textType',    'type:string',      'Zeitung: Tageszeitung'],
  [22, 'title',       'type:text',        'Beispiel Text'],
) {
  my ($idx, $key, $type, $value) = @$expected;
  is($json->{fields}->[$idx]->{key},      $key,          "fields[$idx] key");
  is($json->{fields}->[$idx]->{value},    $value,        "fields[$idx] value");
  is($json->{fields}->[$idx]->{type},     $type,         "fields[$idx] type");
  is($json->{fields}->[$idx]->{'@type'},  'koral:field', "fields[$idx] \@type");
}

# textClass carries a list value, so its entries are checked one by one.
is($json->{fields}->[9]->{key}, 'textClass', 'fields[9] key');
is($json->{fields}->[9]->{value}->[0], 'freizeit-unterhaltung', 'fields[9] first value');
is($json->{fields}->[9]->{value}->[1], 'vereine-veranstaltungen', 'fields[9] second value');
is($json->{fields}->[9]->{type}, 'type:keywords', 'fields[9] type');
is($json->{fields}->[9]->{'@type'}, 'koral:field', 'fields[9] @type');

is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
is(unlink($output), 1, 'Unlink successful');
ok(!-f $output, 'Output does not exist');
321
322
# No fixed plan: the number of tests run is declared at the end.
done_testing();
__END__
Akron5f51d422016-08-16 16:26:43 +0200325