#!/usr/bin/env perl
2use strict;
3use warnings;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catdir catfile/;
Akrond5bb4342017-06-19 11:50:49 +02006use File::Temp qw/:POSIX/;
Akron3ec0a1c2017-01-18 14:41:55 +01007use Mojo::File;
Akrone1dbc382016-07-08 22:24:52 +02008use Mojo::JSON qw/decode_json/;
9use IO::Uncompress::Gunzip;
10use Test::More;
Akron5f51d422016-08-16 16:26:43 +020011use Test::Output;
12use Data::Dumper;
Akronf98b6692016-08-16 19:17:44 +020013use utf8;
Akrone1dbc382016-07-08 22:24:52 +020014
# The whole file can be skipped by setting SKIP_SCRIPT in the environment.
plan(skip_all => 'Skip script tests') if $ENV{SKIP_SCRIPT};

# All paths are resolved relative to the directory containing this test.
my $f = dirname(__FILE__);
my $script = catfile($f, qw/.. .. script korapxml2krill/);

# Example document that is converted by most of the tests below.
my $input = catdir($f, qw/.. annotation corpus doc 0001/);
ok(-d $input, 'Input directory found');

# Temporary file names for the conversion result and the cache
# handed to the script via --cache.
my ($output, $cache) = (tmpnam(), tmpnam());

ok(!(-f $output), 'Output does not exist');
Akrone1dbc382016-07-08 22:24:52 +020030
# Command line for a plain (uncompressed) conversion of the example
# document with OpenNLP as the token source. The arguments are kept as
# a list so that system() never involves a shell: paths containing
# blanks or shell metacharacters are passed to the script verbatim.
my @plain_call = (
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-k' => 0.03,
  '-t' => 'OpenNLP#Tokens',
  '-l' => 'INFO'
);

# The joined form serves as the human-readable test description.
my $call = join(' ', @plain_call);

# Test without compression
stderr_like(
  sub {
    system(@plain_call);
  },
  qr!The code took!,
  $call
);

# The output is expected to be plain JSON in the legacy layout with
# top-level metadata keys.
ok(-f $output, 'Output does exist');
ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
ok((my $json = decode_json $file), 'decode json');
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
Akrone1dbc382016-07-08 22:24:52 +020061
# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

# Same arguments as the uncompressed run, plus -z for gzipped output.
# Passed to system() as a list so no shell is involved.
my @gzip_call = (
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-k' => 0.03,
  '-t' => 'OpenNLP#Tokens',
  '-l' => 'INFO',
  '-z'
);
$call = join(' ', @gzip_call);

# Test with compression
stderr_like(
  sub { system(@gzip_call); },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');

# Uncompress the data using a buffer. Transparent => 0 makes the
# constructor fail on input that is not gzip instead of passing it
# through, so a failure here means the script did not compress.
my $gz = IO::Uncompress::Gunzip->new($output, Transparent => 0)
  or die 'Unable to gunzip ' . $output . ': '
    . $IO::Uncompress::Gunzip::GunzipError;

$file = '';
my $buffer;

# read() returns the number of bytes written to the buffer, 0 at the
# end of the stream and a negative number on error. The error case has
# to be handled explicitly, as a negative value is true in Perl and
# would otherwise keep the loop running.
while ((my $bytes_read = $gz->read($buffer)) != 0) {
  die 'Error while reading ' . $output . ': '
    . $IO::Uncompress::Gunzip::GunzipError
    if $bytes_read < 0;
  $file .= $buffer;
}
$gz->close;

ok($json = decode_json($file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
is(unlink($output), 1, 'Unlink successful');
ok(!-e $output, 'Output does not exist');
Akron5f51d422016-08-16 16:26:43 +020097
# Use a different token source and skip all annotations,
# except for DeReKo#Structure and Mate#Dependency.
# The arguments are passed to system() as a list: if a shell were ever
# involved, the word '#all' would start a shell comment and silently
# drop every following option.
my @skip_call = (
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'CoreNLP#Tokens',
  '-s' => '#all',
  '-a' => 'DeReKo#Structure',
  '-a' => 'Mate#Dependency',
  '-l' => 'INFO'
);

# The joined form serves as the human-readable test description.
$call = join(' ', @skip_call);

stderr_like(
  sub {
    system(@skip_call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');

is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'corenlp#tokens', 'TokenSource');

# Only the two explicitly requested annotation layers may show up.
is($json->{data}->{foundries}, 'dereko dereko/structure mate mate/dependency', 'Foundries');

like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>20', 'Tokens');
133
Akrone2b902d2016-08-16 16:50:11 +0200134
# Check overwrite
# The output of the previous run is still in place at this point. The
# arguments are passed to system() as a list so that no shell gets the
# chance to interpret '#all' as the start of a comment.
my @overwrite_call = (
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'CoreNLP#Tokens',
  '-s' => '#all',
  '-a' => 'DeReKo#Structure',
  '-a' => 'Mate#Dependency',
  '-l' => 'DEBUG'
);

# The joined form serves as the human-readable test description.
$call = join(' ', @overwrite_call);

# Without -w the script is expected to report the existing file.
ok(-f $output, 'Output does exist');
stderr_like(
  sub {
    system(@overwrite_call);
  },
  qr!already exists!,
  $call
);

# With -w the complaint must not appear.
push @overwrite_call, '-w';
$call = join(' ', @overwrite_call);

stderr_unlike(
  sub {
    system(@overwrite_call);
  },
  qr!already exists!,
  $call
);
167
# Check meta data switch

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

# Koral version
# With -k 0.4 the metadata is expected in a 'fields' array instead of
# top-level keys (see the assertions below). The arguments are passed
# to system() as a list so that no shell is involved.
$input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
my @koral_call = (
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'OpenNLP#Tokens',
  '-k' => 0.4,
  '-l' => 'INFO',
  '-w'
);

# The joined form serves as the human-readable test description.
$call = join(' ', @koral_call);

stderr_like(
  sub {
    system(@koral_call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');
ok(!$json->{textType}, 'text type');
ok(!$json->{title}, 'Title');

# Metadata fields with a scalar value:
# [position in the fields array, key, type, value]
my @expected_fields = (
  [0,  'corpusSigle', 'type:string',      'Corpus'],
  [4,  'distributor', 'type:attachement', 'data:,Institut für Deutsche Sprache'],
  [14, 'textType',    'type:string',      'Zeitung: Tageszeitung'],
  [22, 'title',       'type:text',        'Beispiel Text'],
);

for my $expected (@expected_fields) {
  my ($index, $key, $type, $value) = @$expected;
  my $field = $json->{fields}->[$index];
  my $label = "Field $index ($key)";

  is($field->{key}, $key, "$label has expected key");
  is($field->{type}, $type, "$label has expected type");
  is($field->{value}, $value, "$label has expected value");
  is($field->{'@type'}, 'koral:field', "$label is a koral:field");
}

# textClass carries a list of keywords; only the first two entries
# are checked.
my $text_class = $json->{fields}->[9];
is($text_class->{key}, 'textClass', 'Field 9 (textClass) has expected key');
is($text_class->{value}->[0], 'freizeit-unterhaltung', 'Field 9 (textClass) first keyword');
is($text_class->{value}->[1], 'vereine-veranstaltungen', 'Field 9 (textClass) second keyword');
is($text_class->{type}, 'type:keywords', 'Field 9 (textClass) has expected type');
is($text_class->{'@type'}, 'koral:field', 'Field 9 (textClass) is a koral:field');

is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# All terms of the second stream entry, joined for substring checks.
my $token = join(',',@{$json->{data}->{stream}->[1]});

# Literal fragments that have to occur in the joined terms. They are
# matched with \Q...\E, so no character needs to be escaped here.
my @expected_fragments = (
  '<>:xip/c:AP$<b>64<i>4<i>11<i>2<b>5',
  '<>:xip/c:ADJ$<b>64<i>4<i>11<i>2<b>6',
  '<>:cnx/c:np$<b>64<i>4<i>30<i>4<b>0',
  '<>:xip/c:NP$<b>64<i>4<i>30<i>4<b>3',
  '<>:xip/c:NPA$<b>64<i>4<i>30<i>4<b>4',
  '>:mate/d:NK$<b>32<i>3',
  '_1$<i>4<i>11',
  'cnx/l:letzt',
  'cnx/p:A',
  'cnx/syn:@PREMOD',
  'corenlp/p:ADJ',
  'glemm/l:__letzt-',
  'i:letzten',
  'mate/l:letzter',
  'mate/m:case:dat',
  'mate/m:degree:pos',
  'mate/m:gender:neut',
  'mate/m:number:sg',
  'mate/p:ADJA',
  'opennlp/p:ADJA',
  's:letzten',
  'spacy/l:letzter',
  'spacy/p:ADJ',
  'tt/l:letzt',
  'tt/p:ADJA',
  'xip/l:letzt',
  'xip/p:ADJ',
);

for my $fragment (@expected_fragments) {
  like($token, qr/\Q$fragment\E/, "Token stream contains $fragment");
}

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');
268
269
# Language switch
# Convert a document of the NKJP corpus with --lang en; judging from
# the assertion below this presumably selects the English variant of
# the metadata (TODO confirm against the script documentation).
# The arguments are passed to system() as a list so that no shell is
# involved.
$input = catdir($f, '..', 'real', 'corpus', 'NKJP', 'NKJP', 'KOT');
my @nkjp_call = (
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'NKJP#Morpho',
  '-l' => 'INFO',
  '--lang' => 'en',
  '-w'
);

# The joined form serves as the human-readable test description.
$call = join(' ', @nkjp_call);

stderr_like(
  sub {
    system(@nkjp_call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');
is($json->{corpusTitle}, 'National Corpus of Polish -- the 1 million word subcorpus', 'Title');

# Do not leave temporary files behind. The cache name was only handed
# to the script, which may or may not have created a file under it;
# unlink simply returns 0 when there is nothing to remove.
unlink $output;
unlink $cache;

done_testing;
303__END__
Akron5f51d422016-08-16 16:26:43 +0200304