blob: 54ae7a103a8dd99cd406e42ac8ce537e163106f7 [file] [log] [blame]
#!/usr/bin/env perl
2use strict;
3use warnings;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catdir catfile/;
Akrond5bb4342017-06-19 11:50:49 +02006use File::Temp qw/:POSIX/;
Akron3ec0a1c2017-01-18 14:41:55 +01007use Mojo::File;
Akrone1dbc382016-07-08 22:24:52 +02008use Mojo::JSON qw/decode_json/;
9use IO::Uncompress::Gunzip;
10use Test::More;
Akron5f51d422016-08-16 16:26:43 +020011use Test::Output;
12use Data::Dumper;
Akronf98b6692016-08-16 19:17:44 +020013use utf8;
Akrone1dbc382016-07-08 22:24:52 +020014
# Skip the whole file when SKIP_SCRIPT is set in the environment.
plan(skip_all => 'Skip script tests') if $ENV{SKIP_SCRIPT};

# Locate the converter script and the example document relative to
# the directory that contains this test file.
my $f      = dirname(__FILE__);
my $script = catfile($f, qw/.. .. script korapxml2krill/);
my $input  = catdir($f, qw/.. annotation corpus doc 0001/);
ok(-d $input, 'Input directory found');

# Temporary file names for the converter output and its cache.
# tmpnam() is called in scalar context on purpose: in list context
# it returns a file handle in addition to the name.
my $output = tmpnam();
my $cache  = tmpnam();
ok(!-f $output, 'Output does not exist');
Akrone1dbc382016-07-08 22:24:52 +020029
# Base call: convert the example document using OpenNLP tokens.
# --no-cache-delete is passed so that the cache file can be checked
# for existence after the run.
my $call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '--no-cache-delete',
  '-k' => 0.03,
  '-t' => 'OpenNLP#Tokens',
  '-l' => 'INFO'
);

# Test without compression
stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $cache, 'Cache does exist');
ok(-f $output, 'Output does exist');

# $file and $json are declared here and reused by the later sections
ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
ok((my $json = decode_json $file), 'decode json');
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output and cache
unlink $output;
unlink $cache;
ok(!-f $output, 'Output does not exist');
Akrone1dbc382016-07-08 22:24:52 +020067
# Repeat the previous call, this time requesting gzip-compressed output
$call .= ' -z';

# Test with compression
stderr_like(
  sub { system($call); },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');

# Uncompress the data using a buffer.
# Transparent => 0 is passed so that input which is not gzip data is
# rejected instead of being passed through unchanged.
my $gz = IO::Uncompress::Gunzip->new($output, Transparent => 0)
  or die "Unable to gunzip $output: $IO::Uncompress::Gunzip::GunzipError";

$file = '';
my $buffer;
my $status;

# read() yields the number of bytes read, 0 at end of file and a
# negative value on error - only a positive status means "more data",
# so an error can not keep the loop spinning.
while (($status = $gz->read($buffer)) > 0) {
  $file .= $buffer;
}
is($status, 0, 'Compressed output read without error');
$gz->close;

ok($json = decode_json($file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
is(unlink($output), 1, 'Unlink successful');
ok(!-e $output, 'Output does not exist');
Akron5f51d422016-08-16 16:26:43 +020099
# Use a different token source and skip all annotations,
# except for DeReKo#Structure and Mate#Dependency
$call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'CoreNLP#Tokens',
  '-s' => '#all',
  '-a' => 'DeReKo#Structure',
  '-a' => 'Mate#Dependency',
  '-l' => 'INFO'
);

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');

is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'corenlp#tokens', 'TokenSource');

# Only the two explicitly requested annotation layers are expected
is($json->{data}->{foundries}, 'dereko dereko/structure mate mate/dependency', 'Foundries');

like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>20', 'Tokens');
135
Akrone2b902d2016-08-16 16:50:11 +0200136
# Check overwrite: the output of the previous run is still in place,
# so a plain call is expected to complain, while the same call
# with -w is not.
my @overwrite_options = (
  '--input'  => $input,
  '--output' => $output,
  '--cache'  => $cache,
  '-t'       => 'CoreNLP#Tokens',
  '-s'       => '#all',
  '-a'       => 'DeReKo#Structure',
  '-a'       => 'Mate#Dependency',
  '-l'       => 'DEBUG',
);
$call = join(' ', 'perl', $script, @overwrite_options);

ok(-f $output, 'Output does exist');
stderr_like(sub { system($call) }, qr!already exists!, $call);

# Same call again, now with the overwrite flag
$call .= ' -w ';

stderr_unlike(sub { system($call) }, qr!already exists!, $call);
169
# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

# Koral version: with -k 0.4 the metadata is checked in the
# "fields" list, while the top-level textType and title keys
# are expected to be absent.
$input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
$call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'OpenNLP#Tokens',
  '-k' => 0.4,
  '-l' => 'INFO'
);

$call .= ' -w ';

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');
ok(!$json->{textType}, 'text type');
ok(!$json->{title}, 'Title');

is($json->{fields}->[0]->{key}, 'corpusSigle', 'corpusSigle: key');
is($json->{fields}->[0]->{type}, 'type:string', 'corpusSigle: type');
is($json->{fields}->[0]->{value}, 'Corpus', 'corpusSigle: value');
is($json->{fields}->[0]->{'@type'}, 'koral:field', 'corpusSigle: @type');

is($json->{fields}->[4]->{key}, 'distributor', 'distributor: key');
is($json->{fields}->[4]->{value}, 'data:,Institut für Deutsche Sprache', 'distributor: value');
is($json->{fields}->[4]->{type}, 'type:attachement', 'distributor: type');
is($json->{fields}->[4]->{'@type'}, 'koral:field', 'distributor: @type');

is($json->{fields}->[9]->{key}, 'textClass', 'textClass: key');
is($json->{fields}->[9]->{value}->[0], 'freizeit-unterhaltung', 'textClass: first value');
is($json->{fields}->[9]->{value}->[1], 'vereine-veranstaltungen', 'textClass: second value');
is($json->{fields}->[9]->{type}, 'type:keywords', 'textClass: type');
is($json->{fields}->[9]->{'@type'}, 'koral:field', 'textClass: @type');

is($json->{fields}->[14]->{key}, 'textType', 'textType: key');
is($json->{fields}->[14]->{value}, 'Zeitung: Tageszeitung', 'textType: value');
is($json->{fields}->[14]->{type}, 'type:string', 'textType: type');
is($json->{fields}->[14]->{'@type'}, 'koral:field', 'textType: @type');

is($json->{fields}->[22]->{key}, 'title', 'title: key');
is($json->{fields}->[22]->{value}, 'Beispiel Text', 'title: value');
is($json->{fields}->[22]->{type}, 'type:text', 'title: type');
is($json->{fields}->[22]->{'@type'}, 'koral:field', 'title: @type');

is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# All entries at stream position 1, joined into one string so that
# single annotations can be searched for
my $token = join(',', @{$json->{data}->{stream}->[1]});

# Every annotation is matched literally (\Q...\E), so the list can be
# written in single quotes without escaping "$", "@" or "/".
for my $annotation (
  '<>:xip/c:AP$<b>64<i>4<i>11<i>2<b>5',
  '<>:xip/c:ADJ$<b>64<i>4<i>11<i>2<b>6',
  '<>:cnx/c:np$<b>64<i>4<i>30<i>4<b>0',
  '<>:xip/c:NP$<b>64<i>4<i>30<i>4<b>3',
  '<>:xip/c:NPA$<b>64<i>4<i>30<i>4<b>4',
  '>:mate/d:NK$<b>32<i>3',
  '_1$<i>4<i>11',
  'cnx/l:letzt',
  'cnx/p:A',
  'cnx/syn:@PREMOD',
  'corenlp/p:ADJ',
  'glemm/l:__letzt-',
  'i:letzten',
  'mate/l:letzter',
  'mate/m:case:dat',
  'mate/m:degree:pos',
  'mate/m:gender:neut',
  'mate/m:number:sg',
  'mate/p:ADJA',
  'opennlp/p:ADJA',
  's:letzten',
  'spacy/l:letzter',
  'spacy/p:ADJ',
  'tt/l:letzt',
  'tt/p:ADJA',
  'xip/l:letzt',
  'xip/p:ADJ',
) {
  like($token, qr/\Q$annotation\E/, "Stream position 1 contains $annotation");
}

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');
270
271
# Convert a document of the NKJP example corpus using NKJP#Morpho
# tokens and "--lang en", then check the resulting corpus title.
$input = catdir($f, qw/.. real corpus NKJP NKJP KOT/);

my @nkjp_options = (
  '--input'  => $input,
  '--output' => $output,
  '--cache'  => $cache,
  '-t'       => 'NKJP#Morpho',
  '-l'       => 'INFO',
  '--lang'   => 'en',
);

# The trailing " -w " is part of the call
$call = join(' ', 'perl', $script, @nkjp_options) . ' -w ';

stderr_like(sub { system($call) }, qr!The code took!, $call);

ok(-f $output, 'Output does exist');

$file = Mojo::File->new($output)->slurp;
ok($file, 'Slurp data');

$json = decode_json $file;
ok($json, 'decode json');

is(
  $json->{corpusTitle},
  'National Corpus of Polish -- the 1 million word subcorpus',
  'Title'
);

done_testing;
__END__
Akron5f51d422016-08-16 16:26:43 +0200306