blob: 8f2c4dbdde70303e05fec8064e9cf5090814a9e4 [file] [log] [blame]
#!/usr/bin/env perl
2use strict;
3use warnings;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catdir catfile/;
Akrond5bb4342017-06-19 11:50:49 +02006use File::Temp qw/:POSIX/;
Akron3ec0a1c2017-01-18 14:41:55 +01007use Mojo::File;
Akrone1dbc382016-07-08 22:24:52 +02008use Mojo::JSON qw/decode_json/;
9use IO::Uncompress::Gunzip;
10use Test::More;
Akron5f51d422016-08-16 16:26:43 +020011use Test::Output;
12use Data::Dumper;
Akronf98b6692016-08-16 19:17:44 +020013use utf8;
Akrone1dbc382016-07-08 22:24:52 +020014
# Resolve the converter script and the sample document relative to
# the directory this test file lives in
my $f      = dirname(__FILE__);
my $script = catfile($f, qw/.. .. script korapxml2krill/);

my $input = catdir($f, qw/.. annotation corpus doc 0001/);
ok(-d $input, 'Input directory found');

# Temporary paths handed to the converter via --output and --cache.
# tmpnam() is called in scalar context on purpose: each call is meant
# to yield a single path string.
my $output = tmpnam();
my $cache  = tmpnam();

# Precondition for all following runs: nothing has been written yet
ok(!-f $output, 'Output does not exist');
Akrone1dbc382016-07-08 22:24:52 +020026
# Build the converter command line as a single string.
# The string is passed to system() and also serves as the description
# of the stderr test, so a failing run shows the exact command.
my $call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-k' => 0.03,
  '-t' => 'OpenNLP#Tokens',
  '-l' => 'INFO'
);

# Test without compression
stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

# $file and $json are declared here at file scope and reused
# by the following sections
ok(-f $output, 'Output does exist');
ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
ok((my $json = decode_json $file), 'decode json');

# Meta data
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');

# Token stream data
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
Akrone1dbc382016-07-08 22:24:52 +020057
# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

# Rerun the previous command with the additional -z switch
$call .= ' -z';

# Test with compression
stderr_like(
  sub { system($call); },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');

# Uncompress the data using a buffer.
# With Transparent => 0 the constructor refuses input that is not
# gzip data, so an uncompressed output file is reported here instead
# of being read through unchanged.
my $gz = IO::Uncompress::Gunzip->new($output, Transparent => 0)
  or die "Unable to gunzip $output: $IO::Uncompress::Gunzip::GunzipError";

$file = '';
my $buffer;
my $status;

# read() returns the number of bytes read, 0 at EOF and a negative
# value on error - only continue while data was actually delivered,
# otherwise a read error would keep the loop spinning
while (($status = $gz->read($buffer)) > 0) {
  $file .= $buffer;
};
$gz->close;

is($status, 0, 'Compressed output read up to EOF');

ok($json = decode_json($file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
is(unlink($output), 1, 'Unlink successful');
ok(!-e $output, 'Output does not exist');
Akron5f51d422016-08-16 16:26:43 +020093
# Use a different token source and skip all annotations,
# except for DeReKo#Structure and Mate#Dependency
$call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'CoreNLP#Tokens',
  '-s' => '#all',
  '-a' => 'DeReKo#Structure',
  '-a' => 'Mate#Dependency',
  '-l' => 'INFO'
);

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');

is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'corenlp#tokens', 'TokenSource');

# Only the two explicitly requested annotation layers may show up
is($json->{data}->{foundries}, 'dereko dereko/structure mate mate/dependency', 'Foundries');

like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>20', 'Tokens');
129
Akrone2b902d2016-08-16 16:50:11 +0200130
# Check overwrite
# The output file of the preceding run is still in place (asserted
# below), so the same conversion is started twice: once plainly and
# once with -w appended.
$call = join ' ' => (
  perl       => $script,
  '--input'  => $input,
  '--output' => $output,
  '--cache'  => $cache,
  '-t'       => 'CoreNLP#Tokens',
  '-s'       => '#all',
  '-a'       => 'DeReKo#Structure',
  '-a'       => 'Mate#Dependency',
  '-l'       => 'DEBUG',
);

ok(-f $output, 'Output does exist');

# Plain rerun: stderr has to mention the existing file
stderr_like(sub { system($call) }, qr!already exists!, $call);

$call .= ' -w ';

# Rerun with -w: the message must not appear
stderr_unlike(sub { system($call) }, qr!already exists!, $call);
163
# Check meta data switch

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

$input = catdir($f, '..', 'sgbr', 'PRO-DUD', 'BSP-2013-01', '32');

# Convert a document from the sgbr test corpus, passing 'Sgbr' to the
# -m switch and using Base#Tokens_aggr as token source
# NOTE(review): -m presumably selects the meta data handling - confirm
# against the script's documentation
$call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-m' => 'Sgbr',
  '-t' => 'Base#Tokens_aggr',
  '-l' => 'INFO'
);

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

is($json->{data}->{text}, 'Selbst ist der Jeck', 'Text');
is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'TokenSource');
is($json->{pubPlace}, 'Stadtingen', 'pubPlace');
is($json->{textSigle}, 'PRO-DUD/BSP-2013-01/32', 'textSigle');
is($json->{docSigle}, 'PRO-DUD/BSP-2013-01', 'docSigle');
is($json->{corpusSigle}, 'PRO-DUD', 'corpusSigle');
is($json->{sgbrKodex}, 'T', 'sgbrKodex');
is($json->{author}, 'unbekannt', 'Author');
is($json->{language}, 'de', 'Language');
is($json->{docTitle}, 'Korpus zur Beobachtung des Schreibgebrauchs im Deutschen', 'docTitle');
is($json->{funder}, 'Bundesministerium für Bildung und Forschung', 'funder');
is($json->{title}, 'Nur Platt, kein Deutsch', 'title');
is($json->{pubDate}, '20130126', 'pubDate');
is($json->{docSubTitle}, 'Subkorpus Ortsblatt, Jahrgang 2013, Monat Januar', 'docSubTitle');
is($json->{keywords}, 'sgbrKodex:T', 'keywords');
is($json->{publisher}, 'Dorfblatt GmbH', 'publisher');
213
Akron636bd9c2017-02-09 17:13:00 +0100214
215
# AGA with base info
# Start from a clean slate and switch to the GOE2/AGA sample document
unlink $output;
ok(!-f $output, 'Output does not exist');

$input = catdir($f, qw/.. corpus GOE2 AGA 03828/);
ok(-d $input, 'Input directory found');

ok(!-f $output, 'Output does not exist');

# DeReKo#Structure is passed to each of the -bs, -bp and -bpb switches
$call = join ' ' => (
  perl       => $script,
  '--input'  => $input,
  '--output' => $output,
  '--cache'  => $cache,
  '-t'       => 'base#tokens_aggr',
  '-bs'      => 'DeReKo#Structure',
  '-bp'      => 'DeReKo#Structure',
  '-bpb'     => 'DeReKo#Structure',
  '-l'       => 'INFO',
);

stderr_like(sub { system($call) }, qr!The code took!, $call);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

is($json->{title}, 'Autobiographische Einzelheiten', 'title');

# The last term at the first token position is expected to be the
# page break marker
is($json->{data}{stream}[0][-1], '~:base/s:pb$<i>529<i>0', 'Pagebreak annotation');
250
Akron263274c2019-02-07 09:48:30 +0100251
252
# Koral version
# Convert the first sample document again, this time passing 0.4 to -k.
# With that setting the meta data is expected as a list of koral:field
# objects under "fields" instead of top-level keys.
$input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
$call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'OpenNLP#Tokens',
  '-k' => 0.4,
  '-l' => 'INFO'
);

# The output of the previous section has not been removed,
# so overwriting needs to be requested
$call .= ' -w ';

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

# No meta data at the top level
ok(!$json->{textType}, 'text type');
ok(!$json->{title}, 'Title');

# Expected fields as [position, key, type, value].
# An array reference as value lists the expected leading elements
# of a multi-valued field.
for my $expected (
  [0,  'corpusSigle', 'type:string',      'Corpus'],
  [4,  'distributor', 'type:attachement', 'data:,Institut für Deutsche Sprache'],
  [9,  'textClass',   'type:keywords',    ['freizeit-unterhaltung', 'vereine-veranstaltungen']],
  [14, 'textType',    'type:string',      'Zeitung: Tageszeitung'],
  [22, 'title',       'type:text',        'Beispiel Text'],
) {
  my ($pos, $key, $type, $value) = @$expected;
  my $field = $json->{fields}->[$pos];

  is($field->{key}, $key, "Field $pos: key is $key");

  if (ref $value) {
    is($field->{value}->[$_], $value->[$_], "Field $key: value $_")
      for 0 .. $#{$value};
  }
  else {
    is($field->{value}, $value, "Field $key: value");
  };

  is($field->{type}, $type, "Field $key: type");
  is($field->{'@type'}, 'koral:field', "Field $key: \@type");
};

# The token stream data is expected to be the same as in the first run
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');


done_testing;
320__END__
Akron5f51d422016-08-16 16:26:43 +0200321