#!/usr/bin/env perl
use strict;
use warnings;
use utf8;
use File::Basename 'dirname';
use File::Spec::Functions qw/catdir catfile/;
use File::Temp qw/:POSIX/;
use Mojo::File;
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip qw/$GunzipError/;
use Test::More;
use Test::Output;
use Data::Dumper;

# Resolve the script under test and the sample document relative to
# the directory of this test file.
my $f = dirname(__FILE__);
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');

my $input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
ok(-d $input, 'Input directory found');

# tmpnam() only hands out file names; the files themselves are
# expected to be written by the script under test.
my $output = tmpnam();
my $cache = tmpnam();

# Do not leave the last output file or the cache file behind once the
# test run ends. Only plain files are removed, so this is a no-op if
# nothing was written.
END {
  unlink grep { defined($_) && -f $_ } $output, $cache;
}

ok(!-f $output, 'Output does not exist');

# Command line for a plain (uncompressed) conversion using the
# OpenNLP tokenization. The string is also used as the test name.
my $call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'OpenNLP#Tokens',
  '-l' => 'INFO'
);

# Test without compression:
# the run is expected to report its runtime on STDERR
stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

# The output file has to exist and contain valid JSON
ok(-f $output, 'Output does exist');
ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
ok((my $json = decode_json $file), 'decode json');

# Check meta data and the beginning of the token stream
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

# Repeat the previous call with the -z switch appended
$call .= ' -z';

# Test with compression
stderr_like(
  sub { system($call); },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');

# Uncompress the data using a buffer.
# With Transparent disabled, input that is not gzip data is rejected
# instead of being passed through unchanged.
my $gz = IO::Uncompress::Gunzip->new($output, Transparent => 0)
  or die "Unable to gunzip $output: $GunzipError";

$file = '';
my $buffer;
my $status;

# read() returns the number of bytes read, 0 at the end of the stream
# and a negative value on error - stop on anything that is not data,
# so a broken stream can not keep the loop spinning.
while (($status = $gz->read($buffer)) > 0) {
  $file .= $buffer;
};
$gz->close;

is($status, 0, 'Compressed output was read to the end')
  or diag $GunzipError;

ok($json = decode_json($file), 'decode json');

# The uncompressed data has to match the uncompressed run
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
is(unlink($output), 1, 'Unlink successful');
ok(!-e $output, 'Output does not exist');

# Use a different token source and skip all annotations,
# except for DeReKo#Structure and Mate#Dependency
$call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'CoreNLP#Tokens',
  '-s' => '#all',
  '-a' => 'DeReKo#Structure',
  '-a' => 'Mate#Dependency',
  '-l' => 'INFO'
);

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');

is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'corenlp#tokens', 'TokenSource');

# Only the two explicitly requested foundries are expected
is($json->{data}->{foundries}, 'dereko dereko/structure mate mate/dependency', 'Foundries');

like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>20', 'Tokens');


# Check overwrite:
# The output file is still in place at this point, so a further run
# is expected to complain unless -w is passed.
ok(-f $output, 'Output does exist');

my @overwrite_args = (
  'perl', $script,
  '--input', $input,
  '--output', $output,
  '--cache', $cache,
  '-t', 'CoreNLP#Tokens',
  '-s', '#all',
  '-a', 'DeReKo#Structure',
  '-a', 'Mate#Dependency',
  '-l', 'DEBUG'
);
$call = join ' ', @overwrite_args;

# Without -w the existing file is reported on STDERR
stderr_like(sub { system($call) }, qr!already exists!, $call);

# With -w there is no such report
$call = $call . ' -w ';
stderr_unlike(sub { system($call) }, qr!already exists!, $call);

# Check meta data switch

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

$input = catdir($f, '..', 'sgbr', 'PRO-DUD', 'BSP-2013-01', '32');
ok(-d $input, 'Input directory found');

# Convert a document below the sgbr directory, passing 'Sgbr' to the
# -m switch and using Base#Tokens_aggr as the token source
$call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-m' => 'Sgbr',
  '-t' => 'Base#Tokens_aggr',
  '-l' => 'INFO'
);

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

# Primary data and token source
is($json->{data}->{text}, 'Selbst ist der Jeck', 'Text');
is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'TokenSource');

# Meta data fields expected for this document
is($json->{pubPlace}, 'Stadtingen', 'pubPlace');
is($json->{textSigle}, 'PRO-DUD/BSP-2013-01/32', 'textSigle');
is($json->{docSigle}, 'PRO-DUD/BSP-2013-01', 'docSigle');
is($json->{corpusSigle}, 'PRO-DUD', 'corpusSigle');
is($json->{sgbrKodex}, 'T', 'sgbrKodex');
is($json->{author}, 'unbekannt', 'Author');
is($json->{language}, 'de', 'Language');
is($json->{docTitle}, 'Korpus zur Beobachtung des Schreibgebrauchs im Deutschen', 'docTitle');
is($json->{funder}, 'Bundesministerium für Bildung und Forschung', 'funder');
is($json->{title}, 'Nur Platt, kein Deutsch', 'title');
is($json->{pubDate}, '20130126', 'pubDate');
is($json->{docSubTitle}, 'Subkorpus Ortsblatt, Jahrgang 2013, Monat Januar', 'docSubTitle');
is($json->{keywords}, 'sgbrKodex:T', 'keywords');
is($json->{publisher}, 'Dorfblatt GmbH', 'publisher');



# AGA with base info
unlink $output;
ok(!-f $output, 'Output does not exist');

$input = catdir($f, '..', 'corpus', 'GOE2', 'AGA', '03828');
ok(-d $input, 'Input directory found');

# All three -b* switches point to DeReKo#Structure.
# NOTE(review): presumably these select the source for base sentences,
# paragraphs and pagebreaks - confirm against the script's usage text.
$call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'base#tokens_aggr',
  '-bs' => 'DeReKo#Structure',
  '-bp' => 'DeReKo#Structure',
  '-bpb' => 'DeReKo#Structure',
  '-l' => 'INFO'
);

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);
ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

is($json->{title}, 'Autobiographische Einzelheiten', 'title');

# The last entry at the first stream position is expected to be
# the pagebreak annotation
is($json->{data}->{stream}->[0]->[-1], '~:base/s:pb$<i>529<i>0', 'Pagebreak annotation');

done_testing;
__END__