#!/usr/bin/env perl
# Tests for script/korapxml2krill: the script is executed as a child
# process and the JSON file it writes is read back and inspected.
use strict;
use warnings;
use File::Basename 'dirname';
use File::Spec::Functions qw/catdir catfile/;
use File::Temp qw/ :POSIX /;
use Mojo::File;
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip;
use Test::More;
use Test::Output;
use Data::Dumper;
use utf8;

# Directory of this test file - all other paths are resolved relative to it
my $f = dirname(__FILE__);

# The script under test
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');

# Input document for the first runs
my $input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
ok(-d $input, 'Input directory found');

# In scalar context tmpnam() only returns a file name, the file itself
# is not created - it has to be written by the script under test
my $output = tmpnam();

ok(!-f $output, 'Output does not exist');

# Test without compression.
# The command is kept as a list and passed to system() in list form, so
# it is never parsed by a shell and paths containing whitespace or shell
# metacharacters reach the script unchanged. $call is the printable form
# of the command and serves as test description.
my @plain_cmd = (
  'perl', $script,
  '--input'  => $input,
  '--output' => $output,
  '-t'       => 'OpenNLP#Tokens',
  '-l'       => 'INFO'
);
my $call = join(' ', @plain_cmd);

stderr_like(
  sub {
    system(@plain_cmd);
  },
  qr!The code took!,
  $call
);

# Read the written file back and check the decoded JSON
ok(-f $output, 'Output does exist');
ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
ok((my $json = decode_json($file)), 'decode json');
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output, so the next run starts without an existing file
is(unlink($output), 1, 'Unlink successful');
ok(!-e $output, 'Output does not exist');

# Test with compression: the same invocation as for the uncompressed
# run, with -z appended. The command is passed to system() in list form,
# so it is never parsed by a shell.
my @gzip_cmd = (
  'perl', $script,
  '--input'  => $input,
  '--output' => $output,
  '-t'       => 'OpenNLP#Tokens',
  '-l'       => 'INFO',
  '-z'
);
$call = join(' ', @gzip_cmd);

stderr_like(
  sub { system(@gzip_cmd); },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');

# With Transparent => 0 the constructor fails on input that is not
# gzip compressed instead of passing the data through unchanged
my $gz = IO::Uncompress::Gunzip->new($output, Transparent => 0)
  or die "Unable to gunzip $output: $IO::Uncompress::Gunzip::GunzipError";

# Uncompress the data using a buffer.
# read() returns the number of uncompressed bytes, 0 at the end of the
# data and a negative value on error - so only continue on a positive
# status and verify afterwards that the loop ended because of EOF
$file = '';
my $buffer = '';
my $status;
while (($status = $gz->read($buffer)) > 0) {
  $file .= $buffer;
}
is($status, 0, 'Uncompressed without error');
$gz->close;

ok($json = decode_json($file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
is(unlink($output), 1, 'Unlink successful');
ok(!-e $output, 'Output does not exist');

# Use a different token source and skip all annotations,
# except for DeReKo#Structure and Mate#Dependency.
# The command is passed to system() in list form: '#all' starts with
# the comment character of the shell, so this command line must never
# be parsed by one.
my @restricted_cmd = (
  'perl', $script,
  '--input'  => $input,
  '--output' => $output,
  '-t'       => 'CoreNLP#Tokens',
  '-s'       => '#all',
  '-a'       => 'DeReKo#Structure',
  '-a'       => 'Mate#Dependency',
  '-l'       => 'INFO'
);
$call = join(' ', @restricted_cmd);

stderr_like(
  sub {
    system(@restricted_cmd);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json($file)), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');

is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'corenlp#tokens', 'TokenSource');

# Only the two explicitly requested annotation layers are listed
is($json->{data}->{foundries}, 'dereko dereko/structure mate mate/dependency', 'Foundries');

like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>20', 'Tokens');

# The output file is intentionally kept for the following overwrite check

# Check overwrite: the output file of the preceding run is still in
# place, so running the script again has to report it on STDERR.
# The command is passed to system() in list form, as '#all' would start
# a comment if the command line was parsed by a shell.
my @overwrite_cmd = (
  'perl', $script,
  '--input'  => $input,
  '--output' => $output,
  '-t'       => 'CoreNLP#Tokens',
  '-s'       => '#all',
  '-a'       => 'DeReKo#Structure',
  '-a'       => 'Mate#Dependency',
  '-l'       => 'DEBUG'
);
$call = join(' ', @overwrite_cmd);

ok(-f $output, 'Output does exist');
stderr_like(
  sub {
    system(@overwrite_cmd);
  },
  qr!already exists!,
  $call
);

# With -w appended the message must no longer show up
push @overwrite_cmd, '-w';
$call = join(' ', @overwrite_cmd);

stderr_unlike(
  sub {
    system(@overwrite_cmd);
  },
  qr!already exists!,
  $call
);

# Check meta data switch

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

$input = catdir($f, '..', 'sgbr', 'PRO-DUD', 'BSP-2013-01', '32');
ok(-d $input, 'Input directory found');

# Run with the meta data switch (-m) set to Sgbr and Base#Tokens_aggr
# as token source. The command is passed to system() in list form, so
# it is never parsed by a shell.
my @sgbr_cmd = (
  'perl', $script,
  '--input'  => $input,
  '--output' => $output,
  '-m'       => 'Sgbr',
  '-t'       => 'Base#Tokens_aggr',
  '-l'       => 'INFO'
);
$call = join(' ', @sgbr_cmd);

stderr_like(
  sub {
    system(@sgbr_cmd);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json($file)), 'decode json');

# Text and token source
is($json->{data}->{text}, 'Selbst ist der Jeck', 'Text');
is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'TokenSource');

# Meta data fields
is($json->{pubPlace}, 'Stadtingen', 'pubPlace');
is($json->{textSigle}, 'PRO-DUD/BSP-2013-01/32', 'textSigle');
is($json->{docSigle}, 'PRO-DUD/BSP-2013-01', 'docSigle');
is($json->{corpusSigle}, 'PRO-DUD', 'corpusSigle');
is($json->{sgbrKodex}, 'T', 'sgbrKodex');
is($json->{author}, 'unbekannt', 'Author');
is($json->{language}, 'de', 'Language');
is($json->{docTitle}, 'Korpus zur Beobachtung des Schreibgebrauchs im Deutschen', 'docTitle');
is($json->{funder}, 'Bundesministerium für Bildung und Forschung', 'funder');
is($json->{title}, 'Nur Platt, kein Deutsch', 'title');
is($json->{pubDate}, '20130126', 'pubDate');
is($json->{docSubTitle}, 'Subkorpus Ortsblatt, Jahrgang 2013, Monat Januar', 'docSubTitle');
is($json->{keywords}, 'sgbrKodex:T', 'keywords');
is($json->{publisher}, 'Dorfblatt GmbH', 'publisher');

# AGA with base info
unlink $output;
ok(!-f $output, 'Output does not exist');
$input = catdir($f, '..', 'corpus', 'GOE2', 'AGA', '03828');
ok(-d $input, 'Input directory found');

# DeReKo#Structure is passed for each of the -bs, -bp and -bpb options.
# The command is passed to system() in list form, so it is never parsed
# by a shell.
my @aga_cmd = (
  'perl', $script,
  '--input'  => $input,
  '--output' => $output,
  '-t'       => 'base#tokens_aggr',
  '-bs'      => 'DeReKo#Structure',
  '-bp'      => 'DeReKo#Structure',
  '-bpb'     => 'DeReKo#Structure',
  '-l'       => 'INFO'
);
$call = join(' ', @aga_cmd);

stderr_like(
  sub {
    system(@aga_cmd);
  },
  qr!The code took!,
  $call
);
ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json($file)), 'decode json');

is($json->{title}, 'Autobiographische Einzelheiten', 'title');

# The last entry of the first stream position is the page break
is($json->{data}->{stream}->[0]->[-1], '~:base/s:pb$<i>529<i>0', 'Pagebreak annotation');

# Do not leave the temporary output file behind
unlink $output;

done_testing;
__END__
