#!/usr/bin/env perl
use strict;
use warnings;
use File::Basename 'dirname';
use File::Spec::Functions qw/catdir catfile/;
use File::Temp qw/ :POSIX /;
use Mojo::Util qw/slurp/;
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip;
use Test::More;
use Test::Output;
use Data::Dumper;
use utf8;

# All paths are resolved relative to the directory of this test file.
my $f = dirname(__FILE__);

# The command line script under test
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
ok(-f $script, 'Script found');

# Example document used as conversion input
my $input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
ok(-d $input, 'Input directory found');

# tmpnam() only hands back a file name; the file itself must not exist yet,
# as the following tests check that the script creates it.
my $output = tmpnam();

ok(!-f $output, 'Output does not exist');

# Convert the example document with OpenNLP as the token source.
# NOTE: the command is handed to system() as a single string, so it is
# split on whitespace - paths containing blanks would break the call.
my $call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '-t' => 'OpenNLP#Tokens',
  '-l' => 'INFO'
);

# Test without compression
stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');

# Read the plain JSON output and decode it
my $file = slurp($output);
ok($file, 'Slurp data');
my $json = decode_json($file);
ok($json, 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

# Repeat the previous call, this time with the -z switch
$call .= ' -z';

# Test with compression
stderr_like(
  sub { system($call); },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');

# Uncompress the data using a buffer.
# Transparent => 0 makes the reader reject input that is not gzip data
# instead of passing it through unchanged.
my $gz = IO::Uncompress::Gunzip->new($output, Transparent => 0)
  or die "Unable to gunzip $output: $IO::Uncompress::Gunzip::GunzipError";

$file = '';
my $buffer;

# read() returns the number of bytes read, 0 at EOF and a negative value
# on error - only continue while data was actually delivered.
my $status;
while (($status = $gz->read($buffer)) > 0) {
  $file .= $buffer;
};
is($status, 0, 'Gunzip read until EOF');
$gz->close;

ok($json = decode_json($file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

# Use a different token source and skip all annotations,
# except for DeReKo#Structure and Mate#Dependency
$call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '-t' => 'CoreNLP#Tokens',
  '-s' => '#all',
  '-a' => 'DeReKo#Structure',
  '-a' => 'Mate#Dependency',
  '-l' => 'INFO'
);

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = slurp($output)), 'Slurp data');
ok(($json = decode_json($file)), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');

is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'corenlp#tokens', 'TokenSource');

# Only the explicitly requested annotation layers may show up
is($json->{data}->{foundries}, 'dereko dereko/structure mate mate/dependency', 'Foundries');

like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>20', 'Tokens');


# Check overwrite
# The output file of the preceding run is deliberately left in place, so
# running the conversion again has to report the existing file.
$call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '-t' => 'CoreNLP#Tokens',
  '-s' => '#all',
  '-a' => 'DeReKo#Structure',
  '-a' => 'Mate#Dependency',
  '-l' => 'DEBUG'
);

ok(-f $output, 'Output does exist');
stderr_like(
  sub {
    system($call);
  },
  qr!already exists!,
  $call
);

# With -w the complaint about the existing file is expected to vanish
$call .= ' -w ';

stderr_unlike(
  sub {
    system($call);
  },
  qr!already exists!,
  $call
);

# stderr_unlike() alone would also pass if the command printed nothing at
# all, so make sure the output file is still there afterwards.
ok(-f $output, 'Output does exist after overwrite');

# Check meta data switch

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

# Switch to a document from the sgbr example corpus
$input = catdir($f, '..', 'sgbr', 'PRO-DUD', 'BSP-2013-01', '32');
ok(-d $input, 'Input directory found');

# Convert with the meta data switch (-m) set to 'Sgbr' and
# Base#Tokens_aggr as the token source
$call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '-m' => 'Sgbr',
  '-t' => 'Base#Tokens_aggr',
  '-l' => 'INFO'
);

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = slurp($output)), 'Slurp data');
ok(($json = decode_json($file)), 'decode json');

is($json->{data}->{text}, 'Selbst ist der Jeck', 'Text');
is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'TokenSource');
is($json->{pubPlace}, 'Stadtingen', 'pubPlace');
is($json->{textSigle}, 'PRO-DUD/BSP-2013-01/32', 'textSigle');
is($json->{docSigle}, 'PRO-DUD/BSP-2013-01', 'docSigle');
is($json->{corpusSigle}, 'PRO-DUD', 'corpusSigle');
is($json->{sgbrKodex}, 'T', 'sgbrKodex');
is($json->{author}, 'unbekannt', 'Author');
is($json->{language}, 'de', 'Language');
is($json->{docTitle}, 'Korpus zur Beobachtung des Schreibgebrauchs im Deutschen', 'docTitle');
is($json->{funder}, 'Bundesministerium für Bildung und Forschung', 'funder');
is($json->{title}, 'Nur Platt, kein Deutsch', 'title');
is($json->{pubDate}, '20130126', 'pubDate');
is($json->{docSubTitle}, 'Subkorpus Ortsblatt, Jahrgang 2013, Monat Januar', 'docSubTitle');
is($json->{keywords}, 'sgbrKodex:T', 'keywords');
is($json->{publisher}, 'Dorfblatt GmbH', 'publisher');

# Remove the temporary output so the test run leaves no file behind
unlink $output;

done_testing();
__END__