blob: 290784be4e7fb7ac3795b8a4e09cdcc1ec6357cd [file] [log] [blame]
#!/usr/bin/env perl
2use strict;
3use warnings;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catdir catfile/;
Akrond5bb4342017-06-19 11:50:49 +02006use File::Temp qw/:POSIX/;
Akron3ec0a1c2017-01-18 14:41:55 +01007use Mojo::File;
Akron821db3d2017-04-06 21:19:31 +02008use Mojo::Util qw/quote/;
Akron7d4cdd82016-08-17 21:39:45 +02009use Mojo::JSON qw/decode_json/;
10use IO::Uncompress::Gunzip;
11use Test::More;
Akron63d03ee2019-02-13 18:49:38 +010012use Test::Output qw/:stdout :stderr :combined :functions/;
Akron7d4cdd82016-08-17 21:39:45 +020013use Data::Dumper;
Nils Diewaldb3e9ccd2016-10-24 15:16:52 +020014use KorAP::XML::Archive;
Akron7d4cdd82016-08-17 21:39:45 +020015use utf8;
16
# Locate the korapxml2krill script relative to the directory of this test file.
my $f      = dirname(__FILE__);
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');

# Shell command line for the "archive" subcommand without any options.
my $call = join ' ', 'perl', $script, 'archive';

# Skip the whole file when KorAP::XML::Archive::test_unzip returns false
# (reported as "unzip not found").
plan skip_all => 'unzip not found'
  unless KorAP::XML::Archive::test_unzip();
29
# Run the archive command without options: STDOUT has to match the pattern
# below ("archive", later followed by "$ korapxml2krill").
stdout_like(
  sub { system($call) },
  qr!archive.+?\$ korapxml2krill!s,
  $call
);
38
# Fixture archive used as input for the first conversion run
my $input = catfile($f, '..', 'corpus', 'archive.zip');
ok(-f $input, 'Input archive found');

# Output directory; automatic removal is disabled via CLEANUP => 0 and
# unlink_on_destroy(0)
my $output = File::Temp->newdir(CLEANUP => 0);
$output->unlink_on_destroy(0);

# Temporary file name that is passed to the script as --cache
my $cache = tmpnam();

ok(-d $output, 'Output directory exists');

$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => '' . $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'Base#tokens_aggr',
  '-m' => 'Sgbr'
);

# Test without compression
my $json;
{
  # Suppress warnings raised in this process while the script runs
  local $SIG{__WARN__} = sub {};
  my $out = stdout_from(sub { system($call); });

  like($out, qr!TEST-BSP-1\.json!s, $call);

  # Capture in list context: $json is simply undef when no "Processed" line
  # was printed, without relying on $1 after an unchecked match
  ($json) = $out =~ m!Processed (.+?\.json)!;
};

# Guard with defined() so a missing path fails the test instead of also
# triggering an "uninitialized value" warning in -f
ok(defined($json) && -f $json, 'Json file exists');
ok((my $file = Mojo::File->new($json)->slurp), 'Slurp data');

# From here on $json holds the decoded document instead of the file path
ok(($json = decode_json $file), 'decode json');

is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences dereko dereko/structure sgbr sgbr/lemma sgbr/morpho', 'Foundries');
is($json->{sgbrKodex}, 'M', 'Kodex meta data');
78
Akron3ec48972016-08-17 23:24:52 +020079
# Use directory
$input = catdir($f, '..', 'annotation', 'corpus');

$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'Tree_Tagger#Tokens',
  '-j' => 4 # 4 jobs!
);

{
  # Suppress warnings raised in this process while the script runs
  local $SIG{__WARN__} = sub {};

  # That's not really stable on slow machines!
  my $out = stdout_from(sub { system($call); });

  # Capture the process ids in list context, so a failed second match yields
  # undef instead of silently reusing $1 from the first match
  my ($pid1) = $out =~ m!\[\$(\d+?):1\/2\]!s;
  ok(defined($pid1), $call . ' pid 1');
  my ($pid2) = $out =~ m!\[\$(\d+?):2\/2\]!s;
  ok(defined($pid2), $call . ' pid 2');

  # The two documents are expected to be reported with different process ids
  isnt($pid1, $pid2, 'No PID match');

  like($out, qr!Processed .+?\/corpus-doc-0001\.json!s, $call);
  like($out, qr!Processed .+?\/corpus-doc-0002\.json!s, $call);

  ok(-d $output, 'Temporary directory still exists');
  my $path_1 = catfile($output, 'corpus-doc-0001.json');
  ok(-f $path_1, 'Json file exists 1');
  my $path_2 = catfile($output, 'corpus-doc-0002.json');
  ok(-f $path_2, 'Json file exists 2');

  # Paths, raw content and decoded documents are kept in separate variables
  my ($content, $doc_1, $doc_2);

  ok(($content = Mojo::File->new($path_1)->slurp), 'Slurp data');
  ok(($doc_1 = decode_json $content), 'decode json');

  is($doc_1->{data}->{tokenSource}, 'tree_tagger#tokens', 'TokenSource');
  is($doc_1->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
  is($doc_1->{textSigle}, 'Corpus/Doc/0001', 'Sigle');

  ok(-f $path_2, 'Json file exists');
  ok(($content = Mojo::File->new($path_2)->slurp), 'Slurp data');
  ok(($doc_2 = decode_json $content), 'decode json');

  is($doc_2->{data}->{tokenSource}, 'tree_tagger#tokens', 'TokenSource');
  is($doc_2->{data}->{foundries}, 'base base/paragraphs base/sentences dereko dereko/structure malt malt/dependency treetagger treetagger/morpho treetagger/sentences', 'Foundries');
  is($doc_2->{textSigle}, 'Corpus/Doc/0002', 'Sigle');
};

ok(-d $output, 'Ouput directory exists');
Akron89df4fa2016-11-04 14:35:37 +0100135
136
# Temporary file name that is passed to the script as -te
my $temp_extract = tmpnam();

# Ignore -te when archive is a directory
$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'Tree_Tagger#Tokens',
  '-j' => 4, # 4 jobs!
  '-te' => $temp_extract
);

{
  # Suppress warnings raised in this process while the script runs
  local $SIG{__WARN__} = sub {};

  # STDOUT and STDERR are captured together
  my $out = combined_from(sub { system($call); });

  # like() instead of ok(... =~ ...): on failure the captured output and the
  # expected pattern are shown in the diagnostics
  like($out, qr!Processed .+?\/corpus-doc-0001\.json!s, $call);
  like($out, qr!Processed .+?\/corpus-doc-0002\.json!s, $call);
};
160
161
$input = catfile($f, '..', 'corpus', 'WDD15', 'A79', '83946');

# Note that this call is built without the "archive" subcommand
$call = join ' ', (
  'perl', $script,
  '--input' => $input,
  '--cache' => $cache
);

# The run is expected to report "no base tokenization" on STDERR
{
  local $SIG{__WARN__} = sub {};
  like(stderr_from(sub { system($call) }), qr!no base tokenization!s, $call);
};
177
# Run against the archive_quotes.zip fixture
my $input_quotes = catfile($f, '..', 'corpus', 'archive_quotes.zip');

$call = join ' ', (
  'perl', $script,
  'archive',
  '--input' => $input_quotes,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'Base#tokens_aggr'
);

# STDOUT has to contain "Done." (matched case-insensitively)
stdout_like(
  sub { system($call) },
  qr!Done\.!is,
  $call
);
197
Akron89df4fa2016-11-04 14:35:37 +0100198
# NOTE(review): $output was created with File::Temp->newdir, and Perl's
# unlink() does not remove directories - this call presumably has no effect.
# Confirm the intent before replacing it with a real cleanup, because $output
# is passed as --output again below.
unlink($output);

# Wrap the glob pattern in single quotes so the shell hands it over unexpanded;
# presumably the script resolves the pattern itself - confirm
$input_quotes = q{'} . catfile($f, '..', 'corpus', 'archives', 'wpd15*.zip') . q{'};

$call = join ' ', (
  'perl', $script,
  'archive',
  '--input' => $input_quotes,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'Base#tokens_aggr'
);

# STDOUT has to list the wpd15-single archives in the order given by the pattern
stdout_like(
  sub { system($call) },
  qr!Input is .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is,
  $call
);
222
Akron31a08cb2019-02-20 20:43:26 +0100223
224
# Test with sigles
$input = catfile($f, '..', 'corpus', 'archive.zip');
ok(-f $input, 'Input archive found');

# NOTE(review): $output was created with File::Temp->newdir, and Perl's
# unlink() does not remove directories - this call presumably has no effect.
unlink($output);

# Unlike the earlier archive runs, this call passes no --cache option
$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => '' . $input,
  '--output' => $output,
  '--sigle' => 'TEST/BSP/2',
  '--sigle' => 'TEST/BSP/5',
  '-t' => 'Base#tokens_aggr',
  '-m' => 'Sgbr'
);

{
  # Suppress warnings raised in this process while the script runs
  local $SIG{__WARN__} = sub {};
  my $out = stdout_from(sub { system($call); });

  # NOTE(review): the call names the sigles TEST/BSP/2 and TEST/BSP/5, yet the
  # assertion looks for TEST-BSP-1.json - confirm that this is intended
  like($out, qr!TEST-BSP-1\.json!s, $call);

  # Capture in list context: $json is simply undef when no "Processed" line
  # was printed, without relying on $1 after an unchecked match
  ($json) = $out =~ m!Processed (.+?\.json)!;
};

# Guard with defined() so a missing path fails the test instead of also
# triggering an "uninitialized value" warning in -f
ok(defined($json) && -f $json, 'Json file exists');


done_testing;
257__END__