#!/usr/bin/env perl
# Tests for the 'extract' command of the korapxml2krill script.
use strict;
use warnings;
use File::Basename 'dirname';
use File::Spec::Functions qw/catdir catfile/;
use File::Temp qw/:POSIX tempdir/;
use Mojo::Util qw/slurp/;
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip;
use Test::More;
use Test::Output;
use Data::Dumper;
use KorAP::XML::Archive;
use utf8;

# Locate the script under test relative to the directory of this test file
my $f = dirname(__FILE__);
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');

# Command line invoking the 'extract' command without any options.
# The call is a single string, so system() hands it to the shell.
my $call = join(
  ' ',
  'perl', $script,
  'extract'
);

# Skip the whole test file when KorAP::XML::Archive::test_unzip
# reports false (i.e. no usable unzip was found)
unless (KorAP::XML::Archive::test_unzip) {
  plan skip_all => 'unzip not found';
};

# Run 'extract' without options: the output has to mention 'extract'
# followed by a '$ korapxml2krill' example line (presumably the usage help)
stdout_like(
  sub { system($call) },
  qr!extract.+?\$ korapxml2krill!s,
  $call
);

# Archive used as the input of the extraction tests
my $input = catfile($f, '..', 'corpus', 'archive.zip');
ok(-f $input, 'Input archive found');

# Temporary target directory (File::Temp removes it at program exit)
my $output = tempdir(CLEANUP => 1);
ok(-d $output, 'Output directory exists');

# Temporary file name handed to the script via --cache
my $cache = tmpnam();

# Extract the complete archive into the temporary directory
my @extract_all = (
  'perl', $script,
  'extract',
  '--input'  => $input,
  '--cache'  => $cache,
  '--output' => $output,
);
$call = join(' ', @extract_all);

# Matches the end of one progress line and the beginning of the next,
# i.e. '...', at least one line break, '...'
my $sep = qr!\.\.\.[\n\r]+?\.\.\.!;

# Extract the whole archive: text TEST/BSP/1 has to be reported as extracted
stdout_like(
  sub { system($call) },
  qr!TEST/BSP/1 $sep extracted!s,
  $call
);

# Text 1 is expected with its annotation directories and primary files
my @text_dir = ($output, 'TEST', 'BSP', '1');
ok(-d catdir(@text_dir), 'Directory created');
ok(-d catdir(@text_dir, $_), 'Directory created') for qw/base sgbr struct/;
ok(-f catfile(@text_dir, $_), 'File created') for qw/data.xml header.xml/;

# Without a sigle restriction the other texts are extracted as well
ok(-d catdir($output, 'TEST', 'BSP', $_), 'Directory created') for qw/2 3/;

# Check sigles: restrict the extraction to a single text via -sg
my $output2 = tempdir(CLEANUP => 1);
ok(-d $output2, 'Output directory exists');

$call = join(
  ' ',
  'perl', $script,
  'extract',
  '--input' => $input,
  '--output' => $output2,
  '--cache' => $cache,
  '-sg' => 'TEST/BSP/4'
);

# The requested text is reported as extracted ...
stdout_like(
  sub {
    system($call);
  },
  qr!TEST/BSP/4 $sep extracted.!s,
  $call
);

# ... while a text that was not requested is not reported
# (this runs the extraction a second time)
stdout_unlike(
  sub {
    system($call);
  },
  qr!TEST/BSP/5 $sep extracted.!s,
  $call
);

# Only the directory of the requested text may exist afterwards.
# The negated checks are labelled 'not created' so that the test
# output describes what is actually asserted.
ok(!-d catdir($output2, 'TEST', 'BSP', '1'), 'Directory not created');
ok(!-d catdir($output2, 'TEST', 'BSP', '2'), 'Directory not created');
ok(!-d catdir($output2, 'TEST', 'BSP', '3'), 'Directory not created');
ok(-d catdir($output2, 'TEST', 'BSP', '4'), 'Directory created');
ok(!-d catdir($output2, 'TEST', 'BSP', '5'), 'Directory not created');

# Test with document sigle: -sg names a document instead of a single text.
# The archive is a file, so its path is built with catfile (not catdir).
my $input_rei = catfile($f, '..', 'corpus', 'archive_rei.zip');
ok(-f $input_rei, 'Input archive found');

$call = join(
  ' ',
  'perl', $script,
  'extract',
  '--input' => $input_rei,
  '--output' => $output2,
  '--cache' => $cache,
  '-sg' => 'REI/BNG'
);

# The requested document sigle is announced in the output ...
stdout_like(
  sub {
    system($call);
  },
  qr!Extract .+? REI/BNG!s,
  $call
);

# ... while a document that was not requested is not announced
# (this runs the extraction a second time)
stdout_unlike(
  sub {
    system($call);
  },
  qr!Extract .+? REI/RBR!s,
  $call
);

# The texts of REI/BNG are extracted, the text of REI/RBR is not
ok(-d catdir($output2, 'REI', 'BNG', '00071'), 'Directory created');
ok(-d catdir($output2, 'REI', 'BNG', '00128'), 'Directory created');
ok(!-d catdir($output2, 'REI', 'RBR', '00610'), 'Directory not created');

# Test with a wildcard sigle: start with a fresh, empty output directory
$output2 = tempdir(CLEANUP => 1);

my @extract_wildcard = (
  'perl', $script,
  'extract',
  '--input'  => $input_rei,
  '--output' => $output2,
  '--cache'  => $cache,
  '-sg'      => 'REI/BN*',
);
$call = join(' ', @extract_wildcard);

# The wildcard sigle is announced as given
stdout_like(
  sub { system($call) },
  qr!Extract .+? REI/BN\*!s,
  $call
);

# No extraction is reported for REI/RBR (second run of the extraction)
stdout_unlike(
  sub { system($call) },
  qr!REI/RBR $sep extracted!s,
  $call
);

# The texts of REI/BNG are extracted, the text of REI/RBR is not
ok(-d catdir($output2, 'REI', 'BNG', $_), 'Directory created') for qw/00071 00128/;
ok(!-d catdir($output2, 'REI', 'RBR', '00610'), 'Directory not created');

# Check multiple archives: several -i options extract into one directory
$output = tempdir(CLEANUP => 1);
ok(-d $output, 'Output directory exists');

# Base archive followed by two annotation archives
my @archive_names = qw/
  wpd15-single.zip
  wpd15-single.tree_tagger.zip
  wpd15-single.opennlp.zip
/;

$call = join(
  ' ',
  'perl', $script,
  'extract',
  (map { ('-i' => catfile($f, '..', 'corpus', 'archives', $_)) } @archive_names),
  '--output' => $output
);

# The text has to be reported as extracted
stdout_like(
  sub { system($call) },
  qr!WPD15/A00/00081 $sep extracted!s,
  $call
);

# Content of the base archive
my @doc_dir = ($output, 'WPD15', 'A00');
ok(-d catdir(@doc_dir, '00081'), 'Directory created');
ok(-f catfile(@doc_dir, 'header.xml'), 'Header file created');
ok(-d catdir(@doc_dir, '00081', 'base'), 'Directory created');

# Content contributed by the additional archives
ok(-f catfile(@doc_dir, '00081', $_, 'morpho.xml'), 'New archive') for qw/tree_tagger opennlp/;

# With quotes:
# Test with a sigle containing a space and double quotes
my $input_quotes = catfile($f, '..', 'corpus', 'archive_quotes.zip');

# Check the archive that is actually used by this test
ok(-f $input_quotes, 'Input archive found');
$output2 = tempdir(CLEANUP => 1);

# The sigle is wrapped in double quotes with its inner quotes escaped,
# because the call is a single string passed to system()
$call = join(
  ' ',
  'perl', $script,
  'extract',
  '--input' => $input_quotes,
  '--output' => $output2,
  '--cache' => $cache,
  '-sg' => '"TEST/BSP \"Example\"/1"'
);

# The text has to be reported with its unescaped sigle
stdout_like(
  sub {
    system($call);
  },
  qr!TEST/BSP "Example"\/1 $sep extracted!s,
  $call
);

done_testing;
__END__