blob: 084908bc502a2c55b131b5564114d94bdaaec9ed [file] [log] [blame]
#!/usr/bin/env perl
2use strict;
3use warnings;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catdir catfile/;
Akrond5bb4342017-06-19 11:50:49 +02006use File::Temp qw/:POSIX tempdir/;
Akron03b24db2016-08-16 20:54:32 +02007use Mojo::JSON qw/decode_json/;
8use IO::Uncompress::Gunzip;
9use Test::More;
10use Test::Output;
11use Data::Dumper;
Nils Diewaldb3e9ccd2016-10-24 15:16:52 +020012use KorAP::XML::Archive;
Akron03b24db2016-08-16 20:54:32 +020013use utf8;
14
# Directory containing this test file; every fixture path is built from it.
my $f = dirname(__FILE__);

# Path of the korapxml2krill script, two directory levels above $f.
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');

# Shell command line for the bare "extract" sub-command (no options yet).
my $call = join ' ', 'perl', $script, 'extract';

# Skip the whole test file when KorAP::XML::Archive::test_unzip returns false.
plan(skip_all => 'unzip not found')
  unless KorAP::XML::Archive::test_unzip();
27
# Run "extract" without any options: stdout has to contain "extract",
# followed somewhere later by "$ korapxml2krill".
stdout_like(
  sub { system($call) },
  qr!extract.+?\$ korapxml2krill!s,
  $call
);
36
# Fixture archive used by the basic extraction tests.
my $input = catfile($f, '..', 'corpus', 'archive.zip');
ok(-f $input, 'Input archive found');

# Fresh output directory; CLEANUP => 1 asks File::Temp to remove it on exit.
my $output = tempdir(CLEANUP => 1);
ok(-d $output, 'Output directory exists');

# Temporary file name passed as --cache to every extract call in this file.
my $cache = tmpnam();

$call = join ' ', (
  'perl', $script, 'extract',
  '--input',  $input,
  '--output', $output,
  '--cache',  $cache,
);

# my $sep = qr!\.\.\.[\n\r]+?\.\.\.!;

# Test without compression
stdout_like(
  sub { system($call) },
  qr!TEST/BSP/1 .* extracted!s,
  # qr!TEST/BSP/1 $sep extracted.!s,
  $call
);

# TEST/BSP/1 itself and its three sub-directories must exist ...
for my $sub ([], ['base'], ['sgbr'], ['struct']) {
  ok(-d catdir($output, 'TEST', 'BSP', '1', @$sub), 'Directory created');
}

# ... together with its two XML files.
for my $file ('data.xml', 'header.xml') {
  ok(-f catfile($output, 'TEST', 'BSP', '1', $file), 'File created');
}

# No sigle filter was given, so the neighbouring texts are expected as well.
for my $text ('2', '3') {
  ok(-d catdir($output, 'TEST', 'BSP', $text), 'Directory created');
}
74
# Check sigles: extract only the text TEST/BSP/4 into a fresh directory.
my $output2 = tempdir(CLEANUP => 1);
ok(-d $output2, 'Output directory exists');

$call = join(
  ' ',
  'perl', $script,
  'extract',
  '--input' => $input,
  '--output' => $output2,
  '--cache' => $cache,
  '-sg' => 'TEST/BSP/4'
);

# The requested sigle is reported as extracted
stdout_like(
  sub {
    system($call);
  },
  # qr!TEST/BSP/4 $sep extracted.!s,
  qr!TEST/BSP/4 .* extracted.!s,
  $call
);

# A sigle that was not requested is not reported (the command runs again)
stdout_unlike(
  sub {
    system($call);
  },
  # qr!TEST/BSP/5 $sep extracted.!s,
  qr!TEST/BSP/5 .* extracted.!s,
  $call
);

# Only the requested text may exist on disk. The negated checks are labelled
# 'Directory not created' so the TAP output states what is actually asserted.
ok(!-d catdir($output2, 'TEST', 'BSP', '1'), 'Directory not created');
ok(!-d catdir($output2, 'TEST', 'BSP', '2'), 'Directory not created');
ok(!-d catdir($output2, 'TEST', 'BSP', '3'), 'Directory not created');
ok(-d catdir($output2, 'TEST', 'BSP', '4'), 'Directory created');
ok(!-d catdir($output2, 'TEST', 'BSP', '5'), 'Directory not created');
114
# Test with document sigle: REI/BNG selects a whole document, not one text.
# The archive is a file, so its path is built with catfile (not catdir),
# matching the other archive paths in this script.
my $input_rei = catfile($f, '..', 'corpus', 'archive_rei.zip');
ok(-f $input_rei, 'Input archive found');

# Extracts into the $output2 directory already used by the previous section.
$call = join(
  ' ',
  'perl', $script,
  'extract',
  '--input' => $input_rei,
  '--output' => $output2,
  '--cache' => $cache,
  '-sg' => 'REI/BNG'
);

# The requested document sigle shows up in the output
stdout_like(
  sub {
    system($call);
  },
  qr!Extract .+? REI/BNG!s,
  $call
);

# The other document is not mentioned (the command runs again)
stdout_unlike(
  sub {
    system($call);
  },
  qr!Extract .+? REI/RBR!s,
  $call
);

# Both texts of REI/BNG are on disk, the REI/RBR text is not.
ok(-d catdir($output2, 'REI', 'BNG', '00071'), 'Directory created');
ok(-d catdir($output2, 'REI', 'BNG', '00128'), 'Directory created');
ok(!-d catdir($output2, 'REI', 'RBR', '00610'), 'Directory not created');
150
# Test with document sigle, this time given as the wildcard pattern REI/BN*.
# A new output directory is used for this run.
undef $output2;
$output2 = tempdir(CLEANUP => 1);

$call = join ' ', (
  'perl', $script, 'extract',
  '--input',  $input_rei,
  '--output', $output2,
  '--cache',  $cache,
  '-sg',      'REI/BN*',
);

# Output mentions the extraction of something starting with REI/BN
stdout_like(
  sub { system($call) },
  qr!Extract .+? REI/BN!s,
  $call
);

# Nothing below REI/RBR is reported as extracted (the command runs again)
stdout_unlike(
  sub { system($call) },
  qr!REI/RBR .* extracted!s,
  $call
);

# Both REI/BNG texts exist on disk ...
for my $text ('00071', '00128') {
  ok(-d catdir($output2, 'REI', 'BNG', $text), 'Directory created');
}

# ... while the REI/RBR text does not.
ok(!-d catdir($output2, 'REI', 'RBR', '00610'), 'Directory not created');
186
187
# Check multiple archives: three wpd15-single archives are passed through
# repeated -i options and extracted into one new output directory.
$output = tempdir(CLEANUP => 1);
ok(-d $output, 'Output directory exists');

$call = join ' ', (
  'perl', $script, 'extract',
  (
    map { ('-i', catfile($f, '..', 'corpus', 'archives', $_)) }
      'wpd15-single.zip',
      'wpd15-single.tree_tagger.zip',
      'wpd15-single.opennlp.zip'
  ),
  '--output', $output,
  '--cache',  $cache,
);

# The text sigle is reported as extracted
stdout_like(
  sub { system($call) },
  qr!WPD15/A00/00081 .* extracted!s,
  $call
);

# Text directory, document header and base directory are all present.
ok(-d catdir($output, 'WPD15', 'A00', '00081'), 'Directory created');
ok(-f catfile($output, 'WPD15', 'A00', 'header.xml'), 'Header file created');
ok(-d catdir($output, 'WPD15', 'A00', '00081', 'base'), 'Directory created');

# Each additional archive contributed a morpho.xml in its own sub-directory.
for my $layer ('tree_tagger', 'opennlp') {
  ok(-f catfile($output, 'WPD15', 'A00', '00081', $layer, 'morpho.xml'), 'New archive');
}
218
Akron03b24db2016-08-16 20:54:32 +0200219
# With quotes:
# Test with a text sigle that contains a space and double quotes.
my $input_quotes = catfile($f, '..', 'corpus', 'archive_quotes.zip');

# Check the archive built just above, not the $input fixture from the first
# section, so that a missing quotes archive is reported here.
ok(-f $input_quotes, 'Input archive found');

# New output directory for this run.
$output2 = tempdir(CLEANUP => 1);

# The sigle is wrapped in shell double quotes with the inner quotes escaped,
# because $call is handed to system() as a single string.
$call = join(
  ' ',
  'perl', $script,
  'extract',
  '--input' => $input_quotes,
  '--output' => $output2,
  '--cache' => $cache,
  '-sg' => '"TEST/BSP \"Example\"/1"'
);

# The quoted sigle is reported as extracted
stdout_like(
  sub {
    system($call);
  },
  qr!TEST/BSP "Example"\/1 .* extracted!s,
  # qr!TEST/BSP "Example"\/1 $sep extracted!s,
  # qr!Extract .+? TEST/BSP "Example"\/1!s,
  $call
);

done_testing;
__END__