blob: 4478a5fab7df4520e30d80b6580971e808ebdf2b [file] [log] [blame]
#!/usr/bin/env perl
# Tests for the `extract` command of the korapxml2krill script.
use strict;
use warnings;
use File::Basename 'dirname';
use File::Spec::Functions qw/catdir catfile/;
use File::Temp qw/:POSIX tempdir/;
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip;
use Test::More;
use Test::Output;
use Data::Dumper;
use KorAP::XML::Archive;
use utf8;

# Directory of this test file; all other paths are built relative to it
my $f = dirname(__FILE__);

# Path of the script under test
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');

# Command line that calls the extract command without any options.
# $call is reassigned before each of the following test sections.
my $call = join(
  ' ',
  'perl', $script,
  'extract'
);

# Skip the whole file when KorAP::XML::Archive::test_unzip returns false
unless (KorAP::XML::Archive::test_unzip) {
  plan skip_all => 'unzip not found';
};
27
# Run the bare extract command: the captured output has to mention
# "extract" followed, somewhere later, by a "$ korapxml2krill" line
stdout_like(
  sub { system($call) },
  qr!extract.+?\$ korapxml2krill!s,
  $call
);
36
# Archive that serves as input for the basic extraction tests
my $input = catfile($f, '..', 'corpus', 'archive.zip');
ok(-f $input, 'Input archive found');

# Temporary target directory (CLEANUP => 1 lets File::Temp remove it)
my $output = tempdir(CLEANUP => 1);
ok(-d $output, 'Output directory exists');

# Temporary file name that is passed to the script as --cache
my $cache = tmpnam();

$call = join ' ', (
  'perl', $script, 'extract',
  '--input',  $input,
  '--cache',  $cache,
  '--output', $output
);

# Expected text between a sigle and "extracted":
# three dots, one or more line breaks, three dots
my $sep = qr!\.\.\.[\n\r]+?\.\.\.!;

# Test without compression
stdout_like(
  sub { system($call) },
  qr!TEST/BSP/1 $sep extracted!s,
  $call
);

# Inspect the extracted tree below TEST/BSP
{
  my @doc = ($output, 'TEST', 'BSP');

  # First text, including its annotation folders and primary files
  ok(-d catdir(@doc, '1'), 'Directory created');
  ok(-d catdir(@doc, '1', $_), 'Directory created') for ('base', 'sgbr', 'struct');
  ok(-f catfile(@doc, '1', $_), 'File created') for ('data.xml', 'header.xml');

  # Further texts of the same document
  ok(-d catdir(@doc, $_), 'Directory created') for ('2', '3');
}
74
# Check sigles: restrict the extraction to a single text sigle
my $output2 = tempdir(CLEANUP => 1);
ok(-d $output2, 'Output directory exists');

$call = join(
  ' ',
  'perl', $script,
  'extract',
  '--input' => $input,
  '--output' => $output2,
  '--cache' => $cache,
  '-sg' => 'TEST/BSP/4'
);

# The requested sigle has to be reported as extracted
stdout_like(
  sub {
    system($call);
  },
  qr!TEST/BSP/4 $sep extracted.!s,
  $call
);

# A sigle that was not requested must not be reported as extracted
stdout_unlike(
  sub {
    system($call);
  },
  qr!TEST/BSP/5 $sep extracted.!s,
  $call
);

# Only the directory of the requested text may exist afterwards
ok(!-d catdir($output2, 'TEST', 'BSP', '1'), 'Directory not created');
ok(!-d catdir($output2, 'TEST', 'BSP', '2'), 'Directory not created');
ok(!-d catdir($output2, 'TEST', 'BSP', '3'), 'Directory not created');
ok(-d catdir($output2, 'TEST', 'BSP', '4'), 'Directory created');
ok(!-d catdir($output2, 'TEST', 'BSP', '5'), 'Directory not created');
112
Akron20807582016-10-26 17:11:34 +0200113
# Test with document sigle
# The archive is a file, so its path is built with catfile (not catdir)
my $input_rei = catfile($f, '..', 'corpus', 'archive_rei.zip');
ok(-f $input_rei, 'Input archive found');

$call = join(
  ' ',
  'perl', $script,
  'extract',
  '--input' => $input_rei,
  '--output' => $output2,
  '--cache' => $cache,
  '-sg' => 'REI/BNG'
);

# The requested document sigle has to show up in the "Extract" message
stdout_like(
  sub {
    system($call);
  },
  qr!Extract .+? REI/BNG!s,
  $call
);

# The other document of the archive must not show up there
stdout_unlike(
  sub {
    system($call);
  },
  qr!Extract .+? REI/RBR!s,
  $call
);

# Texts of REI/BNG exist, the text of REI/RBR does not
ok(-d catdir($output2, 'REI', 'BNG', '00071'), 'Directory created');
ok(-d catdir($output2, 'REI', 'BNG', '00128'), 'Directory created');
ok(!-d catdir($output2, 'REI', 'RBR', '00610'), 'Directory not created');
149
Akron2fd402b2016-10-27 21:26:48 +0200150
# Test with a document sigle that ends in a wildcard.
# A fresh output directory is used for this run.
$output2 = tempdir(CLEANUP => 1);

# NOTE(review): $call is handed to system() as one string, so the unquoted
# '*' passes through the shell - confirm that it cannot be glob-expanded
# in the working directory the tests run from.
$call = join ' ', (
  'perl', $script, 'extract',
  '--input',  $input_rei,
  '--output', $output2,
  '--cache',  $cache,
  '-sg',      'REI/BN*'
);

# The sigle pattern is echoed literally in the "Extract" message
stdout_like(
  sub { system($call) },
  qr!Extract .+? REI/BN\*!s,
  $call
);

# The document that does not match the pattern is not extracted
stdout_unlike(
  sub { system($call) },
  qr!REI/RBR $sep extracted!s,
  $call
);

# Both texts of REI/BNG exist, the text of REI/RBR does not
ok(-d catdir($output2, 'REI', 'BNG', $_), 'Directory created') for ('00071', '00128');
ok(!-d catdir($output2, 'REI', 'RBR', '00610'), 'Directory not created');
186
187
188
189
190
191
192
# Check multiple archives: several -i options are passed in one call
$output = tempdir(CLEANUP => 1);
ok(-d $output, 'Output directory exists');

$call = join ' ', (
  'perl', $script, 'extract',
  # One "-i <path>" pair per archive, in the listed order
  (
    map { ('-i', catfile($f, '..', 'corpus', 'archives', $_)) } (
      'wpd15-single.zip',
      'wpd15-single.tree_tagger.zip',
      'wpd15-single.opennlp.zip'
    )
  ),
  '--output', $output
);

# The text contained in the archives is reported as extracted
stdout_like(
  sub { system($call) },
  qr!WPD15/A00/00081 $sep extracted!s,
  $call
);

{
  my @doc  = ($output, 'WPD15', 'A00');
  my @text = (@doc, '00081');

  # Content expected from the base archive
  ok(-d catdir(@text), 'Directory created');
  ok(-f catfile(@doc, 'header.xml'), 'Header file created');
  ok(-d catdir(@text, 'base'), 'Directory created');

  # One morpho.xml per additional annotation archive
  ok(-f catfile(@text, $_, 'morpho.xml'), 'New archive') for ('tree_tagger', 'opennlp');
}
222
Akron03b24db2016-08-16 20:54:32 +0200223
# With quotes:
# Test with a text sigle that contains double quotes
my $input_quotes = catfile($f, '..', 'corpus', 'archive_quotes.zip');

# Check the archive that is actually used in this section
ok(-f $input_quotes, 'Input archive found');
$output2 = undef;
$output2 = tempdir(CLEANUP => 1);

# The sigle is wrapped in double quotes with the inner quotes
# backslash-escaped, as $call is run as a single command string
$call = join(
  ' ',
  'perl', $script,
  'extract',
  '--input' => $input_quotes,
  '--output' => $output2,
  '--cache' => $cache,
  '-sg' => '"TEST/BSP \"Example\"/1"'
);

# The sigle has to be reported with its quotes intact
stdout_like(
  sub {
    system($call);
  },
  qr!TEST/BSP "Example"\/1 $sep extracted!s,
  # qr!Extract .+? TEST/BSP "Example"\/1!s,
  $call
);

done_testing;
__END__