#!/usr/bin/env perl
# Tests for the "extract" command of the korapxml2krill script.
use strict;
use warnings;
use File::Basename 'dirname';
use File::Spec::Functions qw/catdir catfile/;
use File::Temp qw/:POSIX tempdir/;    # :POSIX supplies tmpnam()
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip;
use Test::More;
use Test::Output;
use Data::Dumper;
use KorAP::XML::Archive;
use utf8;

# The script-level tests can be switched off through the environment.
plan skip_all => 'Skip script tests' if $ENV{SKIP_SCRIPT};

# Locate the korapxml2krill script relative to this test file.
my $f      = dirname(__FILE__);
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');

# Base command line: the "extract" command without any options.
my $call = join(' ', 'perl', $script, 'extract');

# Skip everything when KorAP::XML::Archive::test_unzip returns false.
plan skip_all => 'unzip not found'
  unless KorAP::XML::Archive::test_unzip();

# Test without parameters: STDOUT has to contain "extract" followed, at
# some later point, by "$ korapxml2krill" (presumably the usage synopsis).
stdout_like(
  sub { system($call) },
  qr!extract.+?\$ korapxml2krill!s,
  $call
);

# Fixture archive used by the first extraction runs.
my $input = catfile($f, '..', 'corpus', 'archive.zip');
ok(-f $input, 'Input archive found');

# Extraction target; CLEANUP => 1 asks File::Temp to remove it at exit.
my $output = tempdir(CLEANUP => 1);
ok(-d $output, 'Output directory exists');

# File name handed to --cache in all following calls.
my $cache = tmpnam();

$call = join(
  ' ',
  'perl', $script, 'extract',
  '--input'  => $input,
  '--output' => $output,
  '--cache'  => $cache
);

# Test without compression.
# The pattern is deliberately loose about whatever the script prints
# between the sigle and the word "extracted" (".*" together with /s).
stdout_like(
  sub { system($call) },
  qr!TEST/BSP/1 .* extracted!s,
  $call
);

# TEST/BSP/1 has to be unpacked with its sub directories and XML files ...
my @doc1 = ($output, 'TEST', 'BSP', '1');
ok(-d catdir(@doc1), 'Directory created');
for my $subdir (qw/base sgbr struct/) {
  ok(-d catdir(@doc1, $subdir), 'Directory created');
}
for my $xml (qw/data.xml header.xml/) {
  ok(-f catfile(@doc1, $xml), 'File created');
}

# ... and, as no sigle was given, the sibling texts 2 and 3 as well.
for my $text (qw/2 3/) {
  ok(-d catdir($output, 'TEST', 'BSP', $text), 'Directory created');
}

# Check sigles: extract into a fresh directory, restricted to a single
# text through the -sg option.
my $output2 = tempdir(CLEANUP => 1);
ok(-d $output2, 'Output directory exists');

$call = join(
  ' ',
  'perl', $script,
  'extract',
  '--input' => $input,
  '--output' => $output2,
  '--cache' => $cache,
  '-sg' => 'TEST/BSP/4'
);

# The requested sigle has to be reported as extracted ...
stdout_like(
  sub {
    system($call);
  },
  qr!TEST/BSP/4 .* extracted.!s,
  $call
);

# ... whereas a sigle that was not requested must not be reported.
stdout_unlike(
  sub {
    system($call);
  },
  qr!TEST/BSP/5 .* extracted.!s,
  $call
);

# Only TEST/BSP/4 may exist in the output directory. The negated checks
# are labelled 'Directory not created' so the test output says what is
# actually being asserted.
ok(!-d catdir($output2, 'TEST', 'BSP', '1'), 'Directory not created');
ok(!-d catdir($output2, 'TEST', 'BSP', '2'), 'Directory not created');
ok(!-d catdir($output2, 'TEST', 'BSP', '3'), 'Directory not created');
ok(-d catdir($output2, 'TEST', 'BSP', '4'), 'Directory created');
ok(!-d catdir($output2, 'TEST', 'BSP', '5'), 'Directory not created');

# Test with document sigle: REI/BNG selects a document, so every text
# below it is expected in the output while REI/RBR is left out.
# The archive is a file, hence catfile() rather than catdir().
my $input_rei = catfile($f, '..', 'corpus', 'archive_rei.zip');
ok(-f $input_rei, 'Input archive found');

$call = join(
  ' ',
  'perl', $script,
  'extract',
  '--input' => $input_rei,
  '--output' => $output2,
  '--cache' => $cache,
  '-sg' => 'REI/BNG'
);

# The requested document sigle has to show up in the extraction log ...
stdout_like(
  sub {
    system($call);
  },
  qr!Extract .+? REI/BNG!s,
  $call
);

# ... and the other document of the archive must not.
stdout_unlike(
  sub {
    system($call);
  },
  qr!Extract .+? REI/RBR!s,
  $call
);

ok(-d catdir($output2, 'REI', 'BNG', '00071'), 'Directory created');
ok(-d catdir($output2, 'REI', 'BNG', '00128'), 'Directory created');
ok(!-d catdir($output2, 'REI', 'RBR', '00610'), 'Directory not created');

# Test with a wildcard sigle, extracting into a fresh directory.
$output2 = tempdir(CLEANUP => 1);

# $call is a single string that system() hands to the shell, so the "*"
# is quoted: unquoted, the shell would expand it whenever the working
# directory happens to contain a path matching REI/BN*.
$call = join(
  ' ',
  'perl', $script,
  'extract',
  '--input' => $input_rei,
  '--output' => $output2,
  '--cache' => $cache,
  '-sg' => '"REI/BN*"'
);

# Sigles matching the pattern have to show up in the extraction log ...
stdout_like(
  sub {
    system($call);
  },
  qr!Extract .+? REI/BN!s,
  $call
);

# ... and nothing below REI/RBR may be reported as extracted.
stdout_unlike(
  sub {
    system($call);
  },
  qr!REI/RBR .* extracted!s,
  $call
);

ok(-d catdir($output2, 'REI', 'BNG', '00071'), 'Directory created');
ok(-d catdir($output2, 'REI', 'BNG', '00128'), 'Directory created');
ok(!-d catdir($output2, 'REI', 'RBR', '00610'), 'Directory not created');


# Check multiple archives: three -i options are passed in one call and
# are expected to end up in the same output tree.
$output = tempdir(CLEANUP => 1);
ok(-d $output, 'Output directory exists');

$call = join(
  ' ',
  'perl', $script, 'extract',
  (
    map { ('-i' => catfile($f, '..', 'corpus', 'archives', $_)) }
      'wpd15-single.zip',
      'wpd15-single.tree_tagger.zip',
      'wpd15-single.opennlp.zip'
  ),
  '--output' => $output,
  '--cache' => $cache
);

# The text has to be reported as extracted.
stdout_like(
  sub { system($call) },
  qr!WPD15/A00/00081 .* extracted!s,
  $call
);

# Files coming from the first archive
ok(-d catdir($output, 'WPD15', 'A00', '00081'), 'Directory created');
ok(-f catfile($output, 'WPD15', 'A00', 'header.xml'), 'Header file created');
ok(-d catdir($output, 'WPD15', 'A00', '00081', 'base'), 'Directory created');

# One morpho.xml per additional archive
for my $addon (qw/tree_tagger opennlp/) {
  ok(-f catfile($output, 'WPD15', 'A00', '00081', $addon, 'morpho.xml'), 'New archive');
}


# With quotes:
# Test with a text sigle that contains a space and double quotes.
my $input_quotes = catfile($f, '..', 'corpus', 'archive_quotes.zip');

# Check the archive that is actually used below (not the earlier $input).
ok(-f $input_quotes, 'Input archive found');
$output2 = tempdir(CLEANUP => 1);

# The sigle is wrapped in double quotes, with its inner quotes escaped,
# so that it reaches the script as a single argument.
$call = join(
  ' ',
  'perl', $script,
  'extract',
  '--input' => $input_quotes,
  '--output' => $output2,
  '--cache' => $cache,
  '-sg' => '"TEST/BSP \"Example\"/1"'
);

# The sigle has to be reported, quotes included, as extracted.
stdout_like(
  sub {
    system($call);
  },
  qr!TEST/BSP "Example"\/1 .* extracted!s,
  $call
);

done_testing;
__END__