blob: 3606086a4bbff63c97b9544cb06a8e61f02fb00f [file] [log] [blame]
Akron150b29e2016-02-14 23:06:48 +01001package KorAP::XML::Archive;
2use Carp qw/carp/;
Akron60a8caa2017-02-17 21:51:27 +01003use Mojo::Util qw/quote/;
Akron31a08cb2019-02-20 20:43:26 +01004use List::Util qw/uniq/;
Akron150b29e2016-02-14 23:06:48 +01005use File::Spec::Functions qw(rel2abs);
6use strict;
7use warnings;
8
Akron0c3e3752016-06-28 15:55:53 +02009# Construct new archive helper
Akron150b29e2016-02-14 23:06:48 +010010sub new {
11 my $class = shift;
Akron2cfe8092016-06-24 17:48:49 +020012 my @file;
13
14 foreach (@_) {
15 my $file = _file_to_array($_) or return;
16 push(@file, $file);
17 };
18
19 return unless @file;
20
Akron08385f62016-03-22 20:37:04 +010021 bless \@file, $class;
Akron150b29e2016-02-14 23:06:48 +010022};
23
24
25# Check if unzip is installed
26sub test_unzip {
27 return 1 if grep { -x "$_/unzip"} split /:/, $ENV{PATH};
28 return;
29};
30
Akron0278ca22017-04-11 19:29:05 +020031
Akron150b29e2016-02-14 23:06:48 +010032# Check the compressed archive
33sub test {
34 my $self = shift;
Akron08385f62016-03-22 20:37:04 +010035 foreach (@$self) {
Akron2cfe8092016-06-24 17:48:49 +020036 my $x = $_->[0];
37 my $out = `unzip -t $x`;
Akron08385f62016-03-22 20:37:04 +010038 if ($out !~ /no errors/i) {
39 return 0;
40 };
Akron150b29e2016-02-14 23:06:48 +010041 };
Akron08385f62016-03-22 20:37:04 +010042 return 1;
Akron150b29e2016-02-14 23:06:48 +010043};
44
45
46# List all text paths contained in the file
47sub list_texts {
48 my $self = shift;
Akrone8adfcc2016-03-22 13:18:26 +010049 my @texts;
Akron2cfe8092016-06-24 17:48:49 +020050 my $file = $self->[0]->[0];
Akron08385f62016-03-22 20:37:04 +010051 foreach (`unzip -l -UU -qq $file "*/data.xml"`) {
Akrone8adfcc2016-03-22 13:18:26 +010052 if (m![\t\s]
Akron38873012017-02-06 20:27:37 +010053 ((?:\./)?
54 [^\s\t/\.]+?/ # Corpus
55 [^\/]+?/ # Document
56 [^/]+? # Text
57 )/data\.xml$!x) {
Akrone8adfcc2016-03-22 13:18:26 +010058 push @texts, $1;
Akron150b29e2016-02-14 23:06:48 +010059 };
60 };
Akrone8adfcc2016-03-22 13:18:26 +010061 return @texts;
Akron150b29e2016-02-14 23:06:48 +010062};
63
64
Akron20807582016-10-26 17:11:34 +020065# Check, if the archive has a prefix
66sub check_prefix {
67 my $self = shift;
68 my $nr = shift // 0;
69 my $file = $self->[$nr]->[0];
70 my ($header) = `unzip -l -UU -qq $file "*/header.xml"`;
Akron955b75b2019-02-21 14:28:41 +010071 return ($header && $header =~ m![\s\t]\.[/\\]!) ? 1 : 0;
Akron20807582016-10-26 17:11:34 +020072};
73
74
Akron150b29e2016-02-14 23:06:48 +010075# Split a text path to prefix, corpus, document, text
76sub split_path {
77 my $self = shift;
78 my $text_path = shift;
79
80 unless ($text_path) {
81 carp('No text path given');
82 return 0;
83 };
84
85 # Check for '.' prefix in text
86 my $prefix = '';
87 if ($text_path =~ s!^\./!!) {
88 $prefix = '.';
89 };
90
91 # Unix form
Akron20807582016-10-26 17:11:34 +020092 if ($text_path =~ m!^([^/]+?)/([^/]+?)[\\/]([^/]+?)$!) {
Akron150b29e2016-02-14 23:06:48 +010093 return ($prefix, $1, $2, $3);
94 }
95
96 # Windows form
Akron20807582016-10-26 17:11:34 +020097 elsif ($text_path =~ m!^([^\\]+?)\\([^\\]+?)[\\/]([^\\]+?)$!) {
Akron150b29e2016-02-14 23:06:48 +010098 return ($prefix, $1, $2, $3);
99 };
100
101 # Text has not the expected pattern
Akron2cfe8092016-06-24 17:48:49 +0200102 carp $text_path . ' is not a well-formed text path in ' . $self->[0]->[0];
Akron150b29e2016-02-14 23:06:48 +0100103 return;
104};
105
106
107# Get the archives path
Akron08385f62016-03-22 20:37:04 +0100108# Deprecated
Akron150b29e2016-02-14 23:06:48 +0100109sub path {
Akron08385f62016-03-22 20:37:04 +0100110 my $self = shift;
111 my $archive = shift // 0;
Akron2cfe8092016-06-24 17:48:49 +0200112 return rel2abs($self->[$archive]->[0]);
Akron08385f62016-03-22 20:37:04 +0100113};
114
115
Akron2cfe8092016-06-24 17:48:49 +0200116# Attach another archive
Akron08385f62016-03-22 20:37:04 +0100117sub attach {
118 my $self = shift;
Akron2cfe8092016-06-24 17:48:49 +0200119 my $file = _file_to_array(shift()) or return;
120 push @$self, $file;
121 return 1;
Akron150b29e2016-02-14 23:06:48 +0100122};
123
124
Akron2cfe8092016-06-24 17:48:49 +0200125# Check attached file for prefix negation
126sub _file_to_array {
127 my $file = shift;
128 my $prefix = 1;
129
130 # Should the archive support prefixes
131 if (index($file, '#') == 0) {
132 $file = substr($file, 1);
133 $prefix = 0;
134 };
135
136 # The archive is a valid file
137 if (-e $file) {
138 return [$file, $prefix]
139 };
140};
141
142
Akron81500102017-04-07 20:45:44 +0200143sub extract_all {
144 my $self = shift;
145 my ($target_dir, $jobs) = @_;
146
147 my @init_cmd = (
148 'unzip', # Use unzip program
149 '-qo', # quietly overwrite all existing files
Akron3a486f82017-04-11 21:15:55 +0200150 '-uo',
Akron81500102017-04-07 20:45:44 +0200151 '-d', $target_dir # Extract into target directory
152 );
153
154 # Iterate over all attached archives
155 my @cmds;
156 foreach my $archive (@$self) {
157
158 # $_ is the zip
159 my @cmd = @init_cmd;
160 push(@cmd, $archive->[0]); # Extract from zip
161
162 # Run system call
163 push @cmds, \@cmd;
164 };
165
166 $self->_extract($jobs, @cmds);
167};
168
169
170sub _extract {
171 my ($self, $jobs, @cmds) = @_;
172
173 # Only single call
174 if (!$jobs || $jobs == 1) {
175 foreach (@cmds) {
Akron86db52e2017-04-11 20:36:43 +0200176
Akron81500102017-04-07 20:45:44 +0200177 system(@$_);
178
179 # Check for return code
Akron9ec88872017-04-12 16:29:06 +0200180 my $code = $?;
181
182 print "Extract" .
183 ($code ? " $code" : '') . " " . join(' ', @$_) . "\n";
Akron81500102017-04-07 20:45:44 +0200184 };
185 }
186
187 # Extract annotations in parallel
188 else {
189 my $pool = Parallel::ForkManager->new($jobs);
190 $pool->run_on_finish(
191 sub {
192 my ($pid, $code) = @_;
193 my $data = pop;
194 print "Extract [\$$pid] " .
195 ($code ? " $code" : '') . " $$data\n";
196 }
197 );
198
199 ARCHIVE_LOOP:
200 foreach my $cmd (@cmds) {
201 my $pid = $pool->start and next ARCHIVE_LOOP;
202 system(@$cmd);
Akron86db52e2017-04-11 20:36:43 +0200203 my $code = $?;
Akron81500102017-04-07 20:45:44 +0200204 my $last = $cmd->[4];
Akron86db52e2017-04-11 20:36:43 +0200205 $pool->finish($code, \"$last");
Akron81500102017-04-07 20:45:44 +0200206 };
207 $pool->wait_all_children;
208 };
209
210 # Fine
211 return 1;
212};
213
214
Akron150b29e2016-02-14 23:06:48 +0100215
Akron31a08cb2019-02-20 20:43:26 +0100216# Extract from sigle
217sub extract_sigle {
218 my ($self, $sigle, $target_dir, $jobs) = @_;
219 my @cmds = $self->cmds_from_sigle($sigle);
220
221 @cmds = map {
222 push @{$_}, '-d', $target_dir;
223 $_;
224 } @cmds;
225
226 return $self->_extract($jobs, @cmds);
227};
228
229
230# Create commands for sigle
231sub cmds_from_sigle {
232 my ($self, $sigle) = @_;
233
234 my $first = 1;
235
236 my @init_cmd = (
237 'unzip', # Use unzip program
238 '-qo', # quietly overwrite all existing files
239 '-uo',
240 );
241
242 my @cmds;
243
244 # Iterate over all attached archives
245 for (my $i = 0; $i < @$self; $i++) {
246 my $archive = $self->[$i];
247 my $prefix_check = 0;
248 my $prefix = 0;
249
250 # $_ is the zip
251 my @cmd = @init_cmd;
252 push(@cmd, $archive->[0]); # Extract from zip
253
254 foreach (@$sigle) {
255 my ($corpus,$doc,$text) = split '/', $_;
256
257 # Add some interesting files for extraction
258 # Can't use catfile(), as this removes the '.' prefix
259 my @breadcrumbs = ($corpus);
260
261 # If the prefix is not forbidden - prefix!
262 unless ($prefix_check) {
263 $prefix = $self->check_prefix($i);
264 $prefix_check = 1;
265 };
266
Akron955b75b2019-02-21 14:28:41 +0100267 unshift @breadcrumbs, '.' if $prefix;
Akron31a08cb2019-02-20 20:43:26 +0100268
269 if ($first) {
270
271 # Only extract from first file
272 push(@cmd, join('/', @breadcrumbs, 'header.xml'));
273 push(@cmd, join('/', @breadcrumbs, $doc, 'header.xml'));
274 };
275
276 # With wildcard on doc level
277 if (index($doc, '*') > 0) {
278 push @breadcrumbs, $doc;
279 }
280
281 # For full-defined doc sigle
282 elsif (!$text) {
283 push @breadcrumbs, $doc, '*';
284 }
285
286 # For text sigle
287 else {
288 push @breadcrumbs, $doc, $text, '*';
289 }
290
291 # Add to command
292 push(@cmd, join('/', @breadcrumbs));
293 };
294
295 # Add to commands
296 push @cmds, [uniq @cmd];
297
298 $first = 0;
299 };
300
301 return @cmds;
302};
303
Akron150b29e2016-02-14 23:06:48 +01003041;
305
Akron96165ad2016-02-15 18:09:41 +0100306
Akron150b29e2016-02-14 23:06:48 +0100307__END__
308
309=POD
310
311C<KorAP::XML::Archive> expects the unzip tool to be installed.
312
313
314=head1 new
315
316=head1 test
317
Akron2cfe8092016-06-24 17:48:49 +0200318=head1 attach
319
Akron20807582016-10-26 17:11:34 +0200320=head1 check_prefix
321
Akron150b29e2016-02-14 23:06:48 +0100322=head1 list_texts
323
324Returns all texts found in the zip file
325
Akron20807582016-10-26 17:11:34 +0200326=head1 extract_text
Akron150b29e2016-02-14 23:06:48 +0100327
Akron20807582016-10-26 17:11:34 +0200328 $archive->extract_text('./GOE/AGU/0004', '~/temp');
Akron150b29e2016-02-14 23:06:48 +0100329
330Extract all files for the named text to a certain directory.