| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 1 | package KorAP::XML::Archive; |
| 2 | use Carp qw/carp/; |
| Akron | 60a8caa | 2017-02-17 21:51:27 +0100 | [diff] [blame] | 3 | use Mojo::Util qw/quote/; |
| Akron | 31a08cb | 2019-02-20 20:43:26 +0100 | [diff] [blame] | 4 | use List::Util qw/uniq/; |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 5 | use File::Spec::Functions qw(rel2abs); |
| 6 | use strict; |
| 7 | use warnings; |
| 8 | |
| Akron | 0c3e375 | 2016-06-28 15:55:53 +0200 | [diff] [blame] | 9 | # Construct new archive helper |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 10 | sub new { |
| 11 | my $class = shift; |
| Akron | 2cfe809 | 2016-06-24 17:48:49 +0200 | [diff] [blame] | 12 | my @file; |
| 13 | |
| 14 | foreach (@_) { |
| 15 | my $file = _file_to_array($_) or return; |
| 16 | push(@file, $file); |
| 17 | }; |
| 18 | |
| 19 | return unless @file; |
| 20 | |
| Akron | 08385f6 | 2016-03-22 20:37:04 +0100 | [diff] [blame] | 21 | bless \@file, $class; |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 22 | }; |
| 23 | |
| 24 | |
| 25 | # Check if unzip is installed |
| 26 | sub test_unzip { |
| 27 | return 1 if grep { -x "$_/unzip"} split /:/, $ENV{PATH}; |
| 28 | return; |
| 29 | }; |
| 30 | |
| Akron | 0278ca2 | 2017-04-11 19:29:05 +0200 | [diff] [blame] | 31 | |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 32 | # Check the compressed archive |
| 33 | sub test { |
| 34 | my $self = shift; |
| Akron | 08385f6 | 2016-03-22 20:37:04 +0100 | [diff] [blame] | 35 | foreach (@$self) { |
| Akron | 2cfe809 | 2016-06-24 17:48:49 +0200 | [diff] [blame] | 36 | my $x = $_->[0]; |
| 37 | my $out = `unzip -t $x`; |
| Akron | 08385f6 | 2016-03-22 20:37:04 +0100 | [diff] [blame] | 38 | if ($out !~ /no errors/i) { |
| 39 | return 0; |
| 40 | }; |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 41 | }; |
| Akron | 08385f6 | 2016-03-22 20:37:04 +0100 | [diff] [blame] | 42 | return 1; |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 43 | }; |
| 44 | |
| 45 | |
| 46 | # List all text paths contained in the file |
| 47 | sub list_texts { |
| 48 | my $self = shift; |
| Akron | e8adfcc | 2016-03-22 13:18:26 +0100 | [diff] [blame] | 49 | my @texts; |
| Akron | 2cfe809 | 2016-06-24 17:48:49 +0200 | [diff] [blame] | 50 | my $file = $self->[0]->[0]; |
| Akron | 08385f6 | 2016-03-22 20:37:04 +0100 | [diff] [blame] | 51 | foreach (`unzip -l -UU -qq $file "*/data.xml"`) { |
| Akron | e8adfcc | 2016-03-22 13:18:26 +0100 | [diff] [blame] | 52 | if (m![\t\s] |
| Akron | 3887301 | 2017-02-06 20:27:37 +0100 | [diff] [blame] | 53 | ((?:\./)? |
| 54 | [^\s\t/\.]+?/ # Corpus |
| 55 | [^\/]+?/ # Document |
| 56 | [^/]+? # Text |
| 57 | )/data\.xml$!x) { |
| Akron | e8adfcc | 2016-03-22 13:18:26 +0100 | [diff] [blame] | 58 | push @texts, $1; |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 59 | }; |
| 60 | }; |
| Akron | e8adfcc | 2016-03-22 13:18:26 +0100 | [diff] [blame] | 61 | return @texts; |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 62 | }; |
| 63 | |
| 64 | |
| Akron | 2080758 | 2016-10-26 17:11:34 +0200 | [diff] [blame] | 65 | # Check, if the archive has a prefix |
| 66 | sub check_prefix { |
| 67 | my $self = shift; |
| 68 | my $nr = shift // 0; |
| 69 | my $file = $self->[$nr]->[0]; |
| 70 | my ($header) = `unzip -l -UU -qq $file "*/header.xml"`; |
| Akron | 955b75b | 2019-02-21 14:28:41 +0100 | [diff] [blame] | 71 | return ($header && $header =~ m![\s\t]\.[/\\]!) ? 1 : 0; |
| Akron | 2080758 | 2016-10-26 17:11:34 +0200 | [diff] [blame] | 72 | }; |
| 73 | |
| 74 | |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 75 | # Split a text path to prefix, corpus, document, text |
| 76 | sub split_path { |
| 77 | my $self = shift; |
| 78 | my $text_path = shift; |
| 79 | |
| 80 | unless ($text_path) { |
| 81 | carp('No text path given'); |
| 82 | return 0; |
| 83 | }; |
| 84 | |
| 85 | # Check for '.' prefix in text |
| 86 | my $prefix = ''; |
| 87 | if ($text_path =~ s!^\./!!) { |
| 88 | $prefix = '.'; |
| 89 | }; |
| 90 | |
| 91 | # Unix form |
| Akron | 2080758 | 2016-10-26 17:11:34 +0200 | [diff] [blame] | 92 | if ($text_path =~ m!^([^/]+?)/([^/]+?)[\\/]([^/]+?)$!) { |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 93 | return ($prefix, $1, $2, $3); |
| 94 | } |
| 95 | |
| 96 | # Windows form |
| Akron | 2080758 | 2016-10-26 17:11:34 +0200 | [diff] [blame] | 97 | elsif ($text_path =~ m!^([^\\]+?)\\([^\\]+?)[\\/]([^\\]+?)$!) { |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 98 | return ($prefix, $1, $2, $3); |
| 99 | }; |
| 100 | |
| 101 | # Text has not the expected pattern |
| Akron | 2cfe809 | 2016-06-24 17:48:49 +0200 | [diff] [blame] | 102 | carp $text_path . ' is not a well-formed text path in ' . $self->[0]->[0]; |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 103 | return; |
| 104 | }; |
| 105 | |
| 106 | |
| 107 | # Get the archives path |
| Akron | 08385f6 | 2016-03-22 20:37:04 +0100 | [diff] [blame] | 108 | # Deprecated |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 109 | sub path { |
| Akron | 08385f6 | 2016-03-22 20:37:04 +0100 | [diff] [blame] | 110 | my $self = shift; |
| 111 | my $archive = shift // 0; |
| Akron | 2cfe809 | 2016-06-24 17:48:49 +0200 | [diff] [blame] | 112 | return rel2abs($self->[$archive]->[0]); |
| Akron | 08385f6 | 2016-03-22 20:37:04 +0100 | [diff] [blame] | 113 | }; |
| 114 | |
| 115 | |
| Akron | 2cfe809 | 2016-06-24 17:48:49 +0200 | [diff] [blame] | 116 | # Attach another archive |
| Akron | 08385f6 | 2016-03-22 20:37:04 +0100 | [diff] [blame] | 117 | sub attach { |
| 118 | my $self = shift; |
| Akron | 2cfe809 | 2016-06-24 17:48:49 +0200 | [diff] [blame] | 119 | my $file = _file_to_array(shift()) or return; |
| 120 | push @$self, $file; |
| 121 | return 1; |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 122 | }; |
| 123 | |
| 124 | |
| Akron | 2cfe809 | 2016-06-24 17:48:49 +0200 | [diff] [blame] | 125 | # Check attached file for prefix negation |
| 126 | sub _file_to_array { |
| 127 | my $file = shift; |
| 128 | my $prefix = 1; |
| 129 | |
| 130 | # Should the archive support prefixes |
| 131 | if (index($file, '#') == 0) { |
| 132 | $file = substr($file, 1); |
| 133 | $prefix = 0; |
| 134 | }; |
| 135 | |
| 136 | # The archive is a valid file |
| 137 | if (-e $file) { |
| 138 | return [$file, $prefix] |
| 139 | }; |
| 140 | }; |
| 141 | |
| 142 | |
| Akron | 8150010 | 2017-04-07 20:45:44 +0200 | [diff] [blame] | 143 | sub extract_all { |
| 144 | my $self = shift; |
| 145 | my ($target_dir, $jobs) = @_; |
| 146 | |
| 147 | my @init_cmd = ( |
| 148 | 'unzip', # Use unzip program |
| 149 | '-qo', # quietly overwrite all existing files |
| Akron | 3a486f8 | 2017-04-11 21:15:55 +0200 | [diff] [blame] | 150 | '-uo', |
| Akron | 8150010 | 2017-04-07 20:45:44 +0200 | [diff] [blame] | 151 | '-d', $target_dir # Extract into target directory |
| 152 | ); |
| 153 | |
| 154 | # Iterate over all attached archives |
| 155 | my @cmds; |
| 156 | foreach my $archive (@$self) { |
| 157 | |
| 158 | # $_ is the zip |
| 159 | my @cmd = @init_cmd; |
| 160 | push(@cmd, $archive->[0]); # Extract from zip |
| 161 | |
| 162 | # Run system call |
| 163 | push @cmds, \@cmd; |
| 164 | }; |
| 165 | |
| 166 | $self->_extract($jobs, @cmds); |
| 167 | }; |
| 168 | |
| 169 | |
| 170 | sub _extract { |
| 171 | my ($self, $jobs, @cmds) = @_; |
| 172 | |
| 173 | # Only single call |
| 174 | if (!$jobs || $jobs == 1) { |
| 175 | foreach (@cmds) { |
| Akron | 86db52e | 2017-04-11 20:36:43 +0200 | [diff] [blame] | 176 | |
| Akron | 8150010 | 2017-04-07 20:45:44 +0200 | [diff] [blame] | 177 | system(@$_); |
| 178 | |
| 179 | # Check for return code |
| Akron | 9ec8887 | 2017-04-12 16:29:06 +0200 | [diff] [blame] | 180 | my $code = $?; |
| 181 | |
| 182 | print "Extract" . |
| 183 | ($code ? " $code" : '') . " " . join(' ', @$_) . "\n"; |
| Akron | 8150010 | 2017-04-07 20:45:44 +0200 | [diff] [blame] | 184 | }; |
| 185 | } |
| 186 | |
| 187 | # Extract annotations in parallel |
| 188 | else { |
| 189 | my $pool = Parallel::ForkManager->new($jobs); |
| 190 | $pool->run_on_finish( |
| 191 | sub { |
| 192 | my ($pid, $code) = @_; |
| 193 | my $data = pop; |
| 194 | print "Extract [\$$pid] " . |
| 195 | ($code ? " $code" : '') . " $$data\n"; |
| 196 | } |
| 197 | ); |
| 198 | |
| 199 | ARCHIVE_LOOP: |
| 200 | foreach my $cmd (@cmds) { |
| 201 | my $pid = $pool->start and next ARCHIVE_LOOP; |
| 202 | system(@$cmd); |
| Akron | 86db52e | 2017-04-11 20:36:43 +0200 | [diff] [blame] | 203 | my $code = $?; |
| Akron | 8150010 | 2017-04-07 20:45:44 +0200 | [diff] [blame] | 204 | my $last = $cmd->[4]; |
| Akron | 86db52e | 2017-04-11 20:36:43 +0200 | [diff] [blame] | 205 | $pool->finish($code, \"$last"); |
| Akron | 8150010 | 2017-04-07 20:45:44 +0200 | [diff] [blame] | 206 | }; |
| 207 | $pool->wait_all_children; |
| 208 | }; |
| 209 | |
| 210 | # Fine |
| 211 | return 1; |
| 212 | }; |
| 213 | |
| 214 | |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 215 | |
| Akron | 31a08cb | 2019-02-20 20:43:26 +0100 | [diff] [blame] | 216 | # Extract from sigle |
| 217 | sub extract_sigle { |
| 218 | my ($self, $sigle, $target_dir, $jobs) = @_; |
| 219 | my @cmds = $self->cmds_from_sigle($sigle); |
| 220 | |
| 221 | @cmds = map { |
| 222 | push @{$_}, '-d', $target_dir; |
| 223 | $_; |
| 224 | } @cmds; |
| 225 | |
| 226 | return $self->_extract($jobs, @cmds); |
| 227 | }; |
| 228 | |
| 229 | |
| 230 | # Create commands for sigle |
| 231 | sub cmds_from_sigle { |
| 232 | my ($self, $sigle) = @_; |
| 233 | |
| 234 | my $first = 1; |
| 235 | |
| 236 | my @init_cmd = ( |
| 237 | 'unzip', # Use unzip program |
| 238 | '-qo', # quietly overwrite all existing files |
| 239 | '-uo', |
| 240 | ); |
| 241 | |
| 242 | my @cmds; |
| 243 | |
| 244 | # Iterate over all attached archives |
| 245 | for (my $i = 0; $i < @$self; $i++) { |
| 246 | my $archive = $self->[$i]; |
| 247 | my $prefix_check = 0; |
| 248 | my $prefix = 0; |
| 249 | |
| 250 | # $_ is the zip |
| 251 | my @cmd = @init_cmd; |
| 252 | push(@cmd, $archive->[0]); # Extract from zip |
| 253 | |
| 254 | foreach (@$sigle) { |
| 255 | my ($corpus,$doc,$text) = split '/', $_; |
| 256 | |
| 257 | # Add some interesting files for extraction |
| 258 | # Can't use catfile(), as this removes the '.' prefix |
| 259 | my @breadcrumbs = ($corpus); |
| 260 | |
| 261 | # If the prefix is not forbidden - prefix! |
| 262 | unless ($prefix_check) { |
| 263 | $prefix = $self->check_prefix($i); |
| 264 | $prefix_check = 1; |
| 265 | }; |
| 266 | |
| Akron | 955b75b | 2019-02-21 14:28:41 +0100 | [diff] [blame] | 267 | unshift @breadcrumbs, '.' if $prefix; |
| Akron | 31a08cb | 2019-02-20 20:43:26 +0100 | [diff] [blame] | 268 | |
| 269 | if ($first) { |
| 270 | |
| 271 | # Only extract from first file |
| 272 | push(@cmd, join('/', @breadcrumbs, 'header.xml')); |
| 273 | push(@cmd, join('/', @breadcrumbs, $doc, 'header.xml')); |
| 274 | }; |
| 275 | |
| 276 | # With wildcard on doc level |
| 277 | if (index($doc, '*') > 0) { |
| 278 | push @breadcrumbs, $doc; |
| 279 | } |
| 280 | |
| 281 | # For full-defined doc sigle |
| 282 | elsif (!$text) { |
| 283 | push @breadcrumbs, $doc, '*'; |
| 284 | } |
| 285 | |
| 286 | # For text sigle |
| 287 | else { |
| 288 | push @breadcrumbs, $doc, $text, '*'; |
| 289 | } |
| 290 | |
| 291 | # Add to command |
| 292 | push(@cmd, join('/', @breadcrumbs)); |
| 293 | }; |
| 294 | |
| 295 | # Add to commands |
| 296 | push @cmds, [uniq @cmd]; |
| 297 | |
| 298 | $first = 0; |
| 299 | }; |
| 300 | |
| 301 | return @cmds; |
| 302 | }; |
| 303 | |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 304 | 1; |
| 305 | |
| Akron | 96165ad | 2016-02-15 18:09:41 +0100 | [diff] [blame] | 306 | |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 307 | __END__ |
| 308 | |
| 309 | =POD |
| 310 | |
| 311 | C<KorAP::XML::Archive> expects the unzip tool to be installed. |
| 312 | |
| 313 | |
| 314 | =head1 new |
| 315 | |
| 316 | =head1 test |
| 317 | |
| Akron | 2cfe809 | 2016-06-24 17:48:49 +0200 | [diff] [blame] | 318 | =head1 attach |
| 319 | |
| Akron | 2080758 | 2016-10-26 17:11:34 +0200 | [diff] [blame] | 320 | =head1 check_prefix |
| 321 | |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 322 | =head1 list_texts |
| 323 | |
| 324 | Returns all texts found in the zip file |
| 325 | |
| Akron | 2080758 | 2016-10-26 17:11:34 +0200 | [diff] [blame] | 326 | =head1 extract_text |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 327 | |
| Akron | 2080758 | 2016-10-26 17:11:34 +0200 | [diff] [blame] | 328 | $archive->extract_text('./GOE/AGU/0004', '~/temp'); |
| Akron | 150b29e | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 329 | |
| 330 | Extract all files for the named text to a certain directory. |