blob: 3090155e5fe40184529f6a83941f876802656f62 [file] [log] [blame]
package KorAP::XML::Archive;
use Carp qw/carp/;
use File::Spec::Functions qw(rel2abs);
use strict;
use warnings;
# Convert new archive helper
sub new {
my $class = shift;
my $file = shift or return;
bless \$file, $class;
};
# Check if unzip is installed
sub test_unzip {
return 1 if grep { -x "$_/unzip"} split /:/, $ENV{PATH};
return;
};
# Check the compressed archive
sub test {
my $self = shift;
my $file = $$self;
my $out = `unzip -t $file`;
if ($out =~ /no errors/i) {
return 1;
};
return 0;
};
# List all text paths contained in the file
sub list_texts {
my $self = shift;
my $file = $$self;
my %texts;
foreach (`unzip -l $file *.xml`) {
if ($_ =~ m![\t\s]
((?:\./)?
[^\t\s/\.]+?/ # Corpus
[^\t\s/]+?/ # Document
[^\t\s/]+? # Text
)/(?:[^/]+?)\.xml$!x) {
$texts{$1} = 1;
};
};
return sort {$a cmp $b} keys %texts;
};
# Split a text path to prefix, corpus, document, text
sub split_path {
my $self = shift;
my $text_path = shift;
unless ($text_path) {
carp('No text path given');
return 0;
};
# Check for '.' prefix in text
my $prefix = '';
if ($text_path =~ s!^\./!!) {
$prefix = '.';
};
# Unix form
if ($text_path =~ m!^([^/]+?)/([^/]+?)/([^/]+?)$!) {
return ($prefix, $1, $2, $3);
}
# Windows form
elsif ($text_path =~ m!^([^\\]+?)\\([^\\]+?)\\([^\\]+?)$!) {
return ($prefix, $1, $2, $3);
};
# Text has not the expected pattern
carp $text_path . ' is not a well-formed text path in ' . $$self;
return;
};
# Get the archives path
sub path {
return rel2abs(${$_[0]});
};
# Extract files to a directory
sub extract {
my $self = shift;
my $text_path = shift;
my $target_dir = shift;
my @cmd = (
'unzip', # Use unzip program
'-qo', # quietly overwrite all existing files
'-d', $target_dir # Extract into target directory
);
push(@cmd, $$self); # Extract from zip
my ($prefix, $corpus, $doc, $text) = $self->split_path($text_path) or return;
# Add some interesting files for extraction
# Can't use catfile(), as this removes the '.' prefix
push(@cmd, join('/', $prefix, $corpus, 'header.xml'));
push(@cmd, join('/', $prefix, $corpus, $doc, 'header.xml'));
push(@cmd, join('/', $prefix, $corpus, $doc, $text, '*'));
# Run system call
system(@cmd);
# Check for return code
if ($? != 0) {
carp("System call '" . join(' ', @cmd) . "' errors " . $?);
return;
};
# Fine
return 1;
};
1;
__END__
=POD
C<KorAP::XML::Archive> expects the unzip tool to be installed.
=head1 new
=head1 test
=head1 list_texts
Returns all texts found in the zip file
=head1 extract
$archive->extract('./GOE/AGU/0004', '~/temp');
Extract all files for the named text to a certain directory.