First step to multi-archive support
Change-Id: I619b039abe396b8a4c6f8efc4d618077b7a8fcd2
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index e44741d..b1201eb 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -7,8 +7,8 @@
# Convert new archive helper
sub new {
my $class = shift;
- my $file = shift or return;
- bless \$file, $class;
+ my @file = @_ or return;
+ bless \@file, $class;
};
@@ -21,12 +21,13 @@
# Check the compressed archive
sub test {
my $self = shift;
- my $file = $$self;
- my $out = `unzip -t $file`;
- if ($out =~ /no errors/i) {
- return 1;
+ foreach (@$self) {
+ my $out = `unzip -t $_`;
+ if ($out !~ /no errors/i) {
+ return 0;
+ };
};
- return 0;
+ return 1;
};
@@ -34,7 +35,8 @@
sub list_texts {
my $self = shift;
my @texts;
- foreach (`unzip -l -UU -qq $$self "*/data.xml"`) {
+ my $file = $self->[0];
+ foreach (`unzip -l -UU -qq $file "*/data.xml"`) {
if (m![\t\s]
((?:\./)?
[^\t\s/\.]+?/ # Corpus
@@ -75,14 +77,27 @@
};
# Text has not the expected pattern
- carp $text_path . ' is not a well-formed text path in ' . $$self;
+ carp $text_path . ' is not a well-formed text path in ' . $self->[0];
return;
};
# Get the archives path
+# Deprecated
sub path {
- return rel2abs(${$_[0]});
+ my $self = shift;
+ my $archive = shift // 0;
+ return rel2abs($self->[$archive]);
+};
+
+
+sub attach {
+ my $self = shift;
+ if (-e $_[0]) {
+ push @$self, $_[0];
+ return 1;
+ };
+ return 0;
};
@@ -92,29 +107,40 @@
my $text_path = shift;
my $target_dir = shift;
- my @cmd = (
+ my $first = 1;
+
+ my @init_cmd = (
'unzip', # Use unzip program
'-qo', # quietly overwrite all existing files
'-d', $target_dir # Extract into target directory
);
- push(@cmd, $$self); # Extract from zip
+ foreach (@$self) {
+ my @cmd = @init_cmd;
+ push(@cmd, $_); # Extract from zip
- my ($prefix, $corpus, $doc, $text) = $self->split_path($text_path) or return;
+ my ($prefix, $corpus, $doc, $text) = $self->split_path($text_path) or return;
- # Add some interesting files for extraction
- # Can't use catfile(), as this removes the '.' prefix
- push(@cmd, join('/', $prefix, $corpus, 'header.xml'));
- push(@cmd, join('/', $prefix, $corpus, $doc, 'header.xml'));
- push(@cmd, join('/', $prefix, $corpus, $doc, $text, '*'));
+ # Add some interesting files for extraction
+ # Can't use catfile(), as this removes the '.' prefix
+ if ($first) {
+ # Extract the corpus and document headers from the first archive only
+ push(@cmd, join('/', $prefix, $corpus, 'header.xml'));
+ push(@cmd, join('/', $prefix, $corpus, $doc, 'header.xml'));
+ $first = 0;
+ };
- # Run system call
- system(@cmd);
+ # Extract all files of the text directory (path includes the prefix)
+ push(@cmd, join('/', $prefix, $corpus, $doc, $text, '*'));
- # Check for return code
- if ($? != 0) {
- carp("System call '" . join(' ', @cmd) . "' errors " . $?);
- return;
+ # Run system call
+ system(@cmd);
+
+ # Check for return code
+ if ($? != 0) {
+ carp("System call '" . join(' ', @cmd) . "' errors " . $?);
+ return;
+ };
};
# Fine
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 5cdacc4..b9e43ef 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -16,6 +16,7 @@
use KorAP::XML::Tokenizer;
use Parallel::ForkManager;
# TODO: use Parallel::Loops
+# TODO: make output files
# CHANGES:
# ----------------------------------------------------------
@@ -69,12 +70,12 @@
$cmd = shift @ARGV;
};
-my (@skip, @sigle);
+my (@skip, @sigle, @input);
my $text;
# Parse options from the command line
GetOptions(
- 'input|i=s' => \(my $input),
+ 'input|i=s' => \@input,
'output|o=s' => \(my $output),
'overwrite|w' => \(my $overwrite),
'meta|m=s' => \(my $meta),
@@ -114,7 +115,7 @@
);
# Input has to be defined
-pod2usage(%ERROR_HASH) unless $input;
+pod2usage(%ERROR_HASH) unless @input;
# Initialize log4perl object
@@ -130,8 +131,9 @@
# Get file name based on path information
sub get_file_name ($) {
+ my $i = $input[0];
my $file = shift;
- $file =~ s/^?\/?$input//;
+ $file =~ s/^?\/?$i//;
$file =~ tr/\//-/;
$file =~ s{^-+}{};
return $file;
@@ -145,8 +147,9 @@
# TODO: This should be done directly with a data structure! KorAP::XML::Wrap
- my $call = 'perl ' . $LOCAL . '/korapxml2krill -i ' .
- $anno . ' -o ' . $output . '/' . $file . '.json';
+ my $call = 'perl ' . $LOCAL . '/korapxml2krill';
+ $call .= ' -i ' . $anno;
+ $call .= ' -o ' . $output . '/' . $file . '.json';
$call .= '.gz -z' if $gzip;
$call .= ' -m ' . $meta if $meta;
$call .= ' -w' if $overwrite;
@@ -170,6 +173,7 @@
# Process a single file
unless ($cmd) {
+ my $input = $input[0];
# Can't print gzip to STDOUT
pod2usage(%ERROR_HASH) if $gzip && !$output;
@@ -329,6 +333,8 @@
# Extract XML files
elsif ($cmd eq 'extract') {
+ my $input = $input[0];
+
pod2usage(%ERROR_HASH) unless $output;
# TODO: Support sigles and full archives
@@ -338,6 +344,8 @@
exit(0);
};
+# TODO: Support multiple input archives here (only $input[0] is used so far)
+
if (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
unless ($archive->test_unzip) {
@@ -380,7 +388,7 @@
sub {
my ($pid, $code) = shift;
my $data = pop;
- print 'Convert ['. ($jobs > 0 ? "$pid:" : '') .
+ print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
($iter++) . "/$count]" .
($code ? " $code" : '') .
" $$data\n";
@@ -400,8 +408,8 @@
};
# Input is a directory
- if (-d $input) {
- my $it = Directory::Iterator->new($input);
+ if (-d $input[0]) {
+ my $it = Directory::Iterator->new($input[0]);
my @dirs;
my $dir;
@@ -443,12 +451,15 @@
}
# Input is a file
- elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
+ elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new(shift @input))) {
unless ($archive->test_unzip) {
print "Unzip is not installed or incompatible.\n\n";
exit(1);
};
+ # Add further annotation archives
+ # $archive->attach($_) foreach @input;
+
print "Start processing ...\n";
$t = Benchmark->new;
my @dirs = $archive->list_texts;
@@ -485,7 +496,7 @@
if ($archive->extract($dirs[$i], $temp)) {
# Create corpus directory
- $input = catdir("$temp", $corpus);
+ my $input = catdir("$temp", $corpus);
# Temporary directory
my $dir = catdir($input, $doc, $text);
diff --git a/t/archive.t b/t/archive.t
index 22ad61f..f31cb0c 100644
--- a/t/archive.t
+++ b/t/archive.t
@@ -16,7 +16,7 @@
};
ok($archive->test, 'Test archive');
-like($archive->path, qr/archive\.zip$/, 'Archive path');
+like($archive->path(0), qr/archive\.zip$/, 'Archive path');
my @list = $archive->list_texts;
is(scalar @list, 10, 'Found all tests');
@@ -42,6 +42,8 @@
ok(-f catdir($dir, 'TEST', 'BSP', 'header.xml'), 'Test doc header exists');
+
+
done_testing;
__END__