Added prefix negation to multiple archive support
Change-Id: I0d3eaf9b243aaafcd7ee7fed855bb0bb6cb6b615
diff --git a/Changes b/Changes
index 1cbeada..384590f 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,7 @@
-0.18 2016-06-22
+0.18 2016-06-24
- Added REI test.
+ - Added multiple archive support to korapxml2krill.
+ - Added support for prefix negation in korapxml2krill.
0.17 2016-03-22
- Rewrite siglen to use slashes as separators.
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index b1201eb..1dd7798 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -7,7 +7,15 @@
# Convert new archive helper
sub new {
my $class = shift;
- my @file = @_ or return;
+ my @file;
+
+ foreach (@_) {
+ my $file = _file_to_array($_) or return;
+ push(@file, $file);
+ };
+
+ return unless @file;
+
bless \@file, $class;
};
@@ -22,7 +30,8 @@
sub test {
my $self = shift;
foreach (@$self) {
- my $out = `unzip -t $_`;
+ my $x = $_->[0];
+ my $out = `unzip -t $x`;
if ($out !~ /no errors/i) {
return 0;
};
@@ -35,7 +44,7 @@
sub list_texts {
my $self = shift;
my @texts;
- my $file = $self->[0];
+ my $file = $self->[0]->[0];
foreach (`unzip -l -UU -qq $file "*/data.xml"`) {
if (m![\t\s]
((?:\./)?
@@ -77,7 +86,7 @@
};
# Text has not the expected pattern
- carp $text_path . ' is not a well-formed text path in ' . $self->[0];
+ carp $text_path . ' is not a well-formed text path in ' . $self->[0]->[0];
return;
};
@@ -87,20 +96,38 @@
sub path {
my $self = shift;
my $archive = shift // 0;
- return rel2abs($self->[$archive]);
+ return rel2abs($self->[$archive]->[0]);
};
+# Attach another archive
sub attach {
my $self = shift;
- if (-e $_[0]) {
- push @$self, $_[0];
- return 1;
- };
- return 0;
+ my $file = _file_to_array(shift()) or return;
+ push @$self, $file;
+ return 1;
};
+# Check attached file for prefix negation
+sub _file_to_array {
+ my $file = shift;
+ my $prefix = 1;
+
+ # A leading '#' negates the prefix for this archive
+ if (index($file, '#') == 0) {
+ $file = substr($file, 1);
+ $prefix = 0;
+ };
+
+ # The archive is a valid file
+ if (-e $file) {
+ return [$file, $prefix]
+ };
+};
+
+
+
# Extract files to a directory
sub extract {
my $self = shift;
@@ -115,11 +142,14 @@
'-d', $target_dir # Extract into target directory
);
- foreach (@$self) {
- my @cmd = @init_cmd;
- push(@cmd, $_); # Extract from zip
+ my ($prefix, $corpus, $doc, $text) = $self->split_path($text_path) or return;
- my ($prefix, $corpus, $doc, $text) = $self->split_path($text_path) or return;
+ # Iterate over all attached archives
+ foreach my $archive (@$self) {
+
+ # $archive->[0] is the zip
+ my @cmd = @init_cmd;
+ push(@cmd, $archive->[0]); # Extract from zip
# Add some interesting files for extraction
# Can't use catfile(), as this removes the '.' prefix
@@ -131,7 +161,12 @@
};
# With prefix
- push(@cmd, join('/', $prefix, $corpus, $doc, $text, '*'));
+ my @path = ($corpus, $doc, $text, '*');
+
+ # If the prefix is not forbidden - prefix!
+ unshift @path, $prefix if $archive->[1];
+
+ push(@cmd, join('/', @path));
# Run system call
system(@cmd);
@@ -162,6 +197,8 @@
=head1 test
+=head1 attach
+
=head1 list_texts
Returns all texts found in the zip file
diff --git a/script/korapxml2krill b/script/korapxml2krill
index f840795..f84439f 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -54,6 +54,10 @@
#
# 2016/03/18
# - Added meta data caching
+#
+# 2016/06/24
+# - Added multi archive support
+# - Added prefix negation support
# ----------------------------------------------------------
our $LAST_CHANGE = '2016/03/17';
@@ -589,10 +593,22 @@
=over 2
-=item B<--input|-i> <directory|file>
+=item B<--input|-i> <directory|file|files>
Directory or archive file of documents to convert.
+Multiple input archives are supported for archive processing,
+with the constraint
+that the first archive listed contains all primary data files
+and all meta data files.
+The directory structure follows the base directory format,
+starting with a C<.> root folder.
+In case an attached archive has no C<.> root folder,
+the archive path should start with a hash (quoted in the shell).
+
+ -i file/news.zip -i file/news.malt.zip -i '#file/news.tt.zip'
+
+
=item B<--output|-o> <directory|file>
Output folder for archive processing or