Added prefix negation to multiple archive support

Change-Id: I0d3eaf9b243aaafcd7ee7fed855bb0bb6cb6b615
diff --git a/Changes b/Changes
index 1cbeada..384590f 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,7 @@
-0.18 2016-06-22
+0.18 2016-06-24
         - Added REI test.
+	- Added multiple archive support to korapxml2krill.
+	- Added support for prefix negation in korapxml2krill.
 
 0.17 2016-03-22
         - Rewrite siglen to use slashes as separators.
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index b1201eb..1dd7798 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -7,7 +7,15 @@
 # Convert new archive helper
 sub new {
   my $class = shift;
-  my @file = @_ or return;
+  my @file;
+
+  foreach (@_) {
+    my $file = _file_to_array($_) or return;
+    push(@file, $file);
+  };
+
+  return unless @file;
+
   bless \@file, $class;
 };
 
@@ -22,7 +30,8 @@
 sub test {
   my $self = shift;
   foreach (@$self) {
-    my $out = `unzip -t $_`;
+    my $x = $_->[0];
+    my $out = `unzip -t $x`;
     if ($out !~ /no errors/i) {
       return 0;
     };
@@ -35,7 +44,7 @@
 sub list_texts {
   my $self = shift;
   my @texts;
-  my $file = $self->[0];
+  my $file = $self->[0]->[0];
   foreach (`unzip -l -UU -qq $file "*/data.xml"`) {
     if (m![\t\s]
       ((?:\./)?
@@ -77,7 +86,7 @@
   };
 
   # Text has not the expected pattern
-  carp $text_path . ' is not a well-formed text path in ' . $self->[0];
+  carp $text_path . ' is not a well-formed text path in ' . $self->[0]->[0];
   return;
 };
 
@@ -87,20 +96,38 @@
 sub path {
   my $self = shift;
   my $archive = shift // 0;
-  return rel2abs($self->[$archive]);
+  return rel2abs($self->[$archive]->[0]);
 };
 
 
+# Attach another archive
 sub attach {
   my $self = shift;
-  if (-e $_[0]) {
-    push @$self, $_[0];
-    return 1;
-  };
-  return 0;
+  my $file = _file_to_array(shift()) or return;
+  push @$self, $file;
+  return 1;
 };
 
 
+# Check attached file for prefix negation
+sub _file_to_array {
+  my $file = shift;
+  my $prefix = 1;
+
+  # A leading '#' marks an archive whose paths lack the prefix folder
+  if (index($file, '#') == 0) {
+    $file = substr($file, 1);
+    $prefix = 0;
+  };
+
+  # The archive is a valid file
+  if (-e $file) {
+    return [$file, $prefix]
+  };
+};
+
+
+
 # Extract files to a directory
 sub extract {
   my $self = shift;
@@ -115,11 +142,14 @@
     '-d', $target_dir # Extract into target directory
   );
 
-  foreach (@$self) {
-    my @cmd = @init_cmd;
-    push(@cmd, $_); # Extract from zip
+  my ($prefix, $corpus, $doc, $text) = $self->split_path($text_path) or return;
 
-    my ($prefix, $corpus, $doc, $text) = $self->split_path($text_path) or return;
+  # Iterate over all attached archives
+  foreach my $archive (@$self) {
+
+    # $archive->[0] is the zip
+    my @cmd = @init_cmd;
+    push(@cmd, $archive->[0]); # Extract from zip
 
     # Add some interesting files for extraction
     # Can't use catfile(), as this removes the '.' prefix
@@ -131,7 +161,12 @@
     };
 
     # With prefix
-    push(@cmd, join('/', $prefix, $corpus, $doc, $text, '*'));
+    my @path = ($corpus, $doc, $text, '*');
+
+    # If the prefix is not forbidden - prefix!
+    unshift @path, $prefix if $archive->[1];
+
+    push(@cmd, join('/', @path));
 
     # Run system call
     system(@cmd);
@@ -162,6 +197,8 @@
 
 =head1 test
 
+=head1 attach
+
 =head1 list_texts
 
 Returns all texts found in the zip file
diff --git a/script/korapxml2krill b/script/korapxml2krill
index f840795..f84439f 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -54,6 +54,10 @@
 #
 # 2016/03/18
 # - Added meta data caching
+#
+# 2016/06/24
+# - Added multi archive support
+# - Added prefix negation support
 # ----------------------------------------------------------
 
 our $LAST_CHANGE = '2016/03/17';
@@ -589,10 +593,22 @@
 
 =over 2
 
-=item B<--input|-i> <directory|file>
+=item B<--input|-i> <directory|file|files>
 
 Directory or archive file of documents to convert.
 
+Multiple input archives are supported for archiving,
+with the constraint
+that the first archive listed contains all primary data files
+and all meta data files.
+The directory structure follows the base directory format,
+starting with a C<.> root folder.
+In case an attached archive has no C<.> root folder,
+the archive path should start with a hash.
+
+  -i file/news.zip -i file/news.malt.zip -i '#file/news.tt.zip'
+
+
 =item B<--output|-o> <directory|file>
 
 Output folder for archive processing or