First step towards multi-archive support

Change-Id: I619b039abe396b8a4c6f8efc4d618077b7a8fcd2
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index e44741d..b1201eb 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -7,8 +7,8 @@
 # Convert new archive helper
 sub new {
   my $class = shift;
-  my $file = shift or return;
-  bless \$file, $class;
+  my @file = @_ or return;
+  bless \@file, $class;
 };
 
 
@@ -21,12 +21,13 @@
 # Check the compressed archive
 sub test {
   my $self = shift;
-  my $file = $$self;
-  my $out = `unzip -t $file`;
-  if ($out =~ /no errors/i) {
-    return 1;
+  foreach (@$self) {
+    my $out = `unzip -t $_`;
+    if ($out !~ /no errors/i) {
+      return 0;
+    };
   };
-  return 0;
+  return 1;
 };
 
 
@@ -34,7 +35,8 @@
 sub list_texts {
   my $self = shift;
   my @texts;
-  foreach (`unzip -l -UU -qq $$self "*/data.xml"`) {
+  my $file = $self->[0];
+  foreach (`unzip -l -UU -qq $file "*/data.xml"`) {
     if (m![\t\s]
       ((?:\./)?
 	[^\t\s/\.]+?/ # Corpus
@@ -75,14 +77,27 @@
   };
 
   # Text has not the expected pattern
-  carp $text_path . ' is not a well-formed text path in ' . $$self;
+  carp $text_path . ' is not a well-formed text path in ' . $self->[0];
   return;
 };
 
 
 # Get the archives path
+# Deprecated
 sub path {
-  return rel2abs(${$_[0]});
+  my $self = shift;
+  my $archive = shift // 0;
+  return rel2abs($self->[$archive]);
+};
+
+
+sub attach {
+  my $self = shift;
+  if (-e $_[0]) {
+    push @$self, $_[0];
+    return 1;
+  };
+  return 0;
 };
 
 
@@ -92,29 +107,40 @@
   my $text_path = shift;
   my $target_dir = shift;
 
-  my @cmd = (
+  my $first = 1;
+
+  my @init_cmd = (
     'unzip',           # Use unzip program
     '-qo',             # quietly overwrite all existing files
     '-d', $target_dir # Extract into target directory
   );
 
-  push(@cmd, $$self); # Extract from zip
+  foreach (@$self) {
+    my @cmd = @init_cmd;
+    push(@cmd, $_); # Extract from zip
 
-  my ($prefix, $corpus, $doc, $text) = $self->split_path($text_path) or return;
+    my ($prefix, $corpus, $doc, $text) = $self->split_path($text_path) or return;
 
-  # Add some interesting files for extraction
-  # Can't use catfile(), as this removes the '.' prefix
-  push(@cmd, join('/', $prefix, $corpus, 'header.xml'));
-  push(@cmd, join('/', $prefix, $corpus, $doc, 'header.xml'));
-  push(@cmd, join('/', $prefix, $corpus, $doc, $text, '*'));
+    # Add some interesting files for extraction
+    # Can't use catfile(), as this removes the '.' prefix
+    if ($first) {
+      # Only extract the corpus and document headers from the first archive
+      push(@cmd, join('/', $prefix, $corpus, 'header.xml'));
+      push(@cmd, join('/', $prefix, $corpus, $doc, 'header.xml'));
+      $first = 0;
+    };
 
-  # Run system call
-  system(@cmd);
+    # Extract all files of the text from every archive (path keeps the prefix)
+    push(@cmd, join('/', $prefix, $corpus, $doc, $text, '*'));
 
-  # Check for return code
-  if ($? != 0) {
-    carp("System call '" . join(' ', @cmd) . "' errors " . $?);
-    return;
+    # Run system call
+    system(@cmd);
+
+    # Check for return code
+    if ($? != 0) {
+      carp("System call '" . join(' ', @cmd) . "' errors " . $?);
+      return;
+    };
   };
 
   # Fine
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 5cdacc4..b9e43ef 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -16,6 +16,7 @@
 use KorAP::XML::Tokenizer;
 use Parallel::ForkManager;
 # TODO: use Parallel::Loops
+# TODO: make output files
 
 # CHANGES:
 # ----------------------------------------------------------
@@ -69,12 +70,12 @@
   $cmd = shift @ARGV;
 };
 
-my (@skip, @sigle);
+my (@skip, @sigle, @input);
 my $text;
 
 # Parse options from the command line
 GetOptions(
-  'input|i=s'   => \(my $input),
+  'input|i=s'   => \@input,
   'output|o=s'  => \(my $output),
   'overwrite|w' => \(my $overwrite),
   'meta|m=s'    => \(my $meta),
@@ -114,7 +115,7 @@
 );
 
 # Input has to be defined
-pod2usage(%ERROR_HASH) unless $input;
+pod2usage(%ERROR_HASH) unless @input;
 
 
 # Initialize log4perl object
@@ -130,8 +131,9 @@
 
 # Get file name based on path information
 sub get_file_name ($) {
+  my $i = $input[0];
   my $file = shift;
-  $file =~ s/^?\/?$input//;
+  $file =~ s/^?\/?$i//;
   $file =~ tr/\//-/;
   $file =~ s{^-+}{};
   return $file;
@@ -145,8 +147,9 @@
 
   # TODO: This should be done directly with a data structure! KorAP::XML::Wrap
 
-  my $call = 'perl ' . $LOCAL . '/korapxml2krill -i ' .
-    $anno . ' -o ' . $output . '/' . $file . '.json';
+  my $call = 'perl ' . $LOCAL . '/korapxml2krill';
+  $call .= ' -i ' . $anno;
+  $call .= ' -o ' . $output . '/' . $file . '.json';
   $call .= '.gz -z' if $gzip;
   $call .= ' -m ' . $meta if $meta;
   $call .= ' -w' if $overwrite;
@@ -170,6 +173,7 @@
 
 # Process a single file
 unless ($cmd) {
+  my $input = $input[0];
 
   # Can't print gzip to STDOUT
   pod2usage(%ERROR_HASH) if $gzip && !$output;
@@ -329,6 +333,8 @@
 # Extract XML files
 elsif ($cmd eq 'extract') {
 
+  my $input = $input[0];
+
   pod2usage(%ERROR_HASH) unless $output;
 
   # TODO: Support sigles and full archives
@@ -338,6 +344,8 @@
     exit(0);
   };
 
+# TODO: Only the first input archive ($input[0]) is used for extraction
+
   if (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
 
     unless ($archive->test_unzip) {
@@ -380,7 +388,7 @@
     sub {
       my ($pid, $code) = shift;
       my $data = pop;
-      print 'Convert ['. ($jobs > 0 ? "$pid:" : '') .
+      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
 	($iter++) . "/$count]" .
 	  ($code ? " $code" : '') .
 	    " $$data\n";
@@ -400,8 +408,8 @@
   };
 
   # Input is a directory
-  if (-d $input) {
-    my $it = Directory::Iterator->new($input);
+  if (-d $input[0]) {
+    my $it = Directory::Iterator->new($input[0]);
     my @dirs;
     my $dir;
 
@@ -443,12 +451,15 @@
   }
 
   # Input is a file
-  elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
+  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new(shift @input))) {
     unless ($archive->test_unzip) {
       print "Unzip is not installed or incompatible.\n\n";
       exit(1);
     };
 
+    # Add further annotation archives
+    # $archive->attach($_) foreach @input;
+
     print "Start processing ...\n";
     $t = Benchmark->new;
     my @dirs = $archive->list_texts;
@@ -485,7 +496,7 @@
       if ($archive->extract($dirs[$i], $temp)) {
 
 	# Create corpus directory
-	$input = catdir("$temp", $corpus);
+	my $input = catdir("$temp", $corpus);
 
 	# Temporary directory
 	my $dir = catdir($input, $doc, $text);
diff --git a/t/archive.t b/t/archive.t
index 22ad61f..f31cb0c 100644
--- a/t/archive.t
+++ b/t/archive.t
@@ -16,7 +16,7 @@
 };
 
 ok($archive->test, 'Test archive');
-like($archive->path, qr/archive\.zip$/, 'Archive path');
+like($archive->path(0), qr/archive\.zip$/, 'Archive path');
 
 my @list = $archive->list_texts;
 is(scalar @list, 10, 'Found all tests');
@@ -42,6 +42,8 @@
 ok(-f catdir($dir, 'TEST', 'BSP', 'header.xml'), 'Test doc header exists');
 
 
+
+
 done_testing;
 
 __END__