Add extract_sigle method to archive

Change-Id: Ic2eb6578c6e8fb57e2191f685a000fd712ec463d
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index bbdcfbd..00b87b2 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -1,6 +1,7 @@
 package KorAP::XML::Archive;
 use Carp qw/carp/;
 use Mojo::Util qw/quote/;
+use List::Util qw/uniq/;
 use File::Spec::Functions qw(rel2abs);
 use strict;
 use warnings;
@@ -211,6 +212,24 @@
 };
 
 
+sub extract_doc_new {
+  my ($self, $doc_path, $target_dir, $jobs) = @_;
+
+  my ($prefix, $corpus, $doc) = $self->split_path(
+    $doc_path . '/UNKNOWN' ) or return;
+
+  my @cmds = $self->cmds_from_sigle(
+    [join('/', $corpus, $doc)], $prefix
+  );
+
+  @cmds = map {
+    push @{$_}, '-d', $target_dir;
+    $_;
+  } @cmds;
+
+  return $self->_extract($jobs, @cmds);
+};
+
 
 # Extract document files to a directory
 sub extract_doc {
@@ -257,7 +276,7 @@
     # As a folder sigle
     else {
       push @breadcrumbs, $doc, '*';
-    }
+    };
 
     push(@cmd, join('/', @breadcrumbs));
 
@@ -328,6 +347,94 @@
 };
 
 
+# Extract from sigle
+sub extract_sigle {
+  my ($self, $sigle, $target_dir, $jobs) = @_;
+  my @cmds = $self->cmds_from_sigle($sigle);
+
+  @cmds = map {
+    push @{$_}, '-d', $target_dir;
+    $_;
+  } @cmds;
+
+  return $self->_extract($jobs, @cmds);
+};
+
+
+# Create commands for sigle
+sub cmds_from_sigle {
+  my ($self, $sigle) = @_;
+
+  my $first = 1;
+
+  my @init_cmd = (
+    'unzip',          # Use unzip program
+    '-qo',            # quietly overwrite all existing files
+    '-uo',
+  );
+
+  my @cmds;
+
+  # Iterate over all attached archives
+  for  (my $i = 0; $i < @$self; $i++) {
+    my $archive = $self->[$i];
+    my $prefix_check = 0;
+    my $prefix = 0;
+
+    # $_ is the zip
+    my @cmd = @init_cmd;
+    push(@cmd, $archive->[0]); # Extract from zip
+
+    foreach (@$sigle) {
+      my ($corpus,$doc,$text) = split '/', $_;
+
+      # Add some interesting files for extraction
+      # Can't use catfile(), as this removes the '.' prefix
+      my @breadcrumbs = ($corpus);
+
+      # If the prefix is not forbidden - prefix!
+      unless ($prefix_check) {
+        $prefix = $self->check_prefix($i);
+        $prefix_check = 1;
+      };
+
+      unshift @breadcrumbs, $prefix if $prefix;
+
+      if ($first) {
+
+        # Only extract from first file
+        push(@cmd, join('/', @breadcrumbs, 'header.xml'));
+        push(@cmd, join('/', @breadcrumbs, $doc, 'header.xml'));
+      };
+
+      # With wildcard on doc level
+      if (index($doc, '*') > 0) {
+        push @breadcrumbs, $doc;
+      }
+
+      # For full-defined doc sigle
+      elsif (!$text) {
+        push @breadcrumbs, $doc, '*';
+      }
+
+      # For text sigle
+      else {
+        push @breadcrumbs, $doc, $text, '*';
+      }
+
+      # Add to command
+      push(@cmd, join('/', @breadcrumbs));
+    };
+
+    # Add to commands
+    push @cmds, [uniq @cmd];
+
+    $first = 0;
+  };
+
+  return @cmds;
+};
+
 1;
 
 
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 3530288..d53004b 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -691,72 +691,75 @@
     # Add further annotation archived
     $archive->attach($_) foreach @input[1..$#input];
 
-    my $prefix = 1;
+    # Will set @sigle
+    my $prefix = set_sigle($archive);
 
-    # No sigles given
-    unless (@sigle) {
-
-      # Get files
-      foreach ($archive->list_texts) {
-
-        # Split path information
-        ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
-
-        # TODO: Make this OS independent
-        push @sigle, join '/', $corpus, $doc, $text;
-      };
-    }
-
-    # Check sigle for doc sigles
-    else {
-      my @new_sigle;
-
-      my $prefix_check = 0;
-
-      # Iterate over all sigle
-      foreach (@sigle) {
-
-        # Sigle is a doc sigle
-        if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
-
-          print "$_ ...";
-          # Check if a prefix is needed
-          unless ($prefix_check) {
-
-            if ($prefix = $archive->check_prefix) {
-              print " with prefix ...";
-            };
-            $prefix_check = 1;
-          };
-
-          print "\n";
-
-          # TODO: Make this OS independent
-          my $path = ($prefix ? './' : '') . $_;
-
-          print '... ' . (
-            $archive->extract_doc(
-              $path, $output, $sequential_extraction ? 1 : $jobs
-            ) ? '' : 'not '
-          );
-          print "extracted.\n";
-        }
-
-        # Sigle is a text sigle
-        else {
-          push @new_sigle, $_;
-
-          unless ($prefix_check) {
-
-            if ($prefix = $archive->check_prefix) {
-              print " with prefix ...";
-            };
-            $prefix_check = 1;
-          };
-        };
-      };
-      @sigle = @new_sigle;
-    };
+#    my $prefix = 1;
+#
+#    # No sigles given
+#    unless (@sigle) {
+#
+#      # Get files
+#      foreach ($archive->list_texts) {
+#
+#        # Split path information
+#        ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
+#
+#        # TODO: Make this OS independent
+#        push @sigle, join '/', $corpus, $doc, $text;
+#      };
+#    }
+#
+#    # Check sigle for doc sigles
+#    else {
+#      my @new_sigle;
+#
+#      my $prefix_check = 0;
+#
+#      # Iterate over all sigle
+#      foreach (@sigle) {
+#
+#        # Sigle is a doc sigle
+#        if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
+#
+#          print "$_ ...";
+#          # Check if a prefix is needed
+#          unless ($prefix_check) {
+#
+#            if ($prefix = $archive->check_prefix) {
+#              print " with prefix ...";
+#            };
+#            $prefix_check = 1;
+#          };
+#
+#          print "\n";
+#
+#          # TODO: Make this OS independent
+#          my $path = ($prefix ? './' : '') . $_;
+#
+#          print '... ' . (
+#            $archive->extract_doc(
+#              $path, $output, $sequential_extraction ? 1 : $jobs
+#            ) ? '' : 'not '
+#          );
+#          print "extracted.\n";
+#        }
+#
+#        # Sigle is a text sigle
+#        else {
+#          push @new_sigle, $_;
+#
+#          unless ($prefix_check) {
+#
+#            if ($prefix = $archive->check_prefix) {
+#              print " with prefix ...";
+#            };
+#            $prefix_check = 1;
+#          };
+#        };
+#      };
+#      @sigle = @new_sigle;
+#    };
 
     # Iterate over all given sigles and extract
     foreach (@sigle) {
@@ -811,7 +814,7 @@
       # Add some random extra to avoid clashes with multiple archives
       $extract_dir = catdir($extract_dir, random_string('cccccc'));
 
-      # Extract to temprary directory
+      # Extract to temporary directory
       if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
         @input = ($extract_dir);
       }
@@ -829,8 +832,6 @@
     };
   };
 
-  # TODO: Support sigles
-
   # Zero means: everything runs in the parent process
   my $pool = Parallel::ForkManager->new($jobs);
 
@@ -971,6 +972,9 @@
     # Add further annotation archived
     $archive->attach($_) foreach @input[1..$#input];
 
+    # Get sigles to extract
+    my $prefix = set_sigle($archive);
+
     print "Start processing ...\n";
     $t = Benchmark->new;
     my @dirs = $archive->list_texts;
@@ -1056,6 +1060,86 @@
 };
 
 
+# For an archive, this will create the list
+# of all sigles to process
+sub set_sigle {
+  my $archive = shift;
+
+  my $prefix = 1;
+  my @dirs = ();
+
+  # No sigles given
+  unless (@sigle) {
+
+    # Get files
+    foreach ($archive->list_texts) {
+
+      push @dirs, $_;
+
+      # Split path information
+      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
+
+      # TODO: Make this OS independent
+      push @sigle, join '/', $corpus, $doc, $text;
+    };
+  }
+
+  # Check sigle for doc sigles
+  else {
+    my @new_sigle;
+
+    my $prefix_check = 0;
+
+    # Iterate over all sigle
+    foreach (@sigle) {
+
+      # Sigle is a doc sigle
+      if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
+
+        print "$_ ...";
+        # Check if a prefix is needed
+        unless ($prefix_check) {
+
+          if ($prefix = $archive->check_prefix) {
+            print " with prefix ...";
+          };
+          $prefix_check = 1;
+        };
+
+        print "\n";
+
+        # TODO: Make this OS independent
+        my $path = ($prefix ? './' : '') . $_;
+
+        print '... ' . (
+          $archive->extract_doc(
+            $path, $output, $sequential_extraction ? 1 : $jobs
+          ) ? '' : 'not '
+        );
+        print "extracted.\n";
+      }
+
+      # Sigle is a text sigle
+      else {
+        push @new_sigle, $_;
+
+        unless ($prefix_check) {
+
+          if ($prefix = $archive->check_prefix) {
+            print " with prefix ...";
+          };
+          $prefix_check = 1;
+        };
+      };
+    };
+    @sigle = @new_sigle;
+  };
+
+  return $prefix;
+};
+
+
+
 # Cleanup temporary extraction directory
 if ($extract_dir) {
   my $objects = remove_tree($extract_dir, { safe => 1 });
@@ -1344,8 +1428,8 @@
 Supported parameters are:
 C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
 C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
-C<output>,
-C<temp-extract>, C<sequential-extraction>,
+C<output>, C<koral>,
+C<tempary-extract>, C<sequential-extraction>,
 C<base-sentences>, C<base-paragraphs>,
 C<base-pagebreaks>,
 C<skip> (semicolon separated), C<sigle>
diff --git a/t/archive.t b/t/archive.t
index 1cd3c00..f86277f 100644
--- a/t/archive.t
+++ b/t/archive.t
@@ -1,6 +1,7 @@
 #!/usr/bin/env perl
 use strict;
 use warnings;
+use Data::Dumper;
 use Test::More;
 use File::Basename 'dirname';
 use File::Spec::Functions qw/catfile catdir/;
@@ -52,6 +53,35 @@
 $archive = KorAP::XML::Archive->new($file);
 ok(!$archive->check_prefix, 'Archive has no dot prefix');
 
+my @cmd = map { join ' ', @{$_} } $archive->cmds_from_sigle(['REI/RB*', 'REI/BNG/00071']);
+
+like($cmd[0], qr!unzip -qo -uo t/corpus/archive_rei\.zip!);
+like($cmd[0], qr!\QREI/header.xml REI/RB*/header.xml REI/RB* REI/BNG/header.xml REI/BNG/00071/*\E!);
+ok(!$cmd[1]);
+
+# New temporary directory
+$dir = tempdir(CLEANUP => 1);
+
+{
+  local $SIG{__WARN__} = sub {};
+  ok($archive->extract_sigle(['REI/RB*', 'REI/BNG/00071'], $dir), 'Fine');
+};
+
+ok(-d catdir($dir, 'REI'), 'Test corpus directory exists');
+ok(-d catdir($dir, 'REI','BNG'), 'Test corpus directory exists');
+ok(-d catdir($dir, 'REI','BNG','00071'), 'Test corpus directory exists');
+
+ok(-f catdir($dir, 'REI', 'header.xml'), 'Test corpus directory exists');
+ok(-f catdir($dir, 'REI','BNG', 'header.xml'), 'Test corpus directory exists');
+ok(-f catdir($dir, 'REI','BNG','00071', 'header.xml'), 'Test corpus directory exists');
+
+ok(-f catdir($dir, 'REI','RBR', 'header.xml'), 'Test corpus directory exists');
+ok(-f catdir($dir, 'REI','RBR','00610', 'header.xml'), 'Test corpus directory exists');
+ok(-f catdir($dir, 'REI','RBR','00610', 'header.xml'), 'Test corpus directory exists');
+
+ok(!-e catdir($dir, 'REI','BNG','00128'), 'Test corpus directory does not exist');
+
+
 done_testing;
 
 __END__
diff --git a/t/script/archive.t b/t/script/archive.t
index 8d0fba1..290784b 100644
--- a/t/script/archive.t
+++ b/t/script/archive.t
@@ -38,7 +38,6 @@
 
 my $input = catfile($f, '..', 'corpus', 'archive.zip');
 ok(-f $input, 'Input archive found');
-
 my $output = File::Temp->newdir(CLEANUP => 0);
 $output->unlink_on_destroy(0);
 
@@ -221,5 +220,38 @@
   $call
 );
 
+
+
+# Test with sigles
+$input = catfile($f, '..', 'corpus', 'archive.zip');
+ok(-f $input, 'Input archive found');
+
+unlink($output);
+
+$call = join(
+  ' ',
+  'perl', $script,
+  'archive',
+  '--input' => '' . $input,
+  '--output' => $output,
+  '--sigle' => 'TEST/BSP/2',
+  '--sigle' => 'TEST/BSP/5',
+  '-t' => 'Base#tokens_aggr',
+  '-m' => 'Sgbr'
+);
+
+{
+  local $SIG{__WARN__} = sub {};
+  my $out = stdout_from(sub { system($call); });
+
+  like($out, qr!TEST-BSP-1\.json!s, $call);
+
+  $out =~ m!Processed (.+?\.json)!;
+  $json = $1;
+};
+
+ok(-f $json, 'Json file exists');
+
+
 done_testing;
 __END__
diff --git a/t/script/extract.t b/t/script/extract.t
index e9f666f..11d7cf3 100644
--- a/t/script/extract.t
+++ b/t/script/extract.t
@@ -167,7 +167,7 @@
   sub {
     system($call);
   },
-  qr!Extract .+? REI/BN\*!s,
+  qr!Extract .+? REI/BN!s,
   $call
 );