Added extraction method for documents in archives

Change-Id: Id4ea7d9801a5750c77f81a2251d389adb6e06d31
diff --git a/Changes b/Changes
index f3f575a..b19f50a 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.22 2016-10-26
+        - Added support for document extraction
+
 0.21 2016-10-14
         - Improved Windows support
 
diff --git a/MANIFEST b/MANIFEST
index f6e215b..5e4784b 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -102,6 +102,7 @@
 t/script/extract.t
 t/script/archive.t
 t/corpus/archive.zip
+t/corpus/archive_rei.zip
 t/corpus/BZK/header.xml
 t/corpus/GOE/header.xml
 t/corpus/VDI/header.xml
diff --git a/Readme.pod b/Readme.pod
index cd6258f..296cf13 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -10,8 +10,8 @@
 =head1 SYNOPSIS
 
   $ korapxml2krill -z --input <directory> --output <filename>
+  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
   $ korapxml2krill archive -z --input <directory|archive> --output <directory>
-  $ korapxml2krill extract --input <directory|archive> --output <filename> --sigle <SIGLE>
 
 
 =head1 DESCRIPTION
@@ -161,10 +161,11 @@
 
 =item B<--sigle|-sg>
 
-Extract the given text sigles.
+Extract the given texts.
 Can be set multiple times.
 I<Currently only supported on C<extract>.>
 Sigles have the structure C<Corpus>/C<Document>/C<Text>.
+In case the C<Text> path is omitted, the whole document will be extracted.
 
 =item B<--log|-l>
 
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index 56e0622..17a4f53 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -59,6 +59,16 @@
 };
 
 
+# Check, if the archive has a prefix
+sub check_prefix {
+  my $self = shift;
+  my $nr = shift // 0;
+  my $file = $self->[$nr]->[0];
+  my ($header) = `unzip -l -UU -qq $file "*/header.xml"`;
+  return $header =~ m![\s\t]\.[/\\]! ? 1 : 0;
+};
+
+
 # Split a text path to prefix, corpus, document, text
 sub split_path {
   my $self = shift;
@@ -76,12 +86,12 @@
   };
 
   # Unix form
-  if ($text_path =~ m!^([^/]+?)/([^/]+?)/([^/]+?)$!) {
+  if ($text_path =~ m!^([^/]+?)/([^/]+?)[\\/]([^/]+?)$!) {
     return ($prefix, $1, $2, $3);
   }
 
   # Windows form
-  elsif ($text_path =~ m!^([^\\]+?)\\([^\\]+?)\\([^\\]+?)$!) {
+  elsif ($text_path =~ m!^([^\\]+?)\\([^\\]+?)[\\/]([^\\]+?)$!) {
     return ($prefix, $1, $2, $3);
   };
 
@@ -127,9 +137,65 @@
 };
 
 
+# Extract document files to a directory
+sub extract_doc {
+  my $self = shift;
+  my $doc_path = shift;
+  my $target_dir = shift;
 
-# Extract files to a directory
-sub extract {
+  my $first = 1;
+
+  my @init_cmd = (
+    'unzip',          # Use unzip program
+    '-qo',            # quietly overwrite all existing files
+    '-d', $target_dir # Extract into target directory
+  );
+
+  my ($prefix, $corpus, $doc) = $self->split_path($doc_path . '/UNKNOWN' ) or return;
+
+  # Iterate over all attached archives
+  foreach my $archive (@$self) {
+
+    # $_ is the zip
+    my @cmd = @init_cmd;
+    push(@cmd, $archive->[0]); # Extract from zip
+
+    # Add some interesting files for extraction
+    # Can't use catfile(), as this removes the '.' prefix
+    my @breadcrumbs = ($corpus);
+
+    # If the prefix is not forbidden - prefix!
+    unshift @breadcrumbs, $prefix if ($prefix && $archive->[1]);
+
+    if ($first) {
+      # Only extract from first file
+      push(@cmd, join('/', @breadcrumbs, 'header.xml'));
+      # push(@cmd, join('/', @breadcrumbs, $doc, 'header.xml'));
+      $first = 0;
+    };
+
+    # With prefix
+    push @breadcrumbs, $doc, '*';
+
+    push(@cmd, join('/', @breadcrumbs));
+
+    # Run system call
+    system(@cmd);
+
+    # Check for return code
+    if ($? != 0) {
+      carp("System call '" . join(' ', @cmd) . "' errors " . $?);
+      return;
+    };
+  };
+
+  # Fine
+  return 1;
+};
+
+
+# Extract text files to a directory
+sub extract_text {
   my $self = shift;
   my $text_path = shift;
   my $target_dir = shift;
@@ -201,12 +267,14 @@
 
 =head1 attach
 
+=head1 check_prefix
+
 =head1 list_texts
 
 Returns all texts found in the zip file
 
-=head1 extract
+=head1 extract_text
 
-  $archive->extract('./GOE/AGU/0004', '~/temp');
+  $archive->extract_text('./GOE/AGU/0004', '~/temp');
 
 Extract all files for the named text to a certain directory.
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 6d42dbc..6ba77db 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -15,7 +15,7 @@
 use Data::Dumper;
 use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
 
-our $VERSION = '0.21';
+our $VERSION = '0.22';
 
 has 'path';
 has [qw/text_sigle doc_sigle corpus_sigle/];
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 2713342..d00ba9e 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -332,6 +332,40 @@
         # TODO: Make this OS independent
         push @sigle, join '/', $corpus, $doc, $text;
       };
+    }
+
+    # Check sigle for doc sigles
+    else {
+      my @new_sigle;
+
+      my $prefix_check = 0;
+
+      # Iterate over all sigle
+      foreach (@sigle) {
+
+        # Sigle is a doc sigle
+        if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
+          print "$_ ";
+
+          # Check if a prefix is needed
+          unless ($prefix_check) {
+            $prefix = $archive->check_prefix;
+            $prefix_check = 1;
+          };
+
+          # TODO: Make this OS independent
+          print '' . (
+            $archive->extract_doc(
+              ($prefix ? './' : '') . $_, $output
+            ) ? '' : 'not '
+          );
+          print "extracted.\n";
+        }
+        else {
+          push @new_sigle, $_;
+        };
+      };
+      @sigle = @new_sigle;
     };
 
     # Iterate over all given sigles and extract
@@ -340,7 +374,7 @@
 
       # TODO: Make this OS independent
       print '' . (
-        $archive->extract(
+        $archive->extract_text(
           ($prefix ? './' : '') . $_, $output
         ) ? '' : 'not '
       );
@@ -474,7 +508,7 @@
       # because extraction can be horrible slow!
 
       # Extract from archive
-      if ($archive->extract($dirs[$i], $temp)) {
+      if ($archive->extract_text($dirs[$i], $temp)) {
 
         # Create corpus directory
         my $input = catdir("$temp", $corpus);
@@ -533,8 +567,8 @@
 =head1 SYNOPSIS
 
   $ korapxml2krill -z --input <directory> --output <filename>
+  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
   $ korapxml2krill archive -z --input <directory|archive> --output <directory>
-  $ korapxml2krill extract --input <directory|archive> --output <filename> --sigle <SIGLE>
 
 
 =head1 DESCRIPTION
@@ -684,10 +718,11 @@
 
 =item B<--sigle|-sg>
 
-Extract the given text sigles.
+Extract the given texts.
 Can be set multiple times.
 I<Currently only supported on C<extract>.>
 Sigles have the structure C<Corpus>/C<Document>/C<Text>.
+In case the C<Text> path is omitted, the whole document will be extracted.
 
 =item B<--log|-l>
 
diff --git a/t/annotation/mdp_dependency.t b/t/annotation/mdp_dependency.t
index 30a8708..478bbc1 100644
--- a/t/annotation/mdp_dependency.t
+++ b/t/annotation/mdp_dependency.t
@@ -42,7 +42,7 @@
 my $dir = tempdir();
 
 my $f_path = 'WPD15/A00/00081';
-$archive->extract($f_path, $dir);
+$archive->extract_text($f_path, $dir);
 
 ok(my $doc = KorAP::XML::Krill->new( path => $dir . '/' . $f_path));
 
diff --git a/t/archive.t b/t/archive.t
index 3d40549..0a17165 100644
--- a/t/archive.t
+++ b/t/archive.t
@@ -18,6 +18,8 @@
 ok($archive->test, 'Test archive');
 like($archive->path(0), qr/archive\.zip$/, 'Archive path');
 
+ok($archive->check_prefix, 'Archive has dot prefix');
+
 my @list = $archive->list_texts;
 is(scalar @list, 10, 'Found all tests');
 is($list[0], './TEST/BSP/1', 'First document');
@@ -33,7 +35,7 @@
 
 {
   local $SIG{__WARN__} = sub {};
-  ok($archive->extract('./TEST/BSP/8', $dir), 'Wrong path');
+  ok($archive->extract_text('./TEST/BSP/8', $dir), 'Wrong path');
 };
 
 ok(-d catdir($dir, 'TEST'), 'Test corpus directory exists');
@@ -41,6 +43,10 @@
 ok(-d catdir($dir, 'TEST', 'BSP'), 'Test doc directory exists');
 ok(-f catdir($dir, 'TEST', 'BSP', 'header.xml'), 'Test doc header exists');
 
+$file = catfile(dirname(__FILE__), 'corpus','archive_rei.zip');
+$archive = KorAP::XML::Archive->new($file);
+ok(!$archive->check_prefix, 'Archive has no prefix');
+
 
 # TODO: Test attaching!
 
diff --git a/t/corpus/archive_rei.zip b/t/corpus/archive_rei.zip
new file mode 100644
index 0000000..8a00aaa
--- /dev/null
+++ b/t/corpus/archive_rei.zip
Binary files differ
diff --git a/t/multiple_archives.t b/t/multiple_archives.t
index 7865101..1da4e9d 100644
--- a/t/multiple_archives.t
+++ b/t/multiple_archives.t
@@ -60,7 +60,7 @@
 my $dir = tempdir(CLEANUP => 1);
 {
   local $SIG{__WARN__} = sub {};
-  ok($archive->extract($list[0], $dir), 'Wrong path');
+  ok($archive->extract_text($list[0], $dir), 'Wrong path');
 };
 
 ok(-d catdir($dir, 'WPD15'), 'Test corpus directory exists');
diff --git a/t/script/extract.t b/t/script/extract.t
index 0f8f0b7..6c689f9 100644
--- a/t/script/extract.t
+++ b/t/script/extract.t
@@ -104,6 +104,42 @@
 ok(-d catdir($output2, 'TEST', 'BSP', '4'), 'Directory created');
 ok(!-d catdir($output2, 'TEST', 'BSP', '5'), 'Directory created');
 
+
+# Test with document sigle
+my $input_rei = catdir($f, '..', 'corpus', 'archive_rei.zip');
+ok(-f $input_rei, 'Input archive found');
+
+$call = join(
+  ' ',
+  'perl', $script,
+  'extract',
+  '--input' => $input_rei,
+  '--output' => $output2,
+  '-sg' => 'REI/BNG'
+);
+
+# Test with sigle
+stdout_like(
+  sub {
+    system($call);
+  },
+  qr!REI/BNG extracted!s,
+  $call
+);
+
+# Test with sigle
+stdout_unlike(
+  sub {
+    system($call);
+  },
+  qr!REI/RBR extracted!s,
+  $call
+);
+
+ok(-d catdir($output2, 'REI', 'BNG', '00071'), 'Directory created');
+ok(-d catdir($output2, 'REI', 'BNG', '00128'), 'Directory created');
+ok(!-d catdir($output2, 'REI', 'RBR', '00610'), 'Directory not created');
+
 # Check multiple archives
 $output = tempdir(CLEANUP => 1);
 ok(-d $output, 'Output directory exists');