Remove extract_text and extract_doc in favor of extract_sigle

Change-Id: I6577a8f453baab96e684da3b2238a31b4f0175e3
diff --git a/Changes b/Changes
index bd13616..4054bd0 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.37 2019-02-13
+0.37 2019-02-21
         - Support for 'koral:field' array.
         - Support for Koral versioning.
         - Added tests for english sources.
@@ -6,6 +6,8 @@
           Wikipedia resources.
         - Ignore temporary extraction
           on directory archiving.
+        - Remove extract_text and extract_doc in
+          favor of extract_sigle for archives.
 
 0.36 2019-01-22
         - Support for non-word tokens (fixes #5).
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index 00b87b2..3606086 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -68,7 +68,7 @@
   my $nr = shift // 0;
   my $file = $self->[$nr]->[0];
   my ($header) = `unzip -l -UU -qq $file "*/header.xml"`;
-  return $header =~ m![\s\t]\.[/\\]! ? 1 : 0;
+  return ($header && $header =~ m![\s\t]\.[/\\]!) ? 1 : 0;
 };
 
 
@@ -212,140 +212,6 @@
 };
 
 
-sub extract_doc_new {
-  my ($self, $doc_path, $target_dir, $jobs) = @_;
-
-  my ($prefix, $corpus, $doc) = $self->split_path(
-    $doc_path . '/UNKNOWN' ) or return;
-
-  my @cmds = $self->cmds_from_sigle(
-    [join('/', $corpus, $doc)], $prefix
-  );
-
-  @cmds = map {
-    push @{$_}, '-d', $target_dir;
-    $_;
-  } @cmds;
-
-  return $self->_extract($jobs, @cmds);
-};
-
-
-# Extract document files to a directory
-sub extract_doc {
-  my $self = shift;
-  my ($doc_path, $target_dir, $jobs) = @_;
-
-  my $first = 1;
-
-  my @init_cmd = (
-    'unzip',          # Use unzip program
-    '-qo',            # quietly overwrite all existing files
-    '-uo',
-    '-d', $target_dir # Extract into target directory
-  );
-
-  my ($prefix, $corpus, $doc) = $self->split_path($doc_path . '/UNKNOWN' ) or return;
-  my @cmds;
-
-  # Iterate over all attached archives
-  foreach my $archive (@$self) {
-
-    # $_ is the zip
-    my @cmd = @init_cmd;
-    push(@cmd, $archive->[0]); # Extract from zip
-
-    # Add some interesting files for extraction
-    # Can't use catfile(), as this removes the '.' prefix
-    my @breadcrumbs = ($corpus);
-
-    # If the prefix is not forbidden - prefix!
-    unshift @breadcrumbs, $prefix if ($prefix && $archive->[1]);
-
-    if ($first) {
-      # Only extract from first file
-      push(@cmd, join('/', @breadcrumbs, 'header.xml'));
-      $first = 0;
-    };
-
-    # With wildcard
-    if (index($doc, '*') > 0) {
-      push @breadcrumbs, $doc;
-    }
-
-    # As a folder sigle
-    else {
-      push @breadcrumbs, $doc, '*';
-    };
-
-    push(@cmd, join('/', @breadcrumbs));
-
-    # Run system call
-    push @cmds, \@cmd;
-  };
-
-  $self->_extract($jobs, @cmds);
-};
-
-
-# Extract text files to a directory
-sub extract_text {
-  my $self = shift;
-  my $text_path = shift;
-  my $target_dir = shift;
-
-  my $first = 1;
-
-  my @init_cmd = (
-    'unzip',          # Use unzip program
-    '-qo',            # quietly overwrite all existing files
-    '-uo',
-    '-d', $target_dir # Extract into target directory
-  );
-
-  my ($prefix, $corpus, $doc, $text) = $self->split_path($text_path) or return;
-
-  # Iterate over all attached archives
-  foreach my $archive (@$self) {
-
-    # $_ is the zip
-    my @cmd = @init_cmd;
-    push(@cmd, $archive->[0]); # Extract from zip
-
-    # Add some interesting files for extraction
-    # Can't use catfile(), as this removes the '.' prefix
-    my @breadcrumbs = ($corpus);
-
-    # If the prefix is not forbidden - prefix!
-    unshift @breadcrumbs, $prefix if ($prefix && $archive->[1]);
-
-    if ($first) {
-
-      # Only extract from first file
-      push(@cmd, join('/', @breadcrumbs, 'header.xml'));
-      push(@cmd, join('/', @breadcrumbs, $doc, 'header.xml'));
-      $first = 0;
-    };
-
-    # With prefix
-    push @breadcrumbs, $doc, $text, '*';
-
-    push(@cmd, join('/', @breadcrumbs));
-
-    # Run system call
-    system(@cmd);
-
-    # Check for return code
-    if ($? != 0) {
-      carp("System call '" . join(' ', @cmd) . "' errors " . $?);
-      return;
-    };
-  };
-
-  # Fine
-  return 1;
-};
-
 
 # Extract from sigle
 sub extract_sigle {
@@ -398,7 +264,7 @@
         $prefix_check = 1;
       };
 
-      unshift @breadcrumbs, $prefix if $prefix;
+      unshift @breadcrumbs, '.' if $prefix;
 
       if ($first) {
 
diff --git a/script/korapxml2krill b/script/korapxml2krill
index d53004b..5e5b315 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -769,9 +769,10 @@
       # TODO: Make this OS independent
       print '... ' . (
 
-        $archive->extract_text(
-          ($prefix ? './' : '') . $_, $output
-        ) ? '' : 'not '
+        # TODO:
+        #   - prefix???
+        $archive->extract_sigle([$_], $output, $jobs)
+        ? '' : 'not '
       );
       print "extracted.\n";
     };
@@ -1004,7 +1005,7 @@
       # because extraction can be horrible slow!
 
       # Extract from archive
-      if ($archive->extract_text($dirs[$i], $temp)) {
+      if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
 
         # Create corpus directory
         my $input = catdir("$temp", $corpus);
@@ -1108,13 +1109,9 @@
 
         print "\n";
 
-        # TODO: Make this OS independent
-        my $path = ($prefix ? './' : '') . $_;
-
         print '... ' . (
-          $archive->extract_doc(
-            $path, $output, $sequential_extraction ? 1 : $jobs
-          ) ? '' : 'not '
+          $archive->extract_sigle([$_], $output, $sequential_extraction ? 1 : $jobs)
+            ? '' : 'not '
         );
         print "extracted.\n";
       }
diff --git a/t/annotation/mdp_dependency.t b/t/annotation/mdp_dependency.t
index 1506d8b..de9144c 100644
--- a/t/annotation/mdp_dependency.t
+++ b/t/annotation/mdp_dependency.t
@@ -42,7 +42,7 @@
 my $dir = tempdir();
 
 my $f_path = 'WPD15/A00/00081';
-$archive->extract_text($f_path, $dir);
+$archive->extract_sigle([$f_path], $dir);
 
 ok(my $doc = KorAP::XML::Krill->new( path => $dir . '/' . $f_path));
 
diff --git a/t/archive.t b/t/archive.t
index f86277f..71bedce 100644
--- a/t/archive.t
+++ b/t/archive.t
@@ -36,7 +36,7 @@
 
 {
   local $SIG{__WARN__} = sub {};
-  ok($archive->extract_text('./TEST/BSP/8', $dir), 'Wrong path');
+  ok($archive->extract_sigle(['TEST/BSP/8'], $dir), 'Wrong path');
 };
 
 ok(-d catdir($dir, 'TEST'), 'Test corpus directory exists');
diff --git a/t/multiple_archives.t b/t/multiple_archives.t
index 1da4e9d..7e2a0e7 100644
--- a/t/multiple_archives.t
+++ b/t/multiple_archives.t
@@ -60,7 +60,7 @@
 my $dir = tempdir(CLEANUP => 1);
 {
   local $SIG{__WARN__} = sub {};
-  ok($archive->extract_text($list[0], $dir), 'Wrong path');
+  ok($archive->extract_sigle([$list[0]], $dir), 'Wrong path');
 };
 
 ok(-d catdir($dir, 'WPD15'), 'Test corpus directory exists');
diff --git a/t/script/extract.t b/t/script/extract.t
index 11d7cf3..084908b 100644
--- a/t/script/extract.t
+++ b/t/script/extract.t
@@ -51,14 +51,14 @@
   '--cache' => $cache
 );
 
-my $sep = qr!\.\.\.[\n\r]+?\.\.\.!;
+# my $sep = qr!\.\.\.[\n\r]+?\.\.\.!;
 
 # Test without compression
 stdout_like(
   sub {
     system($call);
   },
-  qr!TEST/BSP/1 $sep extracted!s,
+  qr!TEST/BSP/1 .* extracted!s,
 #  qr!TEST/BSP/1 $sep extracted.!s,
   $call
 );
@@ -91,7 +91,8 @@
   sub {
     system($call);
   },
-  qr!TEST/BSP/4 $sep extracted.!s,
+  # qr!TEST/BSP/4 $sep extracted.!s,
+  qr!TEST/BSP/4 .* extracted.!s,
   $call
 );
 
@@ -100,7 +101,8 @@
   sub {
     system($call);
   },
-  qr!TEST/BSP/5 $sep extracted.!s,
+  # qr!TEST/BSP/5 $sep extracted.!s,
+  qr!TEST/BSP/5 .* extracted.!s,
   $call
 );
 
@@ -110,7 +112,6 @@
 ok(-d catdir($output2, 'TEST', 'BSP', '4'), 'Directory created');
 ok(!-d catdir($output2, 'TEST', 'BSP', '5'), 'Directory created');
 
-
 # Test with document sigle
 my $input_rei = catdir($f, '..', 'corpus', 'archive_rei.zip');
 ok(-f $input_rei, 'Input archive found');
@@ -147,7 +148,6 @@
 ok(-d catdir($output2, 'REI', 'BNG', '00128'), 'Directory created');
 ok(!-d catdir($output2, 'REI', 'RBR', '00610'), 'Directory not created');
 
-
 # Test with document sigle
 $output2 = undef;
 $output2 = tempdir(CLEANUP => 1);
@@ -176,7 +176,7 @@
   sub {
     system($call);
   },
-  qr!REI/RBR $sep extracted!s,
+  qr!REI/RBR .* extracted!s,
   $call
 );
 
@@ -205,7 +205,7 @@
   sub {
     system($call);
   },
-  qr!WPD15/A00/00081 $sep extracted!s,
+  qr!WPD15/A00/00081 .* extracted!s,
   $call
 );
 
@@ -239,7 +239,8 @@
   sub {
     system($call);
   },
-  qr!TEST/BSP "Example"\/1 $sep extracted!s,
+  qr!TEST/BSP "Example"\/1 .* extracted!s,
+  # qr!TEST/BSP "Example"\/1 $sep extracted!s,
   # qr!Extract .+? TEST/BSP "Example"\/1!s,
   $call
 );