Remove extract_text and extract_doc in favor of extract_sigle
Change-Id: I6577a8f453baab96e684da3b2238a31b4f0175e3
diff --git a/Changes b/Changes
index bd13616..4054bd0 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.37 2019-02-13
+0.37 2019-02-21
- Support for 'koral:field' array.
- Support for Koral versioning.
- Added tests for english sources.
@@ -6,6 +6,8 @@
Wikipedia resources.
- Ignore temporary extraction
on directory archiving.
+ - Remove extract_text and extract_doc in
+ favor of extract_sigle for archives.
0.36 2019-01-22
- Support for non-word tokens (fixes #5).
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index 00b87b2..3606086 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -68,7 +68,7 @@
my $nr = shift // 0;
my $file = $self->[$nr]->[0];
my ($header) = `unzip -l -UU -qq $file "*/header.xml"`;
- return $header =~ m![\s\t]\.[/\\]! ? 1 : 0;
+ return ($header && $header =~ m![\s\t]\.[/\\]!) ? 1 : 0;
};
@@ -212,140 +212,6 @@
};
-sub extract_doc_new {
- my ($self, $doc_path, $target_dir, $jobs) = @_;
-
- my ($prefix, $corpus, $doc) = $self->split_path(
- $doc_path . '/UNKNOWN' ) or return;
-
- my @cmds = $self->cmds_from_sigle(
- [join('/', $corpus, $doc)], $prefix
- );
-
- @cmds = map {
- push @{$_}, '-d', $target_dir;
- $_;
- } @cmds;
-
- return $self->_extract($jobs, @cmds);
-};
-
-
-# Extract document files to a directory
-sub extract_doc {
- my $self = shift;
- my ($doc_path, $target_dir, $jobs) = @_;
-
- my $first = 1;
-
- my @init_cmd = (
- 'unzip', # Use unzip program
- '-qo', # quietly overwrite all existing files
- '-uo',
- '-d', $target_dir # Extract into target directory
- );
-
- my ($prefix, $corpus, $doc) = $self->split_path($doc_path . '/UNKNOWN' ) or return;
- my @cmds;
-
- # Iterate over all attached archives
- foreach my $archive (@$self) {
-
- # $_ is the zip
- my @cmd = @init_cmd;
- push(@cmd, $archive->[0]); # Extract from zip
-
- # Add some interesting files for extraction
- # Can't use catfile(), as this removes the '.' prefix
- my @breadcrumbs = ($corpus);
-
- # If the prefix is not forbidden - prefix!
- unshift @breadcrumbs, $prefix if ($prefix && $archive->[1]);
-
- if ($first) {
- # Only extract from first file
- push(@cmd, join('/', @breadcrumbs, 'header.xml'));
- $first = 0;
- };
-
- # With wildcard
- if (index($doc, '*') > 0) {
- push @breadcrumbs, $doc;
- }
-
- # As a folder sigle
- else {
- push @breadcrumbs, $doc, '*';
- };
-
- push(@cmd, join('/', @breadcrumbs));
-
- # Run system call
- push @cmds, \@cmd;
- };
-
- $self->_extract($jobs, @cmds);
-};
-
-
-# Extract text files to a directory
-sub extract_text {
- my $self = shift;
- my $text_path = shift;
- my $target_dir = shift;
-
- my $first = 1;
-
- my @init_cmd = (
- 'unzip', # Use unzip program
- '-qo', # quietly overwrite all existing files
- '-uo',
- '-d', $target_dir # Extract into target directory
- );
-
- my ($prefix, $corpus, $doc, $text) = $self->split_path($text_path) or return;
-
- # Iterate over all attached archives
- foreach my $archive (@$self) {
-
- # $_ is the zip
- my @cmd = @init_cmd;
- push(@cmd, $archive->[0]); # Extract from zip
-
- # Add some interesting files for extraction
- # Can't use catfile(), as this removes the '.' prefix
- my @breadcrumbs = ($corpus);
-
- # If the prefix is not forbidden - prefix!
- unshift @breadcrumbs, $prefix if ($prefix && $archive->[1]);
-
- if ($first) {
-
- # Only extract from first file
- push(@cmd, join('/', @breadcrumbs, 'header.xml'));
- push(@cmd, join('/', @breadcrumbs, $doc, 'header.xml'));
- $first = 0;
- };
-
- # With prefix
- push @breadcrumbs, $doc, $text, '*';
-
- push(@cmd, join('/', @breadcrumbs));
-
- # Run system call
- system(@cmd);
-
- # Check for return code
- if ($? != 0) {
- carp("System call '" . join(' ', @cmd) . "' errors " . $?);
- return;
- };
- };
-
- # Fine
- return 1;
-};
-
# Extract from sigle
sub extract_sigle {
@@ -398,7 +264,7 @@
$prefix_check = 1;
};
- unshift @breadcrumbs, $prefix if $prefix;
+ unshift @breadcrumbs, '.' if $prefix;
if ($first) {
diff --git a/script/korapxml2krill b/script/korapxml2krill
index d53004b..5e5b315 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -769,9 +769,10 @@
# TODO: Make this OS independent
print '... ' . (
- $archive->extract_text(
- ($prefix ? './' : '') . $_, $output
- ) ? '' : 'not '
+ # TODO:
+ # - prefix???
+ $archive->extract_sigle([$_], $output, $jobs)
+ ? '' : 'not '
);
print "extracted.\n";
};
@@ -1004,7 +1005,7 @@
# because extraction can be horrible slow!
# Extract from archive
- if ($archive->extract_text($dirs[$i], $temp)) {
+ if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
# Create corpus directory
my $input = catdir("$temp", $corpus);
@@ -1108,13 +1109,9 @@
print "\n";
- # TODO: Make this OS independent
- my $path = ($prefix ? './' : '') . $_;
-
print '... ' . (
- $archive->extract_doc(
- $path, $output, $sequential_extraction ? 1 : $jobs
- ) ? '' : 'not '
+ $archive->extract_sigle([$_], $output, $sequential_extraction ? 1 : $jobs)
+ ? '' : 'not '
);
print "extracted.\n";
}
diff --git a/t/annotation/mdp_dependency.t b/t/annotation/mdp_dependency.t
index 1506d8b..de9144c 100644
--- a/t/annotation/mdp_dependency.t
+++ b/t/annotation/mdp_dependency.t
@@ -42,7 +42,7 @@
my $dir = tempdir();
my $f_path = 'WPD15/A00/00081';
-$archive->extract_text($f_path, $dir);
+$archive->extract_sigle([$f_path], $dir);
ok(my $doc = KorAP::XML::Krill->new( path => $dir . '/' . $f_path));
diff --git a/t/archive.t b/t/archive.t
index f86277f..71bedce 100644
--- a/t/archive.t
+++ b/t/archive.t
@@ -36,7 +36,7 @@
{
local $SIG{__WARN__} = sub {};
- ok($archive->extract_text('./TEST/BSP/8', $dir), 'Wrong path');
+ ok($archive->extract_sigle(['TEST/BSP/8'], $dir), 'Wrong path');
};
ok(-d catdir($dir, 'TEST'), 'Test corpus directory exists');
diff --git a/t/multiple_archives.t b/t/multiple_archives.t
index 1da4e9d..7e2a0e7 100644
--- a/t/multiple_archives.t
+++ b/t/multiple_archives.t
@@ -60,7 +60,7 @@
my $dir = tempdir(CLEANUP => 1);
{
local $SIG{__WARN__} = sub {};
- ok($archive->extract_text($list[0], $dir), 'Wrong path');
+ ok($archive->extract_sigle([$list[0]], $dir), 'Wrong path');
};
ok(-d catdir($dir, 'WPD15'), 'Test corpus directory exists');
diff --git a/t/script/extract.t b/t/script/extract.t
index 11d7cf3..084908b 100644
--- a/t/script/extract.t
+++ b/t/script/extract.t
@@ -51,14 +51,14 @@
'--cache' => $cache
);
-my $sep = qr!\.\.\.[\n\r]+?\.\.\.!;
+# my $sep = qr!\.\.\.[\n\r]+?\.\.\.!;
# Test without compression
stdout_like(
sub {
system($call);
},
- qr!TEST/BSP/1 $sep extracted!s,
+ qr!TEST/BSP/1 .* extracted!s,
# qr!TEST/BSP/1 $sep extracted.!s,
$call
);
@@ -91,7 +91,8 @@
sub {
system($call);
},
- qr!TEST/BSP/4 $sep extracted.!s,
+ # qr!TEST/BSP/4 $sep extracted.!s,
+ qr!TEST/BSP/4 .* extracted.!s,
$call
);
@@ -100,7 +101,8 @@
sub {
system($call);
},
- qr!TEST/BSP/5 $sep extracted.!s,
+ # qr!TEST/BSP/5 $sep extracted.!s,
+ qr!TEST/BSP/5 .* extracted.!s,
$call
);
@@ -110,7 +112,6 @@
ok(-d catdir($output2, 'TEST', 'BSP', '4'), 'Directory created');
ok(!-d catdir($output2, 'TEST', 'BSP', '5'), 'Directory created');
-
# Test with document sigle
my $input_rei = catdir($f, '..', 'corpus', 'archive_rei.zip');
ok(-f $input_rei, 'Input archive found');
@@ -147,7 +148,6 @@
ok(-d catdir($output2, 'REI', 'BNG', '00128'), 'Directory created');
ok(!-d catdir($output2, 'REI', 'RBR', '00610'), 'Directory not created');
-
# Test with document sigle
$output2 = undef;
$output2 = tempdir(CLEANUP => 1);
@@ -176,7 +176,7 @@
sub {
system($call);
},
- qr!REI/RBR $sep extracted!s,
+ qr!REI/RBR .* extracted!s,
$call
);
@@ -205,7 +205,7 @@
sub {
system($call);
},
- qr!WPD15/A00/00081 $sep extracted!s,
+ qr!WPD15/A00/00081 .* extracted!s,
$call
);
@@ -239,7 +239,8 @@
sub {
system($call);
},
- qr!TEST/BSP "Example"\/1 $sep extracted!s,
+ qr!TEST/BSP "Example"\/1 .* extracted!s,
+ # qr!TEST/BSP "Example"\/1 $sep extracted!s,
# qr!Extract .+? TEST/BSP "Example"\/1!s,
$call
);