Add extract_sigle method to archive
Change-Id: Ic2eb6578c6e8fb57e2191f685a000fd712ec463d
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index bbdcfbd..00b87b2 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -1,6 +1,7 @@
package KorAP::XML::Archive;
use Carp qw/carp/;
use Mojo::Util qw/quote/;
+use List::Util qw/uniq/;
use File::Spec::Functions qw(rel2abs);
use strict;
use warnings;
@@ -211,6 +212,24 @@
};
+sub extract_doc_new {
+ my ($self, $doc_path, $target_dir, $jobs) = @_;
+
+ my ($prefix, $corpus, $doc) = $self->split_path(
+ $doc_path . '/UNKNOWN' ) or return;
+
+ my @cmds = $self->cmds_from_sigle(
+ [join('/', $corpus, $doc)], $prefix
+ );
+
+ @cmds = map {
+ push @{$_}, '-d', $target_dir;
+ $_;
+ } @cmds;
+
+ return $self->_extract($jobs, @cmds);
+};
+
# Extract document files to a directory
sub extract_doc {
@@ -257,7 +276,7 @@
# As a folder sigle
else {
push @breadcrumbs, $doc, '*';
- }
+ };
push(@cmd, join('/', @breadcrumbs));
@@ -328,6 +347,94 @@
};
+# Extract from sigle
+sub extract_sigle {
+ my ($self, $sigle, $target_dir, $jobs) = @_;
+ my @cmds = $self->cmds_from_sigle($sigle);
+
+ @cmds = map {
+ push @{$_}, '-d', $target_dir;
+ $_;
+ } @cmds;
+
+ return $self->_extract($jobs, @cmds);
+};
+
+
+# Create commands for sigle
+sub cmds_from_sigle {
+ my ($self, $sigle) = @_;
+
+ my $first = 1;
+
+ my @init_cmd = (
+ 'unzip', # Use unzip program
+ '-qo', # quietly overwrite all existing files
+ '-uo',
+ );
+
+ my @cmds;
+
+ # Iterate over all attached archives
+ for (my $i = 0; $i < @$self; $i++) {
+ my $archive = $self->[$i];
+ my $prefix_check = 0;
+ my $prefix = 0;
+
+ # $_ is the zip
+ my @cmd = @init_cmd;
+ push(@cmd, $archive->[0]); # Extract from zip
+
+ foreach (@$sigle) {
+ my ($corpus,$doc,$text) = split '/', $_;
+
+ # Add some interesting files for extraction
+ # Can't use catfile(), as this removes the '.' prefix
+ my @breadcrumbs = ($corpus);
+
+ # If the prefix is not forbidden - prefix!
+ unless ($prefix_check) {
+ $prefix = $self->check_prefix($i);
+ $prefix_check = 1;
+ };
+
+ unshift @breadcrumbs, $prefix if $prefix;
+
+ if ($first) {
+
+ # Only extract from first file
+ push(@cmd, join('/', @breadcrumbs, 'header.xml'));
+ push(@cmd, join('/', @breadcrumbs, $doc, 'header.xml'));
+ };
+
+ # With wildcard on doc level
+ if (index($doc, '*') > 0) {
+ push @breadcrumbs, $doc;
+ }
+
+ # For full-defined doc sigle
+ elsif (!$text) {
+ push @breadcrumbs, $doc, '*';
+ }
+
+ # For text sigle
+ else {
+ push @breadcrumbs, $doc, $text, '*';
+ }
+
+ # Add to command
+ push(@cmd, join('/', @breadcrumbs));
+ };
+
+ # Add to commands
+ push @cmds, [uniq @cmd];
+
+ $first = 0;
+ };
+
+ return @cmds;
+};
+
1;
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 3530288..d53004b 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -691,72 +691,75 @@
# Add further annotation archived
$archive->attach($_) foreach @input[1..$#input];
- my $prefix = 1;
+ # Will set @sigle
+ my $prefix = set_sigle($archive);
- # No sigles given
- unless (@sigle) {
-
- # Get files
- foreach ($archive->list_texts) {
-
- # Split path information
- ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
-
- # TODO: Make this OS independent
- push @sigle, join '/', $corpus, $doc, $text;
- };
- }
-
- # Check sigle for doc sigles
- else {
- my @new_sigle;
-
- my $prefix_check = 0;
-
- # Iterate over all sigle
- foreach (@sigle) {
-
- # Sigle is a doc sigle
- if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
-
- print "$_ ...";
- # Check if a prefix is needed
- unless ($prefix_check) {
-
- if ($prefix = $archive->check_prefix) {
- print " with prefix ...";
- };
- $prefix_check = 1;
- };
-
- print "\n";
-
- # TODO: Make this OS independent
- my $path = ($prefix ? './' : '') . $_;
-
- print '... ' . (
- $archive->extract_doc(
- $path, $output, $sequential_extraction ? 1 : $jobs
- ) ? '' : 'not '
- );
- print "extracted.\n";
- }
-
- # Sigle is a text sigle
- else {
- push @new_sigle, $_;
-
- unless ($prefix_check) {
-
- if ($prefix = $archive->check_prefix) {
- print " with prefix ...";
- };
- $prefix_check = 1;
- };
- };
- };
- @sigle = @new_sigle;
- };
+# my $prefix = 1;
+#
+# # No sigles given
+# unless (@sigle) {
+#
+# # Get files
+# foreach ($archive->list_texts) {
+#
+# # Split path information
+# ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
+#
+# # TODO: Make this OS independent
+# push @sigle, join '/', $corpus, $doc, $text;
+# };
+# }
+#
+# # Check sigle for doc sigles
+# else {
+# my @new_sigle;
+#
+# my $prefix_check = 0;
+#
+# # Iterate over all sigle
+# foreach (@sigle) {
+#
+# # Sigle is a doc sigle
+# if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
+#
+# print "$_ ...";
+# # Check if a prefix is needed
+# unless ($prefix_check) {
+#
+# if ($prefix = $archive->check_prefix) {
+# print " with prefix ...";
+# };
+# $prefix_check = 1;
+# };
+#
+# print "\n";
+#
+# # TODO: Make this OS independent
+# my $path = ($prefix ? './' : '') . $_;
+#
+# print '... ' . (
+# $archive->extract_doc(
+# $path, $output, $sequential_extraction ? 1 : $jobs
+# ) ? '' : 'not '
+# );
+# print "extracted.\n";
+# }
+#
+# # Sigle is a text sigle
+# else {
+# push @new_sigle, $_;
+#
+# unless ($prefix_check) {
+#
+# if ($prefix = $archive->check_prefix) {
+# print " with prefix ...";
+# };
+# $prefix_check = 1;
+# };
+# };
+# };
+# @sigle = @new_sigle;
+# };
# Iterate over all given sigles and extract
foreach (@sigle) {
@@ -811,7 +814,7 @@
# Add some random extra to avoid clashes with multiple archives
$extract_dir = catdir($extract_dir, random_string('cccccc'));
- # Extract to temprary directory
+ # Extract to temporary directory
if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
@input = ($extract_dir);
}
@@ -829,8 +832,6 @@
};
};
- # TODO: Support sigles
-
# Zero means: everything runs in the parent process
my $pool = Parallel::ForkManager->new($jobs);
@@ -971,6 +972,9 @@
# Add further annotation archived
$archive->attach($_) foreach @input[1..$#input];
+ # Get sigles to extract
+ my $prefix = set_sigle($archive);
+
print "Start processing ...\n";
$t = Benchmark->new;
my @dirs = $archive->list_texts;
@@ -1056,6 +1060,86 @@
};
+# For an archive, this will create the list
+# of all sigles to process
+sub set_sigle {
+ my $archive = shift;
+
+ my $prefix = 1;
+ my @dirs = ();
+
+ # No sigles given
+ unless (@sigle) {
+
+ # Get files
+ foreach ($archive->list_texts) {
+
+ push @dirs, $_;
+
+ # Split path information
+ ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
+
+ # TODO: Make this OS independent
+ push @sigle, join '/', $corpus, $doc, $text;
+ };
+ }
+
+ # Check sigle for doc sigles
+ else {
+ my @new_sigle;
+
+ my $prefix_check = 0;
+
+ # Iterate over all sigle
+ foreach (@sigle) {
+
+ # Sigle is a doc sigle
+ if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
+
+ print "$_ ...";
+ # Check if a prefix is needed
+ unless ($prefix_check) {
+
+ if ($prefix = $archive->check_prefix) {
+ print " with prefix ...";
+ };
+ $prefix_check = 1;
+ };
+
+ print "\n";
+
+ # TODO: Make this OS independent
+ my $path = ($prefix ? './' : '') . $_;
+
+ print '... ' . (
+ $archive->extract_doc(
+ $path, $output, $sequential_extraction ? 1 : $jobs
+ ) ? '' : 'not '
+ );
+ print "extracted.\n";
+ }
+
+ # Sigle is a text sigle
+ else {
+ push @new_sigle, $_;
+
+ unless ($prefix_check) {
+
+ if ($prefix = $archive->check_prefix) {
+ print " with prefix ...";
+ };
+ $prefix_check = 1;
+ };
+ };
+ };
+ @sigle = @new_sigle;
+ };
+
+ return $prefix;
+};
+
+
+
# Cleanup temporary extraction directory
if ($extract_dir) {
my $objects = remove_tree($extract_dir, { safe => 1 });
@@ -1344,8 +1428,8 @@
Supported parameters are:
C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
-C<output>,
-C<temp-extract>, C<sequential-extraction>,
+C<output>, C<koral>,
+C<tempary-extract>, C<sequential-extraction>,
C<base-sentences>, C<base-paragraphs>,
C<base-pagebreaks>,
C<skip> (semicolon separated), C<sigle>
diff --git a/t/archive.t b/t/archive.t
index 1cd3c00..f86277f 100644
--- a/t/archive.t
+++ b/t/archive.t
@@ -1,6 +1,7 @@
#!/usr/bin/env perl
use strict;
use warnings;
+use Data::Dumper;
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile catdir/;
@@ -52,6 +53,35 @@
$archive = KorAP::XML::Archive->new($file);
ok(!$archive->check_prefix, 'Archive has no dot prefix');
+my @cmd = map { join ' ', @{$_} } $archive->cmds_from_sigle(['REI/RB*', 'REI/BNG/00071']);
+
+like($cmd[0], qr!unzip -qo -uo t/corpus/archive_rei\.zip!);
+like($cmd[0], qr!\QREI/header.xml REI/RB*/header.xml REI/RB* REI/BNG/header.xml REI/BNG/00071/*\E!);
+ok(!$cmd[1]);
+
+# New temporary directory
+$dir = tempdir(CLEANUP => 1);
+
+{
+ local $SIG{__WARN__} = sub {};
+ ok($archive->extract_sigle(['REI/RB*', 'REI/BNG/00071'], $dir), 'Fine');
+};
+
+ok(-d catdir($dir, 'REI'), 'Test corpus directory exists');
+ok(-d catdir($dir, 'REI','BNG'), 'Test corpus directory exists');
+ok(-d catdir($dir, 'REI','BNG','00071'), 'Test corpus directory exists');
+
+ok(-f catdir($dir, 'REI', 'header.xml'), 'Test corpus directory exists');
+ok(-f catdir($dir, 'REI','BNG', 'header.xml'), 'Test corpus directory exists');
+ok(-f catdir($dir, 'REI','BNG','00071', 'header.xml'), 'Test corpus directory exists');
+
+ok(-f catdir($dir, 'REI','RBR', 'header.xml'), 'Test corpus directory exists');
+ok(-f catdir($dir, 'REI','RBR','00610', 'header.xml'), 'Test corpus directory exists');
+ok(-f catdir($dir, 'REI','RBR','00610', 'header.xml'), 'Test corpus directory exists');
+
+ok(!-e catdir($dir, 'REI','BNG','00128'), 'Test corpus directory does not exist');
+
+
done_testing;
__END__
diff --git a/t/script/archive.t b/t/script/archive.t
index 8d0fba1..290784b 100644
--- a/t/script/archive.t
+++ b/t/script/archive.t
@@ -38,7 +38,6 @@
my $input = catfile($f, '..', 'corpus', 'archive.zip');
ok(-f $input, 'Input archive found');
-
my $output = File::Temp->newdir(CLEANUP => 0);
$output->unlink_on_destroy(0);
@@ -221,5 +220,38 @@
$call
);
+
+
+# Test with sigles
+$input = catfile($f, '..', 'corpus', 'archive.zip');
+ok(-f $input, 'Input archive found');
+
+unlink($output);
+
+$call = join(
+ ' ',
+ 'perl', $script,
+ 'archive',
+ '--input' => '' . $input,
+ '--output' => $output,
+ '--sigle' => 'TEST/BSP/2',
+ '--sigle' => 'TEST/BSP/5',
+ '-t' => 'Base#tokens_aggr',
+ '-m' => 'Sgbr'
+);
+
+{
+ local $SIG{__WARN__} = sub {};
+ my $out = stdout_from(sub { system($call); });
+
+ like($out, qr!TEST-BSP-1\.json!s, $call);
+
+ $out =~ m!Processed (.+?\.json)!;
+ $json = $1;
+};
+
+ok(-f $json, 'Json file exists');
+
+
done_testing;
__END__
diff --git a/t/script/extract.t b/t/script/extract.t
index e9f666f..11d7cf3 100644
--- a/t/script/extract.t
+++ b/t/script/extract.t
@@ -167,7 +167,7 @@
sub {
system($call);
},
- qr!Extract .+? REI/BN\*!s,
+ qr!Extract .+? REI/BN!s,
$call
);