Added extraction method for documents in archives
Change-Id: Id4ea7d9801a5750c77f81a2251d389adb6e06d31
diff --git a/Changes b/Changes
index f3f575a..b19f50a 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.22 2016-10-26
+ - Added support for document extraction
+
0.21 2016-10-14
- Improved Windows support
diff --git a/MANIFEST b/MANIFEST
index f6e215b..5e4784b 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -102,6 +102,7 @@
t/script/extract.t
t/script/archive.t
t/corpus/archive.zip
+t/corpus/archive_rei.zip
t/corpus/BZK/header.xml
t/corpus/GOE/header.xml
t/corpus/VDI/header.xml
diff --git a/Readme.pod b/Readme.pod
index cd6258f..296cf13 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -10,8 +10,8 @@
=head1 SYNOPSIS
$ korapxml2krill -z --input <directory> --output <filename>
+ $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
$ korapxml2krill archive -z --input <directory|archive> --output <directory>
- $ korapxml2krill extract --input <directory|archive> --output <filename> --sigle <SIGLE>
=head1 DESCRIPTION
@@ -161,10 +161,11 @@
=item B<--sigle|-sg>
-Extract the given text sigles.
+Extract the given texts.
Can be set multiple times.
I<Currently only supported on C<extract>.>
Sigles have the structure C<Corpus>/C<Document>/C<Text>.
+In case the C<Text> path is omitted, the whole document will be extracted.
=item B<--log|-l>
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index 56e0622..17a4f53 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -59,6 +59,16 @@
};
+# Check, if the archive has a prefix
+sub check_prefix {
+ my $self = shift;
+ my $nr = shift // 0;
+ my $file = $self->[$nr]->[0];
+ my ($header) = `unzip -l -UU -qq $file "*/header.xml"`;
+ return $header =~ m![\s\t]\.[/\\]! ? 1 : 0;
+};
+
+
# Split a text path to prefix, corpus, document, text
sub split_path {
my $self = shift;
@@ -76,12 +86,12 @@
};
# Unix form
- if ($text_path =~ m!^([^/]+?)/([^/]+?)/([^/]+?)$!) {
+ if ($text_path =~ m!^([^/]+?)/([^/]+?)[\\/]([^/]+?)$!) {
return ($prefix, $1, $2, $3);
}
# Windows form
- elsif ($text_path =~ m!^([^\\]+?)\\([^\\]+?)\\([^\\]+?)$!) {
+ elsif ($text_path =~ m!^([^\\]+?)\\([^\\]+?)[\\/]([^\\]+?)$!) {
return ($prefix, $1, $2, $3);
};
@@ -127,9 +137,65 @@
};
+# Extract document files to a directory
+sub extract_doc {
+ my $self = shift;
+ my $doc_path = shift;
+ my $target_dir = shift;
-# Extract files to a directory
-sub extract {
+ my $first = 1;
+
+ my @init_cmd = (
+ 'unzip', # Use unzip program
+ '-qo', # quietly overwrite all existing files
+ '-d', $target_dir # Extract into target directory
+ );
+
+ my ($prefix, $corpus, $doc) = $self->split_path($doc_path . '/UNKNOWN' ) or return;
+
+ # Iterate over all attached archives
+ foreach my $archive (@$self) {
+
+ # $_ is the zip
+ my @cmd = @init_cmd;
+ push(@cmd, $archive->[0]); # Extract from zip
+
+ # Add some interesting files for extraction
+ # Can't use catfile(), as this removes the '.' prefix
+ my @breadcrumbs = ($corpus);
+
+ # If the prefix is not forbidden - prefix!
+ unshift @breadcrumbs, $prefix if ($prefix && $archive->[1]);
+
+ if ($first) {
+ # Only extract from first file
+ push(@cmd, join('/', @breadcrumbs, 'header.xml'));
+ # push(@cmd, join('/', @breadcrumbs, $doc, 'header.xml'));
+ $first = 0;
+ };
+
+ # With prefix
+ push @breadcrumbs, $doc, '*';
+
+ push(@cmd, join('/', @breadcrumbs));
+
+ # Run system call
+ system(@cmd);
+
+ # Check for return code
+ if ($? != 0) {
+ carp("System call '" . join(' ', @cmd) . "' errors " . $?);
+ return;
+ };
+ };
+
+ # Fine
+ return 1;
+};
+
+
+# Extract text files to a directory
+sub extract_text {
my $self = shift;
my $text_path = shift;
my $target_dir = shift;
@@ -201,12 +267,14 @@
=head1 attach
+=head1 check_prefix
+
=head1 list_texts
Returns all texts found in the zip file
-=head1 extract
+=head1 extract_text
- $archive->extract('./GOE/AGU/0004', '~/temp');
+ $archive->extract_text('./GOE/AGU/0004', '~/temp');
Extract all files for the named text to a certain directory.
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 6d42dbc..6ba77db 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -15,7 +15,7 @@
use Data::Dumper;
use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
-our $VERSION = '0.21';
+our $VERSION = '0.22';
has 'path';
has [qw/text_sigle doc_sigle corpus_sigle/];
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 2713342..d00ba9e 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -332,6 +332,40 @@
# TODO: Make this OS independent
push @sigle, join '/', $corpus, $doc, $text;
};
+ }
+
+ # Check sigle for doc sigles
+ else {
+ my @new_sigle;
+
+ my $prefix_check = 0;
+
+ # Iterate over all sigle
+ foreach (@sigle) {
+
+ # Sigle is a doc sigle
+ if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
+ print "$_ ";
+
+ # Check if a prefix is needed
+ unless ($prefix_check) {
+ $prefix = $archive->check_prefix;
+ $prefix_check = 1;
+ };
+
+ # TODO: Make this OS independent
+ print '' . (
+ $archive->extract_doc(
+ ($prefix ? './' : '') . $_, $output
+ ) ? '' : 'not '
+ );
+ print "extracted.\n";
+ }
+ else {
+ push @new_sigle, $_;
+ };
+ };
+ @sigle = @new_sigle;
};
# Iterate over all given sigles and extract
@@ -340,7 +374,7 @@
# TODO: Make this OS independent
print '' . (
- $archive->extract(
+ $archive->extract_text(
($prefix ? './' : '') . $_, $output
) ? '' : 'not '
);
@@ -474,7 +508,7 @@
# because extraction can be horrible slow!
# Extract from archive
- if ($archive->extract($dirs[$i], $temp)) {
+ if ($archive->extract_text($dirs[$i], $temp)) {
# Create corpus directory
my $input = catdir("$temp", $corpus);
@@ -533,8 +567,8 @@
=head1 SYNOPSIS
$ korapxml2krill -z --input <directory> --output <filename>
+ $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
$ korapxml2krill archive -z --input <directory|archive> --output <directory>
- $ korapxml2krill extract --input <directory|archive> --output <filename> --sigle <SIGLE>
=head1 DESCRIPTION
@@ -684,10 +718,11 @@
=item B<--sigle|-sg>
-Extract the given text sigles.
+Extract the given texts.
Can be set multiple times.
I<Currently only supported on C<extract>.>
Sigles have the structure C<Corpus>/C<Document>/C<Text>.
+In case the C<Text> path is omitted, the whole document will be extracted.
=item B<--log|-l>
diff --git a/t/annotation/mdp_dependency.t b/t/annotation/mdp_dependency.t
index 30a8708..478bbc1 100644
--- a/t/annotation/mdp_dependency.t
+++ b/t/annotation/mdp_dependency.t
@@ -42,7 +42,7 @@
my $dir = tempdir();
my $f_path = 'WPD15/A00/00081';
-$archive->extract($f_path, $dir);
+$archive->extract_text($f_path, $dir);
ok(my $doc = KorAP::XML::Krill->new( path => $dir . '/' . $f_path));
diff --git a/t/archive.t b/t/archive.t
index 3d40549..0a17165 100644
--- a/t/archive.t
+++ b/t/archive.t
@@ -18,6 +18,8 @@
ok($archive->test, 'Test archive');
like($archive->path(0), qr/archive\.zip$/, 'Archive path');
+ok($archive->check_prefix, 'Archive has dot prefix');
+
my @list = $archive->list_texts;
is(scalar @list, 10, 'Found all tests');
is($list[0], './TEST/BSP/1', 'First document');
@@ -33,7 +35,7 @@
{
local $SIG{__WARN__} = sub {};
- ok($archive->extract('./TEST/BSP/8', $dir), 'Wrong path');
+ ok($archive->extract_text('./TEST/BSP/8', $dir), 'Wrong path');
};
ok(-d catdir($dir, 'TEST'), 'Test corpus directory exists');
@@ -41,6 +43,10 @@
ok(-d catdir($dir, 'TEST', 'BSP'), 'Test doc directory exists');
ok(-f catdir($dir, 'TEST', 'BSP', 'header.xml'), 'Test doc header exists');
+$file = catfile(dirname(__FILE__), 'corpus','archive_rei.zip');
+$archive = KorAP::XML::Archive->new($file);
+ok(!$archive->check_prefix, 'Archive has no prefix');
+
# TODO: Test attaching!
diff --git a/t/corpus/archive_rei.zip b/t/corpus/archive_rei.zip
new file mode 100644
index 0000000..8a00aaa
--- /dev/null
+++ b/t/corpus/archive_rei.zip
Binary files differ
diff --git a/t/multiple_archives.t b/t/multiple_archives.t
index 7865101..1da4e9d 100644
--- a/t/multiple_archives.t
+++ b/t/multiple_archives.t
@@ -60,7 +60,7 @@
my $dir = tempdir(CLEANUP => 1);
{
local $SIG{__WARN__} = sub {};
- ok($archive->extract($list[0], $dir), 'Wrong path');
+ ok($archive->extract_text($list[0], $dir), 'Wrong path');
};
ok(-d catdir($dir, 'WPD15'), 'Test corpus directory exists');
diff --git a/t/script/extract.t b/t/script/extract.t
index 0f8f0b7..6c689f9 100644
--- a/t/script/extract.t
+++ b/t/script/extract.t
@@ -104,6 +104,42 @@
ok(-d catdir($output2, 'TEST', 'BSP', '4'), 'Directory created');
ok(!-d catdir($output2, 'TEST', 'BSP', '5'), 'Directory created');
+
+# Test with document sigle
+my $input_rei = catdir($f, '..', 'corpus', 'archive_rei.zip');
+ok(-f $input_rei, 'Input archive found');
+
+$call = join(
+ ' ',
+ 'perl', $script,
+ 'extract',
+ '--input' => $input_rei,
+ '--output' => $output2,
+ '-sg' => 'REI/BNG'
+);
+
+# Test with sigle
+stdout_like(
+ sub {
+ system($call);
+ },
+ qr!REI/BNG extracted!s,
+ $call
+);
+
+# Test with sigle
+stdout_unlike(
+ sub {
+ system($call);
+ },
+ qr!REI/RBR extracted!s,
+ $call
+);
+
+ok(-d catdir($output2, 'REI', 'BNG', '00071'), 'Directory created');
+ok(-d catdir($output2, 'REI', 'BNG', '00128'), 'Directory created');
+ok(!-d catdir($output2, 'REI', 'RBR', '00610'), 'Directory not created');
+
# Check multiple archives
$output = tempdir(CLEANUP => 1);
ok(-d $output, 'Output directory exists');