Test multiple archives
Change-Id: I48bcd02265c99e83f85dd3dac6ecb386de881fe6
diff --git a/MANIFEST b/MANIFEST
index 02a7513..dcf891b 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -46,6 +46,7 @@
lib/KorAP/XML/Annotation/XIP/Sentences.pm
t/archive.t
t/meta.t
+t/multiple_archives.t
t/primary.t
t/range.t
t/sort_tokens.t
@@ -120,6 +121,13 @@
t/corpus/A01/13047/header.xml
t/corpus/A01/13047/metadata.xml
t/corpus/A01/13047/text.txt
+t/corpus/archives/fake.zip
+t/corpus/archives/wpd15-single.corenlp.zip
+t/corpus/archives/wpd15-single.malt.zip
+t/corpus/archives/wpd15-single.mdparser.zip
+t/corpus/archives/wpd15-single.opennlp.zip
+t/corpus/archives/wpd15-single.tree_tagger.zip
+t/corpus/archives/wpd15-single.zip
t/corpus/BZK/D59/header.xml
t/corpus/ERL/00001/data.xml
t/corpus/ERL/00001/header.xml
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index 1dd7798..56e0622 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -4,7 +4,7 @@
use strict;
use warnings;
-# Convert new archive helper
+# Construct new archive helper
sub new {
my $class = shift;
my @file;
@@ -137,8 +137,8 @@
my $first = 1;
my @init_cmd = (
- 'unzip', # Use unzip program
- '-qo', # quietly overwrite all existing files
+ 'unzip', # Use unzip program
+ '-qo', # quietly overwrite all existing files
'-d', $target_dir # Extract into target directory
);
@@ -153,20 +153,22 @@
# Add some interesting files for extraction
# Can't use catfile(), as this removes the '.' prefix
+ my @breadcrumbs = ($corpus);
+
+ # If the prefix is not forbidden - prefix!
+ unshift @breadcrumbs, $prefix if ($prefix && $archive->[1]);
+
if ($first) {
# Only extract from first file
- push(@cmd, join('/', $prefix, $corpus, 'header.xml'));
- push(@cmd, join('/', $prefix, $corpus, $doc, 'header.xml'));
+ push(@cmd, join('/', @breadcrumbs, 'header.xml'));
+ push(@cmd, join('/', @breadcrumbs, $doc, 'header.xml'));
$first = 0;
};
# With prefix
- my @path = ($corpus, $doc, $text, '*');
+ push @breadcrumbs, $doc, $text, '*';
- # If the prefix is not forbidden - prefix!
- unshift @path, $prefix if $archive->[1];
-
- push(@cmd, join('/', @path));
+ push(@cmd, join('/', @breadcrumbs));
# Run system call
system(@cmd);
diff --git a/script/korapxml2krill b/script/korapxml2krill
index cde5e37..7e729e0 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -604,17 +604,16 @@
Directory or archive file of documents to convert.
-Multiple input archives are supported for archiving,
-with the constraint,
+Archiving supports multiple input archives with the constraint
that the first archive listed contains all primary data files
and all meta data files.
-The directory structure follows the base directory format,
-starting with a C<.> root folder.
-In case an attached archive has no C<.> root folder,
-the archive path should start with a hash.
-i file/news.zip -i file/news.malt.zip -i #file/news.tt.zip
+(The directory structure follows the base directory format,
+which may include a C<.> root folder.
+In that case, further archives lacking a C<.> root folder
+need to be passed with a hash sign in front of the archive's name.)
=item B<--output|-o> <directory|file>
diff --git a/t/corpus/archives/fake.zip b/t/corpus/archives/fake.zip
new file mode 100644
index 0000000..acbc962
--- /dev/null
+++ b/t/corpus/archives/fake.zip
@@ -0,0 +1 @@
+fake
diff --git a/t/corpus/archives/wpd15-single.corenlp.zip b/t/corpus/archives/wpd15-single.corenlp.zip
new file mode 100644
index 0000000..f4cfd4e
--- /dev/null
+++ b/t/corpus/archives/wpd15-single.corenlp.zip
Binary files differ
diff --git a/t/corpus/archives/wpd15-single.malt.zip b/t/corpus/archives/wpd15-single.malt.zip
new file mode 100644
index 0000000..43a5add
--- /dev/null
+++ b/t/corpus/archives/wpd15-single.malt.zip
Binary files differ
diff --git a/t/corpus/archives/wpd15-single.mdparser.zip b/t/corpus/archives/wpd15-single.mdparser.zip
new file mode 100644
index 0000000..8897f7f
--- /dev/null
+++ b/t/corpus/archives/wpd15-single.mdparser.zip
Binary files differ
diff --git a/t/corpus/archives/wpd15-single.opennlp.zip b/t/corpus/archives/wpd15-single.opennlp.zip
new file mode 100644
index 0000000..c10b384
--- /dev/null
+++ b/t/corpus/archives/wpd15-single.opennlp.zip
Binary files differ
diff --git a/t/corpus/archives/wpd15-single.tree_tagger.zip b/t/corpus/archives/wpd15-single.tree_tagger.zip
new file mode 100644
index 0000000..372e15e
--- /dev/null
+++ b/t/corpus/archives/wpd15-single.tree_tagger.zip
Binary files differ
diff --git a/t/corpus/archives/wpd15-single.zip b/t/corpus/archives/wpd15-single.zip
new file mode 100644
index 0000000..89d0426
--- /dev/null
+++ b/t/corpus/archives/wpd15-single.zip
Binary files differ
diff --git a/t/multiple_archives.t b/t/multiple_archives.t
new file mode 100644
index 0000000..fbe14e3
--- /dev/null
+++ b/t/multiple_archives.t
@@ -0,0 +1,110 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catfile catdir/;
+use File::Temp qw/tempdir/;
+
+# Check that several zip archives can be attached to a single
+# KorAP::XML::Archive object and extracted into one directory tree.
+
+# Load with a plain "use" instead of use_ok(): use_ok() already emits
+# a test result, but "plan skip_all" below has to be issued before
+# any test was run to yield a valid plan.
+use KorAP::XML::Archive;
+
+my $name = 'wpd15-single';
+my @archive_dir = (dirname(__FILE__), 'corpus', 'archives');
+
+# Archive the further archives get attached to
+my $file = catfile(@archive_dir, $name . '.zip');
+my $archive = KorAP::XML::Archive->new($file);
+
+# Skip the whole script if the unzip check fails
+unless ($archive->test_unzip) {
+  plan skip_all => 'unzip not found';
+};
+
+ok($archive->test, 'Test base archive');
+
+like($archive->path(0), qr/wpd15-single\.zip$/, 'Archive path');
+
+ok($archive->attach(catfile(@archive_dir, 'fake.zip')), 'Attach fake archive');
+
+# The fake archive is not a valid zip file, so the test has to fail
+ok(!$archive->test, 'Test fails with fake archive attached');
+
+# Recreate archive object without the fake archive
+$archive = KorAP::XML::Archive->new($file);
+
+# Test again
+ok($archive->test, 'Test recreated base archive');
+
+my @list = $archive->list_texts;
+is(scalar @list, 1, 'Found all texts in base archive');
+
+# Attach further archives
+foreach my $suffix (qw/corenlp malt mdparser opennlp tree_tagger/) {
+  my $attachment = catfile(@archive_dir, "$name.$suffix.zip");
+  ok($archive->attach($attachment), "Add $suffix");
+};
+
+# The list of texts is expected to stay the same
+@list = $archive->list_texts;
+is(scalar @list, 1, 'Found all texts with archives attached');
+is($list[0], 'WPD15/A00/00081', 'First text');
+
+ok($archive->test, 'Test all archives');
+
+# Split path into prefix, corpus, document and text
+my @parts = $archive->split_path($list[0]);
+is($parts[0], '', 'Prefix');
+is($parts[1], 'WPD15', 'Corpus');
+is($parts[2], 'A00', 'Document');
+is($parts[3], '00081', 'Text');
+
+# Extract everything to temporary directory
+my $dir = tempdir(CLEANUP => 1);
+{
+  # Suppress warnings raised during extraction
+  local $SIG{__WARN__} = sub {};
+  ok($archive->extract($list[0], $dir), 'Extract text from all archives');
+};
+
+my @corpus = ('WPD15');
+my @doc = (@corpus, 'A00');
+my @text = (@doc, '00081');
+
+ok(-d catdir($dir, @corpus), 'Test corpus directory exists');
+ok(-f catfile($dir, @corpus, 'header.xml'), 'Test corpus header exists');
+ok(-d catdir($dir, @doc), 'Test doc directory exists');
+ok(-f catfile($dir, @doc, 'header.xml'), 'Test doc header exists');
+ok(-d catdir($dir, @text), 'Test text directory exists');
+ok(-f catfile($dir, @text, 'header.xml'), 'Test text header exists');
+
+ok(-f catfile($dir, @text, 'data.xml'), 'Test primary data exists');
+
+# Annotation files expected in the subdirectories of the text directory
+my @annotations = (
+  [base        => qw/paragraph sentences tokens tokens_aggr tokens_conservative/],
+  [struct      => qw/structure/],
+  [corenlp     => qw/constituency metadata morpho sentences tokens/],
+  [malt        => qw/dependency metadata/],
+  [mdparser    => qw/dependency metadata/],
+  [opennlp     => qw/metadata morpho sentences tokens/],
+  [tree_tagger => qw/metadata morpho sentences tokens/]
+);
+
+foreach my $annotation (@annotations) {
+  my ($folder, @files) = @$annotation;
+  foreach my $basename (@files) {
+    my $annotation_file = catfile($dir, @text, $folder, "$basename.xml");
+    ok(-f $annotation_file, "Annotation data exists: $folder/$basename.xml");
+  };
+};
+
+done_testing;
+__END__
+
+