Added test for sigles support in extract
Change-Id: I5f5596a88da6314f0d1f7e8299fef7425f89a52f
diff --git a/Changes b/Changes
index 499883c..401602e 100644
--- a/Changes
+++ b/Changes
@@ -12,6 +12,7 @@
- Fixed setting multiple annotations in
script.
- Fixed output of version and help messages.
+ - Added extraction test.
0.17 2016-03-22
- Rewrite siglen to use slashes as separators.
diff --git a/MANIFEST b/MANIFEST
index 2496911..856b8f2 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -99,6 +99,7 @@
t/sgbr/token.t
t/script/single.t
t/script/usage.t
+t/script/extract.t
t/corpus/archive.zip
t/corpus/BZK/header.xml
t/corpus/GOE/header.xml
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 250c68d..4539f5d 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -220,20 +220,21 @@
my $cache = Cache::FastMmap->new(
share_file => $cache_file,
cache_size => $cache_size,
- init_file => $cache_init
+ init_file => $cache_init
);
+# Create batch object
my $batch_file = KorAP::XML::Batch::File->new(
- cache => $cache,
+ cache => $cache,
meta_type => $meta,
overwrite => $overwrite,
- foundry => $token_base_foundry,
- layer => $token_base_layer,
- gzip => $gzip,
- log => $log,
- primary => $primary,
- pretty => $pretty,
- anno => \@filtered_anno
+ foundry => $token_base_foundry,
+ layer => $token_base_layer,
+ gzip => $gzip,
+ log => $log,
+ primary => $primary,
+ pretty => $pretty,
+ anno => \@filtered_anno
);
@@ -313,13 +314,10 @@
# Extract XML files
elsif ($cmd eq 'extract') {
-warn '!!!!!!!!!!!!!------------> ';
-
-if ($output && (!-e $output || !-d $output)) {
- print "Directory '$output' does not exist.\n\n";
- exit(0);
-};
-
+ if ($output && !-d $output) { # -d is also false for a missing path
+   print "Directory '$output' does not exist.\n\n";
+   exit(0);
+ };
# TODO: Support sigles and full archives
@@ -333,9 +331,24 @@
# Add further annotation archived
$archive->attach($_) foreach @input;
+ # Without explicit sigles, use every text listed by the archive
+ if (!@sigle) {
+
+   # Each listed text path yields one sigle
+   for my $text_path ($archive->list_texts) {
+
+     # The leading prefix element of the split path is not needed
+     my (undef, $corpus, $doc, $text) = $archive->split_path($text_path);
+
+     # TODO: Make this OS independent
+     push(@sigle, join('/', $corpus, $doc, $text));
+   };
+ };
+
# Iterate over all given sigles and extract
foreach (@sigle) {
print "$_ ";
+ # TODO: Make this OS independent
print '' . ($archive->extract('./' . $_, $output) ? '' : 'not ');
print "extracted.\n";
};
diff --git a/t/script/extract.t b/t/script/extract.t
new file mode 100644
index 0000000..4429a08
--- /dev/null
+++ b/t/script/extract.t
@@ -0,0 +1,113 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/tempdir/;
+use Mojo::Util qw/slurp/;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+use Test::Output;
+use Data::Dumper;
+use utf8;
+
+# Resolve the script under test relative to this test file
+my $f      = dirname(__FILE__);
+my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
+
+# Command line invoking the bare 'extract' subcommand
+my $call = join ' ', perl => $script, 'extract';
+
+# Called without parameters, the script is expected to print
+# the help text of the 'extract' command to STDOUT
+my $usage_re = qr!extract.+?Extract KorAP-XML files!s;
+stdout_like(
+  sub {
+    system($call);
+  },
+  $usage_re,
+  $call
+);
+
+# Archive fixture serving as input
+my $input = catfile($f, '..', 'corpus', 'archive.zip');
+ok(-f $input, 'Input archive found');
+
+# Temporary output directory (CLEANUP removes it when the test exits)
+my $output = tempdir(CLEANUP => 1);
+ok(-d $output, 'Output directory exists');
+
+# Command line extracting the archive without any sigle
+$call = join ' ', (
+  perl       => $script,
+  'extract',
+  '--input'  => $input,
+  '--output' => $output
+);
+
+# The first text has to be reported as extracted
+stdout_like(
+  sub { system($call) },
+  qr!TEST/BSP/1 extracted.!s,
+  $call
+);
+
+# Text 1 is checked for its sub directories and XML files
+my @text_1 = ($output, 'TEST', 'BSP', '1');
+ok(-d catdir(@text_1), 'Directory created');
+ok(-d catdir(@text_1, $_), 'Directory created') for qw/base sgbr struct/;
+ok(-f catfile(@text_1, $_), 'File created') for qw/data.xml header.xml/;
+
+# Texts 2 and 3 are only checked for their directories
+ok(-d catdir($output, 'TEST', 'BSP', $_), 'Directory created') for qw/2 3/;
+
+# Check extraction restricted to a single sigle
+my $output2 = tempdir(CLEANUP => 1);
+ok(-d $output2, 'Output directory exists');
+
+$call = join(
+  ' ',
+  'perl', $script,
+  'extract',
+  '--input' => $input,
+  '--output' => $output2,
+  '-sg' => 'TEST/BSP/4'
+);
+
+# The requested sigle has to be reported as extracted
+stdout_like(
+  sub {
+    system($call);
+  },
+  qr!TEST/BSP/4 extracted\.!s,
+  $call
+);
+
+# Sigles that were not requested must not be reported
+stdout_unlike(
+  sub {
+    system($call);
+  },
+  qr!TEST/BSP/5 extracted\.!s,
+  $call
+);
+
+ok(!-d catdir($output2, 'TEST', 'BSP', '1'), 'Directory not created');
+ok(!-d catdir($output2, 'TEST', 'BSP', '2'), 'Directory not created');
+ok(!-d catdir($output2, 'TEST', 'BSP', '3'), 'Directory not created');
+ok(-d catdir($output2, 'TEST', 'BSP', '4'), 'Directory created');
+ok(!-d catdir($output2, 'TEST', 'BSP', '5'), 'Directory not created');
+
+
+done_testing;
+__END__
+
+
+
+
+
+# Test sigle!
+# Test multiple archives
+
+
diff --git a/t/script/single.t b/t/script/single.t
index a05e5e9..9d8d28f 100644
--- a/t/script/single.t
+++ b/t/script/single.t
@@ -203,8 +203,6 @@
is($json->{keywords}, 'sgbrKodex:T', 'keywords');
is($json->{publisher}, 'Dorfblatt GmbH', 'publisher');
-# Test sigle!
-
done_testing;
__END__