Fix extraction of multiple archives
Change-Id: I9688500baf95b0324bf9ef77647ec8b1716135d3
diff --git a/Changes b/Changes
index 401602e..6ea6c3d 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.18 2016-08-15
+0.18 2016-08-16
- Added REI test.
- Added multiple archive support to korapxml2krill.
- Added support for prefix negation in korapxml2krill.
@@ -13,6 +13,8 @@
script.
- Fixed output of version and help messages.
- Added extraction test.
+ - Fixed extraction with multiple archives and prefix
+ negation support.
0.17 2016-03-22
- Rewrite siglen to use slashes as separators.
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 4539f5d..65bc89a 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -320,7 +320,6 @@
};
# TODO: Support sigles and full archives
-
if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
unless ($archive->test_unzip) {
@@ -331,6 +330,8 @@
# Add further annotation archived
$archive->attach($_) foreach @input;
+ my $prefix = 1;
+
# No sigles given
unless (@sigle) {
@@ -338,7 +339,7 @@
foreach ($archive->list_texts) {
# Split path information
- my ($prefix, $corpus, $doc, $text) = $archive->split_path($_);
+ ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
# TODO: Make this OS independent
push @sigle, join '/', $corpus, $doc, $text;
@@ -349,7 +350,11 @@
foreach (@sigle) {
print "$_ ";
# TODO: Make this OS independent
- print '' . ($archive->extract('./' . $_, $output) ? '' : 'not ');
+ print '' . (
+ $archive->extract(
+ ($prefix ? './' : '') . $_, $output
+ ) ? '' : 'not '
+ );
print "extracted.\n";
};
@@ -379,7 +384,7 @@
exit(0);
};
- # Zero means: everything runs in the parent process
+# Zero means: everything runs in the parent process
my $pool = Parallel::ForkManager->new($jobs);
my $count = 0; # Texts to process
@@ -391,9 +396,9 @@
my ($pid, $code) = shift;
my $data = pop;
print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
- ($iter++) . "/$count]" .
- ($code ? " $code" : '') .
- " $$data\n";
+ ($iter++) . "/$count]" .
+ ($code ? " $code" : '') .
+ " $$data\n";
}
);
@@ -594,6 +599,8 @@
In this case further archives lacking a C<.> root folder
need to be passed with a hash sign in front of the archive's name.)
+B<The root folder switch is experimental and may vanish in future versions.>
+
=item B<--output|-o> <directory|file>
Output folder for archive processing or
diff --git a/t/script/extract.t b/t/script/extract.t
index 4429a08..2ea1e13 100644
--- a/t/script/extract.t
+++ b/t/script/extract.t
@@ -99,15 +99,36 @@
ok(-d catdir($output2, 'TEST', 'BSP', '4'), 'Directory created');
ok(!-d catdir($output2, 'TEST', 'BSP', '5'), 'Directory created');
+# Check multiple archives
+$output = tempdir(CLEANUP => 1);
+ok(-d $output, 'Output directory exists');
+
+$call = join(
+ ' ',
+ 'perl', $script,
+ 'extract',
+ '-i' => catfile($f, '..', 'corpus', 'archives', 'wpd15-single.zip'),
+ '-i' => catfile($f, '..', 'corpus', 'archives', 'wpd15-single.tree_tagger.zip'),
+ '-i' => catfile($f, '..', 'corpus', 'archives', 'wpd15-single.opennlp.zip'),
+ '--output' => $output
+);
+
+# Test without sigles (all texts found in the archives are extracted)
+stdout_like(
+ sub {
+ system($call);
+ },
+ qr!WPD15/A00/00081 extracted\.!s, # escaped dot: match the literal full stop printed by the script
+ $call
+);
+
+ok(-d catdir($output, 'WPD15', 'A00', '00081'), 'Directory created');
+ok(-f catfile($output, 'WPD15', 'A00', 'header.xml'), 'Header file created');
+ok(-d catdir($output, 'WPD15', 'A00', '00081', 'base'), 'Directory created');
+
+ok(-f catfile($output, 'WPD15', 'A00', '00081', 'tree_tagger', 'morpho.xml'), 'New archive');
+ok(-f catfile($output, 'WPD15', 'A00', '00081', 'opennlp', 'morpho.xml'), 'New archive');
+
done_testing;
__END__
-
-
-
-
-
-# Test sigle!
-# Test multiple archives
-
-