Treat prefixes correctly for text sigles
Change-Id: I710912f4681f16d866410ead0e050afe55c61b68
diff --git a/Changes b/Changes
index ced2a3d..b7de7f6 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.25 2017-02-16
+0.25 2017-02-17
- Updated to Mojolicious 7.20
- Fixed meta treatment in case analytic and monogr
are available
@@ -7,6 +7,7 @@
compliant with CoRoLa.
- Added support for pagebreak annotations.
- Renamed "pages" to "srcPages".
+ - Fixed handling of prefixes for text sigles.
0.24 2016-12-21
- Added --base-sentences and --base-paragraphs options
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index b1eb900..831d257 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -1,5 +1,6 @@
package KorAP::XML::Archive;
use Carp qw/carp/;
+use Mojo::Util qw/quote/;
use File::Spec::Functions qw(rel2abs);
use strict;
use warnings;
@@ -259,6 +260,7 @@
unshift @breadcrumbs, $prefix if ($prefix && $archive->[1]);
if ($first) {
+
# Only extract from first file
push(@cmd, join('/', @breadcrumbs, 'header.xml'));
push(@cmd, join('/', @breadcrumbs, $doc, 'header.xml'));
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 43ac47a..6bdb961 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -391,13 +391,18 @@
# Sigle is a doc sigle
if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
- print "$_ ...\n";
+ print "$_ ...";
# Check if a prefix is needed
unless ($prefix_check) {
- $prefix = $archive->check_prefix;
+
+ if ($prefix = $archive->check_prefix) {
+ print " with prefix ...";
+ };
$prefix_check = 1;
};
+ print "\n";
+
# TODO: Make this OS independent
my $path = ($prefix ? './' : '') . $_;
@@ -408,8 +413,18 @@
);
print "extracted.\n";
}
+
+ # Sigle is a text sigle
else {
push @new_sigle, $_;
+
+ unless ($prefix_check) {
+
+ if ($prefix = $archive->check_prefix) {
+ print " with prefix ...";
+ };
+ $prefix_check = 1;
+ };
};
};
@sigle = @new_sigle;
@@ -417,10 +432,12 @@
# Iterate over all given sigles and extract
foreach (@sigle) {
+
print "$_ ...\n";
# TODO: Make this OS independent
print '... ' . (
+
$archive->extract_text(
($prefix ? './' : '') . $_, $output
) ? '' : 'not '
diff --git a/t/archive.t b/t/archive.t
index 0a17165..1cd3c00 100644
--- a/t/archive.t
+++ b/t/archive.t
@@ -47,8 +47,10 @@
$archive = KorAP::XML::Archive->new($file);
ok(!$archive->check_prefix, 'Archive has no prefix');
-
-# TODO: Test attaching!
+# No leading '.'
+$file = catfile(dirname(__FILE__), 'corpus','archive_rei.zip');
+$archive = KorAP::XML::Archive->new($file);
+ok(!$archive->check_prefix, 'Archive has no dot prefix');
done_testing;
diff --git a/t/corpus/archive_quotes.zip b/t/corpus/archive_quotes.zip
new file mode 100644
index 0000000..2326b17
--- /dev/null
+++ b/t/corpus/archive_quotes.zip
Binary files differ
diff --git a/t/script/extract.t b/t/script/extract.t
index cc43bc8..884cdb5 100644
--- a/t/script/extract.t
+++ b/t/script/extract.t
@@ -215,5 +215,30 @@
ok(-f catfile($output, 'WPD15', 'A00', '00081', 'opennlp', 'morpho.xml'), 'New archive');
+# With quotes:
+# Test with document sigle
+my $input_quotes = catfile($f, '..', 'corpus', 'archive_quotes.zip');
+ok(-f $input, 'Input archive found');
+$output2 = undef;
+$output2 = tempdir(CLEANUP => 1);
+
+$call = join(
+ ' ',
+ 'perl', $script,
+ 'extract',
+ '--input' => $input_quotes,
+ '--output' => $output2,
+ '-sg' => '"TEST/BSP \"Example\"/1"'
+);
+
+# Test with sigle
+stdout_like(
+ sub {
+ system($call);
+ },
+ qr!TEST/BSP "Example"\/1 $sep extracted!s,
+ $call
+);
+
done_testing;
__END__