Optimize performance of text listing
Change-Id: Id89f0bd6b73ecbc40ea98e3cd7da3a97b8af2baa
diff --git a/Changes b/Changes
index 5a672d7..d370457 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,7 @@
0.17 2016-03-22
- Rewrite siglen to use slashes as separators.
+ - Zip listing optimized. No longer works with primary data
+ in text.xml files.
0.16 2016-03-18
- Added caching mechanism for
diff --git a/MANIFEST b/MANIFEST
index cce10c3..a1376f6 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -80,6 +80,7 @@
t/annotation/xip_sentences.t
t/annotation/koralquery.t
t/benchmark/parse_document.t
+t/benchmark/list_archive_texts.t
t/real/bzk.t
t/real/bzk_2.t
t/real/goethe.t
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index 3090155..e44741d 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -33,20 +33,18 @@
# List all text paths contained in the file
sub list_texts {
my $self = shift;
- my $file = $$self;
- my %texts;
- foreach (`unzip -l $file *.xml`) {
- if ($_ =~ m![\t\s]
- ((?:\./)?
- [^\t\s/\.]+?/ # Corpus
- [^\t\s/]+?/ # Document
- [^\t\s/]+? # Text
- )/(?:[^/]+?)\.xml$!x) {
- $texts{$1} = 1;
+ my @texts;
+ foreach (`unzip -l -UU -qq $$self "*/data.xml"`) {
+ if (m![\t\s]
+ ((?:\./)?
+ [^\t\s/\.]+?/ # Corpus
+ [^\t\s/]+?/ # Document
+ [^\t\s/]+? # Text
+ )/data\.xml$!x) {
+ push @texts, $1;
};
};
-
- return sort {$a cmp $b} keys %texts;
+ return @texts;
};
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 99dd333..5cdacc4 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -345,8 +345,6 @@
exit(1);
};
- # Test will be skipped
-
# Iterate over all given sigles and extract
foreach (@sigle) {
print "$_ ";
@@ -451,11 +449,6 @@
exit(1);
};
- unless ($archive->test) {
- print "Zip archive not compatible.\n\n";
- exit(1);
- };
-
print "Start processing ...\n";
$t = Benchmark->new;
my @dirs = $archive->list_texts;
diff --git a/t/archive.t b/t/archive.t
index 4a729b6..22ad61f 100644
--- a/t/archive.t
+++ b/t/archive.t
@@ -21,7 +21,7 @@
my @list = $archive->list_texts;
is(scalar @list, 10, 'Found all tests');
is($list[0], './TEST/BSP/1', 'First document');
-is($list[-1], './TEST/BSP/9', 'First document');
+is($list[-1], './TEST/BSP/10', 'First document');
my @path = $archive->split_path('./TEST/BSP/9');
is($path[0],'.', 'Prefix');