Optimize performance of text listing

Change-Id: Id89f0bd6b73ecbc40ea98e3cd7da3a97b8af2baa
diff --git a/Changes b/Changes
index 5a672d7..d370457 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,7 @@
 0.17 2016-03-22
         - Rewrite siglen to use slashes as separators.
+	- Zip listing optimized. No longer works with primary data
+	  in text.xml files.
 
 0.16 2016-03-18
         - Added caching mechanism for
diff --git a/MANIFEST b/MANIFEST
index cce10c3..a1376f6 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -80,6 +80,7 @@
 t/annotation/xip_sentences.t
 t/annotation/koralquery.t
 t/benchmark/parse_document.t
+t/benchmark/list_archive_texts.t
 t/real/bzk.t
 t/real/bzk_2.t
 t/real/goethe.t
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index 3090155..e44741d 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -33,20 +33,18 @@
 # List all text paths contained in the file
 sub list_texts {
   my $self = shift;
-  my $file = $$self;
-  my %texts;
-  foreach (`unzip -l $file *.xml`) {
-    if ($_ =~ m![\t\s]
-		((?:\./)?
-		  [^\t\s/\.]+?/ # Corpus
-		  [^\t\s/]+?/   # Document
-		  [^\t\s/]+?    # Text
-		)/(?:[^/]+?)\.xml$!x) {
-      $texts{$1} = 1;
+  my @texts;
+  foreach (`unzip -l -UU -qq $$self "*/data.xml"`) {
+    if (m![\t\s]
+      ((?:\./)?
+	[^\t\s/\.]+?/ # Corpus
+	[^\t\s/]+?/   # Document
+	[^\t\s/]+?    # Text
+      )/data\.xml$!x) {
+      push @texts, $1;
     };
   };
-
-  return sort {$a cmp $b} keys %texts;
+  return @texts;
 };
 
 
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 99dd333..5cdacc4 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -345,8 +345,6 @@
       exit(1);
     };
 
-    # Test will be skipped
-
     # Iterate over all given sigles and extract
     foreach (@sigle) {
       print "$_ ";
@@ -451,11 +449,6 @@
       exit(1);
     };
 
-    unless ($archive->test) {
-      print "Zip archive not compatible.\n\n";
-      exit(1);
-    };
-
     print "Start processing ...\n";
     $t = Benchmark->new;
     my @dirs = $archive->list_texts;
diff --git a/t/archive.t b/t/archive.t
index 4a729b6..22ad61f 100644
--- a/t/archive.t
+++ b/t/archive.t
@@ -21,7 +21,7 @@
 my @list = $archive->list_texts;
 is(scalar @list, 10, 'Found all tests');
 is($list[0], './TEST/BSP/1', 'First document');
-is($list[-1], './TEST/BSP/9', 'First document');
+is($list[-1], './TEST/BSP/10', 'First document');
 
 my @path = $archive->split_path('./TEST/BSP/9');
 is($path[0],'.', 'Prefix');