Use iterators instead of file lists
Change-Id: I49ea1075939d8e0d042d15556ea16db9359d3aff
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index 0fc438d..7e46f79 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -78,6 +78,42 @@
return @texts;
};
+# Create an iterator over the text paths in the archive.
+#
+# The listing produced by `unzip -l` is read line by line from a pipe,
+# so the paths are handed out one at a time instead of being collected
+# into a list first.
+#
+# Returns a code reference. Each call yields the next path of the form
+# "corpus/document/text" (optionally prefixed with "./") for which a
+# data.xml entry is listed, or undef once the listing is exhausted.
+# Calling the iterator again after exhaustion keeps returning undef.
+#
+# Dies if the unzip process cannot be started.
+sub list_texts_iterator {
+  my $self = shift;
+
+  # NOTE(review): presumably the path of the base archive - confirm
+  # against the constructor.
+  my $file = $self->[0]->[0];
+
+  # List-form pipe open: the arguments are passed to unzip directly,
+  # without a shell in between. Archive names containing spaces or shell
+  # metacharacters are therefore passed through verbatim, and the
+  # wildcard pattern is expanded by unzip itself.
+  open(my $unzip, '-|', 'unzip', '-l', '-UU', '-qq', $file, '*/data.xml')
+    or die "Failed to run unzip: $!";
+
+  return sub {
+
+    # The handle is dropped once the listing is exhausted, so further
+    # calls must not touch it again.
+    return undef unless $unzip;
+
+    while (my $line = <$unzip>) {
+      if ($line =~ m![\t\s]
+                      ((?:\./)?
+                        [^\s\t/\.]+?/ # Corpus
+                        [^\/]+?/      # Document
+                        [^/]+?        # Text
+                      )/data\.xml$!x) {
+        return $1; # Return next path
+      }
+    }
+
+    # The exit status is deliberately not checked: the listing is
+    # treated as best effort, as before.
+    close($unzip);
+    undef $unzip;
+
+    # Explicit undef (not a bare return), so the end-of-listing marker
+    # stays a single value in every calling context.
+    return undef; # No more paths
+  };
+}
+
+# Count the texts in the archive without keeping their paths.
+#
+# Drains the iterator returned by list_texts_iterator() and returns the
+# number of defined values it produced.
+sub count_texts {
+  my $self = shift;
+
+  my $next_path = $self->list_texts_iterator;
+  my $total     = 0;
+
+  # Only definedness ends the loop, so false-looking paths such as "0"
+  # are still counted.
+  $total++ while defined $next_path->();
+
+  return $total;
+};
+
# Check, if the archive has a prefix
sub check_prefix {