Use iterators instead of file lists
Change-Id: I49ea1075939d8e0d042d15556ea16db9359d3aff
diff --git a/.gitignore b/.gitignore
index 28b98b2..f57405c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,3 +40,5 @@
t/real/corpus/AGD-orig
t/real/corpus/FOLK-orig
t/real/dck.t
+t/real/corpus/TWI21
+t/real/corpus/WPD15
\ No newline at end of file
diff --git a/Changes b/Changes
index 58ed8a7..f37c02d 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,6 @@
0.62 2025-07-15
- Remove lock from tar builder.
+ - Don't create a list of files (that is passed to forks).
0.61 2025-04-30
- Support certainty in OpenNLP/Morpho.
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index 0fc438d..7e46f79 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -78,6 +78,42 @@
return @texts;
};
+# Create an iterator for text paths.
+#
+# Runs `unzip -l` on the archive file stored in $self->[0]->[0] and
+# returns a closure. Each call to the closure yields the next path that
+# contains a data.xml (without the trailing "/data.xml"), or undef as
+# soon as the listing is exhausted (and on every call after that).
+# Dies if the unzip process can't be started.
+sub list_texts_iterator {
+  my $self = shift;
+  my $file = $self->[0]->[0];
+
+  # Open pipe to unzip command. The list form bypasses the shell, so
+  # the archive name reaches unzip verbatim - file names with spaces
+  # or shell metacharacters can neither break nor inject commands.
+  # The wildcard pattern is expanded by unzip itself.
+  open(my $unzip, '-|', 'unzip', '-l', '-UU', '-qq', $file, '*/data.xml')
+    or die "Failed to run unzip: $!";
+
+  return sub {
+
+    # Listing already exhausted and pipe closed by an earlier call
+    return undef unless $unzip;
+
+    while (my $line = <$unzip>) {
+      if ($line =~ m![\t\s]
+                      ((?:\./)?
+                        [^\s\t/\.]+?/ # Corpus
+                        [^\/]+?/      # Document
+                        [^/]+?        # Text
+                      )/data\.xml$!x) {
+        return $1; # Return next path
+      }
+    }
+
+    # Close the pipe exactly once and drop the handle, so further
+    # calls don't read from (or close) a closed filehandle
+    close($unzip);
+    $unzip = undef;
+    return undef; # No more paths
+  };
+}
+
+# Count the texts of the archive by draining the path iterator,
+# so the paths themselves are never collected in a list
+sub count_texts {
+  my $self = shift;
+  my $next_path = $self->list_texts_iterator;
+  my $total = 0;
+
+  # The iterator signals the end of the listing with undef
+  $total++ while defined $next_path->();
+  return $total;
+};
+
# Check, if the archive has a prefix
sub check_prefix {
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 487368f..f2ebe85 100755
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -884,8 +884,24 @@
# Input is a directory
if (-d $input[0]) {
- my @dirs;
+ # First pass: count files
+ my $rule_count = Path::Iterator::Rule->new;
+ $rule_count->name('data.xml')->file;
+ my $count_iter = $rule_count->iter(
+ $input[0] => {
+ sorted => 0,
+ depthfirst => -1,
+ error_handler => undef
+ });
+ $count = 0;
+ while (defined(my $file = $count_iter->())) {
+ $count++;
+ };
+ print "Start processing ...\n" unless $q;
+ $t = Benchmark->new;
+
+ # Second pass: process files using iterator
my $rule = Path::Iterator::Rule->new;
$rule->name('data.xml')->file;
my $next = $rule->iter(
@@ -894,27 +910,21 @@
depthfirst => -1,
error_handler => undef
});
- while (defined(my $file = $next->())) {
- $file =~ s/\/data\.xml$//;
- push @dirs, $file;
- };
-
- print "Start processing ...\n" unless $q;
- $t = Benchmark->new;
- $count = scalar @dirs;
DIRECTORY_LOOP:
- for (my $i = 0; $i < $count; $i++) {
+ while (defined(my $file = $next->())) {
+ # Remove data.xml suffix to get directory path
+ $file =~ s/\/data\.xml$//;
my $filename = catfile(
$output_dir,
- get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
+ get_file_name($input[0], $file) . '.json' . ($gzip ? '.gz' : '')
);
# Get the next fork
$pool->start and next DIRECTORY_LOOP;
- if (my $return = $batch_file->process($dirs[$i] => $filename)) {
+ if (my $return = $batch_file->process($file => $filename)) {
$pool->finish(
0,
[
@@ -925,7 +935,7 @@
);
}
else {
- $pool->finish(1, ["Unable to process " . $dirs[$i]]);
+ $pool->finish(1, ["Unable to process " . $file]);
};
};
}
@@ -946,14 +956,18 @@
print "Start processing ...\n" unless $q;
$t = Benchmark->new;
- my @dirs = $archive->list_texts;
- $count = scalar @dirs;
-
- ARCHIVE_LOOP:
- for (my $i = 0; $i < $count; $i++) {
-
+
+ # Get count of texts
+ $count = $archive->count_texts;
+
+ # Get iterator for text paths
+ my $text_iter = $archive->list_texts_iterator;
+
+ # Process texts one at a time using the iterator
+ ARCHIVE_LOOP:
+ while (defined(my $text_path = $text_iter->())) {
# Split path information
- my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
+ my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);
my $filename = catfile(
$output_dir,
@@ -1003,7 +1017,7 @@
# Unable to extract
else {
- $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
+ $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
};
};
}
diff --git a/t/archive.t b/t/archive.t
index 77026d7..adf7559 100644
--- a/t/archive.t
+++ b/t/archive.t
@@ -27,6 +27,18 @@
is($list[0], './TEST/BSP/1', 'First document');
is($list[-1], './TEST/BSP/10', 'First document');
+# Test list_texts_iterator
+my $iter = $archive->list_texts_iterator;
+ok($iter, 'Iterator created');
+my @iter_list;
+while (defined(my $path = $iter->())) {
+ push @iter_list, $path;
+}
+is_deeply(\@iter_list, \@list, 'Iterator returns same paths as list_texts');
+
+# Test count_texts
+is($archive->count_texts, 10, 'count_texts returns correct number');
+
my @path = $archive->split_path('./TEST/BSP/9');
is($path[0],'.', 'Prefix');
is($path[1],'TEST', 'Prefix');
diff --git a/t/multiple_archives.t b/t/multiple_archives.t
index e57bf1e..643ab81 100644
--- a/t/multiple_archives.t
+++ b/t/multiple_archives.t
@@ -47,6 +47,18 @@
is(scalar @list, 1, 'Found all tests');
is($list[0], 'WPD15/A00/00081', 'First document');
+# Test list_texts_iterator
+my $iter = $archive->list_texts_iterator;
+ok($iter, 'Iterator created');
+my @iter_list;
+while (defined(my $path = $iter->())) {
+ push @iter_list, $path;
+}
+is_deeply(\@iter_list, \@list, 'Iterator returns same paths as list_texts');
+
+# Test count_texts
+is($archive->count_texts, 1, 'count_texts returns correct number');
+
ok($archive->test, 'Test all archives');
# Split path