Use iterators instead of file lists

Change-Id: I49ea1075939d8e0d042d15556ea16db9359d3aff

commit: 8b03ba54da243810850acdb13816d19b71bbcd4d [log] [tgz]
author: Akron <nils@diewald-online.de> Tue Jul 15 09:16:18 2025 +0200
committer: Akron <nils@diewald-online.de> Wed Jul 16 17:54:16 2025 +0200
tree: 0f85f47caf59a99b968555b4f0073fea801395f3
parent: cb12af7dc8936276ca7a178434a34fcab3106934 [diff]
diff --git a/.gitignore b/.gitignore
index 28b98b2..f57405c 100644
--- a/.gitignore
+++ b/.gitignore

@@ -40,3 +40,5 @@
 t/real/corpus/AGD-orig
 t/real/corpus/FOLK-orig
 t/real/dck.t
+t/real/corpus/TWI21
+t/real/corpus/WPD15
\ No newline at end of file

diff --git a/Changes b/Changes
index 58ed8a7..f37c02d 100644
--- a/Changes
+++ b/Changes

@@ -1,5 +1,6 @@
 0.62 2025-07-15
         - Remove lock from tar builder.
+        - Don't create a list of files (that is passed to forks).
 
 0.61 2025-04-30
         - Support certainty in OpenNLP/Morpho.

diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index 0fc438d..7e46f79 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm

@@ -78,6 +78,42 @@
   return @texts;
 };
 
+# Create an iterator for text paths: each call returns the next path
+# (like "./TEST/BSP/1") or undef when done. Dies if unzip can't be started.
+sub list_texts_iterator {
+  my $self = shift;
+  my $file = $self->[0]->[0];
+  # List-form pipe open bypasses the shell, so $file reaches unzip verbatim
+  # (assumes $file is a raw, not shell-quoted, path - TODO confirm in new())
+  open(my $unzip, '-|', 'unzip', '-l', '-UU', '-qq', $file, '*/data.xml')
+    or die "Failed to run unzip: $!";
+  return sub {
+    return undef unless $unzip;  # Listing was drained and closed before
+    while (my $line = <$unzip>) {
+      if ($line =~ m![\t\s]
+            ((?:\./)?
+              [^\s\t/\.]+?/ # Corpus
+              [^\/]+?/   # Document
+              [^/]+?    # Text
+            )/data\.xml$!x) {
+        return $1;  # Return next path
+      }
+    }
+    close($unzip);
+    return $unzip = undef;  # No more paths; drop the closed handle
+  };
+}
+
+# Count the texts of the archive by draining the path iterator,
+# so that no list of paths has to be kept in memory
+sub count_texts {
+  my $self = shift;
+  my $next_path = $self->list_texts_iterator;
+  my $total = 0;
+  $total++ while defined $next_path->();
+
+  return $total;
+};
+
 
 # Check, if the archive has a prefix
 sub check_prefix {

diff --git a/script/korapxml2krill b/script/korapxml2krill
index 487368f..f2ebe85 100755
--- a/script/korapxml2krill
+++ b/script/korapxml2krill

@@ -884,8 +884,24 @@
   # Input is a directory
   if (-d $input[0]) {
 
-    my @dirs;
+    # First pass: count files
+    my $rule_count = Path::Iterator::Rule->new;
+    $rule_count->name('data.xml')->file;
+    my $count_iter = $rule_count->iter(
+      $input[0] => {
+        sorted => 0,
+        depthfirst => -1,
+        error_handler => undef
+      });
+    $count = 0;
+    while (defined(my $file = $count_iter->())) {
+      $count++;
+    };
 
+    print "Start processing ...\n" unless $q;
+    $t = Benchmark->new;
+
+    # Second pass: process files using iterator
     my $rule = Path::Iterator::Rule->new;
     $rule->name('data.xml')->file;
     my $next = $rule->iter(
@@ -894,27 +910,21 @@
         depthfirst => -1,
         error_handler => undef
       });
-    while (defined(my $file = $next->())) {
-      $file =~ s/\/data\.xml$//;
-      push @dirs, $file;
-    };
-
-    print "Start processing ...\n" unless $q;
-    $t = Benchmark->new;
-    $count = scalar @dirs;
 
   DIRECTORY_LOOP:
-    for (my $i = 0; $i < $count; $i++) {
+    while (defined(my $file = $next->())) {
+      # Remove data.xml suffix to get directory path
+      $file =~ s/\/data\.xml$//;
 
       my $filename = catfile(
         $output_dir,
-        get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
+        get_file_name($input[0], $file) . '.json' . ($gzip ? '.gz' : '')
       );
 
       # Get the next fork
       $pool->start and next DIRECTORY_LOOP;
 
-      if (my $return = $batch_file->process($dirs[$i] => $filename)) {
+      if (my $return = $batch_file->process($file => $filename)) {
         $pool->finish(
           0,
           [
@@ -925,7 +935,7 @@
         );
       }
       else {
-        $pool->finish(1, ["Unable to process " . $dirs[$i]]);
+        $pool->finish(1, ["Unable to process " . $file]);
       };
     };
   }
@@ -946,14 +956,18 @@
 
     print "Start processing ...\n" unless $q;
     $t = Benchmark->new;
-    my @dirs = $archive->list_texts;
-    $count = scalar @dirs;
-
-  ARCHIVE_LOOP:
-    for (my $i = 0; $i < $count; $i++) {
-
+    
+    # Get count of texts
+    $count = $archive->count_texts;
+    
+    # Get iterator for text paths
+    my $text_iter = $archive->list_texts_iterator;
+    
+    # Process texts one at a time using the iterator
+    ARCHIVE_LOOP:
+    while (defined(my $text_path = $text_iter->())) {
       # Split path information
-      my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
+      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);
 
       my $filename = catfile(
         $output_dir,
@@ -1003,7 +1017,7 @@
 
       # Unable to extract
       else {
-        $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
+        $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
       };
     };
   }

diff --git a/t/archive.t b/t/archive.t
index 77026d7..adf7559 100644
--- a/t/archive.t
+++ b/t/archive.t

@@ -27,6 +27,18 @@
 is($list[0], './TEST/BSP/1', 'First document');
 is($list[-1], './TEST/BSP/10', 'First document');
 
+# Test list_texts_iterator
+my $iter = $archive->list_texts_iterator;
+ok($iter, 'Iterator created');
+my @iter_list;
+while (defined(my $path = $iter->())) {
+  push @iter_list, $path;
+}
+is_deeply(\@iter_list, \@list, 'Iterator returns same paths as list_texts');
+
+# Test count_texts
+is($archive->count_texts, 10, 'count_texts returns correct number');
+
 my @path = $archive->split_path('./TEST/BSP/9');
 is($path[0],'.', 'Prefix');
 is($path[1],'TEST', 'Prefix');

diff --git a/t/multiple_archives.t b/t/multiple_archives.t
index e57bf1e..643ab81 100644
--- a/t/multiple_archives.t
+++ b/t/multiple_archives.t

@@ -47,6 +47,18 @@
 is(scalar @list, 1, 'Found all tests');
 is($list[0], 'WPD15/A00/00081', 'First document');
 
+# Test list_texts_iterator
+my $iter = $archive->list_texts_iterator;
+ok($iter, 'Iterator created');
+my @iter_list;
+while (defined(my $path = $iter->())) {
+  push @iter_list, $path;
+}
+is_deeply(\@iter_list, \@list, 'Iterator returns same paths as list_texts');
+
+# Test count_texts
+is($archive->count_texts, 1, 'count_texts returns correct number');
+
 ok($archive->test, 'Test all archives');
 
 # Split path
commit	8b03ba54da243810850acdb13816d19b71bbcd4d	[log] [tgz]
author	Akron <nils@diewald-online.de>	Tue Jul 15 09:16:18 2025 +0200
committer	Akron <nils@diewald-online.de>	Wed Jul 16 17:54:16 2025 +0200
tree	0f85f47caf59a99b968555b4f0073fea801395f3
parent	cb12af7dc8936276ca7a178434a34fcab3106934 [diff]