Avoid lock in tar building

Change-Id: If726018a137f9d1d6ea63c5b4b319d661fcaf271
diff --git a/Changes b/Changes
index c454fba..58ed8a7 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.62 2025-07-15
+        - Remove lock from tar builder.
+
 0.61 2025-04-30
         - Support certainty in OpenNLP/Morpho.
 
diff --git a/Readme.pod b/Readme.pod
index b849760..e272014 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -58,7 +58,7 @@
 
   $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
 
-Convert archives sequentially. The inputs are not merged but treated
+Convert archives in serial. The inputs are not merged but treated
 as they are (so they may be premerged or globs).
 the C<--out> directory is treated as the base directory where subdirectories
 are created based on the archive name. In case the C<--to-tar> flag is given,
@@ -327,7 +327,7 @@
 Only valid for the C<archive> command.
 
 Writes the output into a tar archive.
-
+The tar needs to be opened with C<--ignore-zeros> afterwards.
 
 =item B<--sigle|-sg>
 
@@ -727,7 +727,7 @@
 
 =head1 COPYRIGHT AND LICENSE
 
-Copyright (C) 2015-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>
+Copyright (C) 2015-2025, L<IDS Mannheim|https://www.ids-mannheim.de/>
 
 Author: L<Nils Diewald|https://www.nils-diewald.de/>
 
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 3c9a47e..f43b9df 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -16,7 +16,7 @@
 
 our @EXPORT_OK = qw(get_file_name get_file_name_from_glob);
 
-our $VERSION = '0.61';
+our $VERSION = '0.62';
 
 has 'path';
 has [qw/text_sigle doc_sigle corpus_sigle/];
diff --git a/script/korapxml2krill b/script/korapxml2krill
index c522b32..487368f 100755
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -22,7 +22,7 @@
 use Path::Iterator::Rule;
 use Parallel::ForkManager;
 use File::Glob ':bsd_glob';
-use File::Temp qw/tempdir/;
+use File::Temp qw/tempdir tempfile/;
 use File::Path qw(remove_tree make_path);
 use File::Basename;
 use Mojo::Collection 'c';
@@ -178,7 +178,7 @@
 # - Improve core count logging.
 # ----------------------------------------------------------
 
-our $LAST_CHANGE = '2024/11/14';
+our $LAST_CHANGE = '2025/07/15';
 our $LOCAL = $FindBin::Bin;
 our $KORAL_VERSION = 0.03;
 our $VERSION_MSG = <<"VERSION";
@@ -716,7 +716,7 @@
         );
         print "extracted.\n";
       } else {
-        $archive->extract_sigle(1, [$_], $output, $jobs);
+        $archive->extract_sigle($q, [$_], $output, $jobs);
       }
     };
   }
@@ -786,36 +786,40 @@
   my $tar_archive;
   my $output_dir = $output;
   my $tar_fh;
+  my $final_tar_file;
+  my %tar_pool;
+  my $next_tar = 1;  # Counter for tar assignment
 
   # Initialize tar archive
   if ($to_tar) {
-
     # Set output name
-    my $tar_file = $output;
-    unless ($tar_file =~ /\.tar$/) {
-      $tar_file .= '.tar';
+    $final_tar_file = $output;
+    unless ($final_tar_file =~ /\.tar$/) {
+      $final_tar_file .= '.tar';
     };
 
-    # Initiate the tar file
-    print "Writing to file $tar_file\n" unless $q;
-    $tar_fh = IO::File->new($tar_file, 'w');
-    $tar_fh->binmode(1);
+    print "Writing to file $final_tar_file\n" unless $q;
 
-    # Use tar builder for archiving
-    if (eval("use Archive::Tar::Builder; 1;")) {
-      $tar_archive = Archive::Tar::Builder->new(
-        ignore_errors => 1
+    # Create tar pool with size equal to number of jobs
+    # If jobs is 0, create just one tar file
+    my $pool_size = $jobs > 0 ? $jobs : 1;
+    for my $i (1..$pool_size) {
+      my ($fh, $temp_tar) = tempfile(
+        "korapxml2krill_pool_${i}_XXXX",
+        SUFFIX => '.tar',
+        TMPDIR => 1
       );
 
-      # Set handle
-      $tar_archive->set_handle($tar_fh);
-    }
+      $tar_pool{$i} = {
+        fh => $fh,
+        file => $temp_tar,
+      };
 
-    # Fallback solution
-    else {
-      $tar_archive = KorAP::XML::TarBuilder->new(
-        $tar_fh
-      );
+      if (eval("use Archive::Tar::Builder; 1;")) {
+        ($tar_pool{$i}->{archive} = Archive::Tar::Builder->new(ignore_errors => 1))->set_handle($fh);
+      } else {
+        $tar_pool{$i}->{archive} = KorAP::XML::TarBuilder->new($fh);
+      }
     };
 
     # Output to temporary directory
@@ -838,22 +842,25 @@
 
       if (!$code && $to_tar && $data->[2]) {
         my $filename = $data->[2];
+        my $clean_file = fileparse($filename);
 
-        # Lock filehandle
-        if (flock($tar_fh, LOCK_EX)) {
+        # Get next available tar file in round-robin fashion
+        my $pool_size = $jobs > 0 ? $jobs : 1;
+        my $pool_idx = $next_tar;
+        $next_tar = ($next_tar % $pool_size) + 1;
 
-          my $clean_file = fileparse($filename);
+        my $tar = $tar_pool{$pool_idx};
 
-          # Archive and remove file
-          $tar_archive->archive_as($filename => $clean_file);
-          unlink $filename;
+        # Lock the tar file before writing
+        flock($tar->{fh}, LOCK_EX);
 
-          # Unlock filehandle
-          flock($tar_fh, LOCK_UN);
-        }
-        else {
-          $log->warn("Unable to add $filename to archive");
-        };
+        # Add file to pool tar
+        $tar->{archive}->archive_as($filename => $clean_file);
+
+        # Release lock
+        flock($tar->{fh}, LOCK_UN);
+
+        unlink $filename;
       };
 
       $data->[1] = undef if $data->[1];
@@ -1007,12 +1014,55 @@
 
   $pool->wait_all_children;
 
-  # Close tar filehandle
-  if ($to_tar && $tar_fh) {
-    $tar_archive->finish;
-    $tar_fh->close;
-    print "Wrote to tar archive.\n" unless $q;
-  };
+  # Merge all temporary tar files into final tar if needed
+  if ($to_tar && %tar_pool) {
+    $| = 1;
+    print "Merging " . scalar(keys %tar_pool) . " temporary tar files...\n" unless $q;
+
+    # Open final tar file
+    my $final_fh = IO::File->new($final_tar_file, 'w') or die "Cannot open $final_tar_file: $!";
+    $final_fh->binmode(1);
+
+    # Create final archive
+    my $final_archive;
+
+    if (eval("use Archive::Tar::Builder; 1;")) {
+      $final_archive = Archive::Tar::Builder->new(ignore_errors => 1);
+      $final_archive->set_handle($final_fh);
+    } else {
+      $final_archive = KorAP::XML::TarBuilder->new($final_fh);
+    }
+
+    # Finish and close all pool tar files
+    foreach my $pool_idx (sort keys %tar_pool) {
+      my $tar = $tar_pool{$pool_idx};
+      $tar->{archive}->finish;
+      $tar->{fh}->close;
+
+      # Append temp tar content to final tar using efficient buffered copy
+      open my $temp_fh, '<:raw', $tar->{file} or die "Cannot open temp tar $tar->{file}: $!";
+      my $buffer_size = 1024 * 1024; # 1MB buffer
+      my $buffer;
+      while (my $bytes_read = read($temp_fh, $buffer, $buffer_size)) {
+        my $bytes_written = 0;
+        while ($bytes_written < $bytes_read) {
+          my $written = syswrite($final_fh, $buffer, $bytes_read - $bytes_written, $bytes_written);
+          die "Write error: $!" unless defined $written;
+          $bytes_written += $written;
+        }
+      }
+      close $temp_fh;
+
+      # Clean up temp tar
+      unlink $tar->{file};
+    }
+
+    # Close final tar
+    $final_archive->finish;
+    $final_fh->close;
+    print "Wrote to tar archive $final_tar_file\n" unless $q;
+  }
+
   unless ($q) {
     print timestr(timediff(Benchmark->new, $t))."\n";
     print "Done.\n";
diff --git a/t/script/archive_tar.t b/t/script/archive_tar.t
index 1dfd195..8e517c2 100644
--- a/t/script/archive_tar.t
+++ b/t/script/archive_tar.t
@@ -13,6 +13,7 @@
 use Data::Dumper;
 use KorAP::XML::Archive;
 use utf8;
+use Archive::Tar;
 
 if ($ENV{SKIP_SCRIPT}) {
   plan skip_all => 'Skip script tests';
@@ -70,7 +71,79 @@
 like($combined, qr!Writing to file .+?\.tar!, 'Write out');
 like($combined, qr!Wrote to tar archive!, 'Write out');
 
+# Now test with multiple jobs
+$call = join(
+  ' ',
+  'perl', $script,
+  'archive',
+  '--input' => $input,  # Use the same input as the first test
+  '--output' => $output . '_multi.tar',
+  '--cache' => $cache,
+  '-t' => 'Base#tokens_aggr',
+  '-m' => 'Sgbr',  # Add meta type parameter
+  '--to-tar',
+  '--gzip',  # Add gzip parameter
+  '--jobs' => 3  # Use 3 jobs to test multiple tar files
+);
 
+# Test with multiple jobs
+$combined = combined_from( sub { system($call) });
+
+like($combined, qr!Writing to file .+?\.tar!, 'Write out');
+like($combined, qr!Merging 3 temporary tar files!, 'Merging correct number of temp files');
+like($combined, qr!Wrote to tar archive!, 'Write out');
+
+# Read the merged tar with --ignore-zeros
+my $tar_file = $output . '_multi.tar';
+ok(-f $tar_file, 'Multi-job tar file exists');
+
+# Use Archive::Tar to read the merged tar
+my $merged_tar = Archive::Tar->new;
+ok($merged_tar->read($tar_file, 1, {ignore_zeros => 1}), 'Can read merged tar with ignore_zeros');
+
+# Verify expected files are present
+ok($merged_tar->contains_file('TEST-BSP-1.json.gz'), 'Expected file found in merged tar');
+
+# Check the content is valid
+my $content = $merged_tar->get_content('TEST-BSP-1.json.gz');
+ok(length($content) > 0, 'File content is not empty');
+is(scalar($merged_tar->list_files()), 1, 'One file in tar');
+
+# Test with multiple jobs and multiple input files
+$call = join(
+  ' ',
+  'perl', $script,
+  'archive',
+  '--input' => 't/corpus/artificial',  # Use artificial test corpus
+  '--output' => $output . '_multi_files.tar',
+  '--cache' => $cache,
+  '-t' => 'Base#sentences',  # Use sentences.xml
+  '-m' => 'Sgbr',
+  '--to-tar',
+  '--gzip',
+  '--jobs' => 3
+);
+
+# Run multi-file test
+$combined = combined_from( sub { system($call) });
+
+like($combined, qr!Writing to file .+?\.tar!, 'Write out for multi-file');
+like($combined, qr!Merging 3 temporary tar files!, 'Merging correct number of temp files');
+like($combined, qr!Wrote to tar archive!, 'Write out');
+
+# Read the merged tar with --ignore-zeros
+my $multi_tar_file = $output . '_multi_files.tar';
+ok(-f $multi_tar_file, 'Multi-file tar exists');
+
+# Use Archive::Tar to read the merged tar
+my $multi_merged_tar = Archive::Tar->new;
+ok($multi_merged_tar->read($multi_tar_file, 1, {ignore_zeros => 1}), 'Can read multi-file tar with ignore_zeros');
+
+# Check that the file is in the tar
+my @files = $multi_merged_tar->list_files();
+my $found_files = join("\n", @files);
+
+like($found_files, qr/artificial\.json\.gz/, 'Contains artificial document');
 
 done_testing;
 __END__