Avoid lock in tar building
Change-Id: If726018a137f9d1d6ea63c5b4b319d661fcaf271
diff --git a/script/korapxml2krill b/script/korapxml2krill
index c522b32..487368f 100755
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -22,7 +22,7 @@
use Path::Iterator::Rule;
use Parallel::ForkManager;
use File::Glob ':bsd_glob';
-use File::Temp qw/tempdir/;
+use File::Temp qw/tempdir tempfile/;
use File::Path qw(remove_tree make_path);
use File::Basename;
use Mojo::Collection 'c';
@@ -178,7 +178,7 @@
# - Improve core count logging.
# ----------------------------------------------------------
-our $LAST_CHANGE = '2024/11/14';
+our $LAST_CHANGE = '2025/07/15';
our $LOCAL = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG = <<"VERSION";
@@ -716,7 +716,7 @@
);
print "extracted.\n";
} else {
- $archive->extract_sigle(1, [$_], $output, $jobs);
+ $archive->extract_sigle($q, [$_], $output, $jobs);
}
};
}
@@ -786,36 +786,40 @@
my $tar_archive;
my $output_dir = $output;
my $tar_fh;
+ my $final_tar_file;
+ my %tar_pool;
+ my $next_tar = 1; # Counter for tar assignment
# Initialize tar archive
if ($to_tar) {
-
# Set output name
- my $tar_file = $output;
- unless ($tar_file =~ /\.tar$/) {
- $tar_file .= '.tar';
+ $final_tar_file = $output;
+ unless ($final_tar_file =~ /\.tar$/) {
+ $final_tar_file .= '.tar';
};
- # Initiate the tar file
- print "Writing to file $tar_file\n" unless $q;
- $tar_fh = IO::File->new($tar_file, 'w');
- $tar_fh->binmode(1);
+ print "Writing to file $final_tar_file\n" unless $q;
- # Use tar builder for archiving
- if (eval("use Archive::Tar::Builder; 1;")) {
- $tar_archive = Archive::Tar::Builder->new(
- ignore_errors => 1
+ # Create tar pool with size equal to number of jobs
+ # If jobs is 0, create just one tar file
+ my $pool_size = $jobs > 0 ? $jobs : 1;
+ for my $i (1..$pool_size) {
+ my ($fh, $temp_tar) = tempfile(
+ "korapxml2krill_pool_${i}_XXXX",
+ SUFFIX => '.tar',
+ TMPDIR => 1
);
- # Set handle
- $tar_archive->set_handle($tar_fh);
- }
+ $tar_pool{$i} = {
+ fh => $fh,
+ file => $temp_tar,
+ };
- # Fallback solution
- else {
- $tar_archive = KorAP::XML::TarBuilder->new(
- $tar_fh
- );
+ if (eval("use Archive::Tar::Builder; 1;")) {
+ ($tar_pool{$i}->{archive} = Archive::Tar::Builder->new(ignore_errors => 1))->set_handle($fh);
+ } else {
+ $tar_pool{$i}->{archive} = KorAP::XML::TarBuilder->new($fh);
+ }
};
# Output to temporary directory
@@ -838,22 +842,25 @@
if (!$code && $to_tar && $data->[2]) {
my $filename = $data->[2];
+ my $clean_file = fileparse($filename);
- # Lock filehandle
- if (flock($tar_fh, LOCK_EX)) {
+ # Get next available tar file in round-robin fashion
+ my $pool_size = $jobs > 0 ? $jobs : 1;
+ my $pool_idx = $next_tar;
+ $next_tar = ($next_tar % $pool_size) + 1;
- my $clean_file = fileparse($filename);
+ my $tar = $tar_pool{$pool_idx};
- # Archive and remove file
- $tar_archive->archive_as($filename => $clean_file);
- unlink $filename;
+ # Lock the tar file before writing
+ flock($tar->{fh}, LOCK_EX);
- # Unlock filehandle
- flock($tar_fh, LOCK_UN);
- }
- else {
- $log->warn("Unable to add $filename to archive");
- };
+ # Add file to pool tar
+ $tar->{archive}->archive_as($filename => $clean_file);
+
+ # Release lock
+ flock($tar->{fh}, LOCK_UN);
+
+ unlink $filename;
};
$data->[1] = undef if $data->[1];
@@ -1007,12 +1014,55 @@
$pool->wait_all_children;
- # Close tar filehandle
- if ($to_tar && $tar_fh) {
- $tar_archive->finish;
- $tar_fh->close;
- print "Wrote to tar archive.\n" unless $q;
- };
+ # Merge all temporary tar files into final tar if needed
+ if ($to_tar && %tar_pool) {
+ $| = 1;
+ print "Merging " . scalar(keys %tar_pool) . " temporary tar files...\n" unless $q;
+
+ # Open final tar file
+ my $final_fh = IO::File->new($final_tar_file, 'w') or die "Cannot open $final_tar_file: $!";
+ $final_fh->binmode(1);
+
+ # Create final archive
+ my $final_archive;
+
+ if (eval("use Archive::Tar::Builder; 1;")) {
+ $final_archive = Archive::Tar::Builder->new(ignore_errors => 1);
+ $final_archive->set_handle($final_fh);
+ } else {
+ $final_archive = KorAP::XML::TarBuilder->new($final_fh);
+ }
+
+ # Finish and close all pool tar files
+ foreach my $pool_idx (sort keys %tar_pool) {
+ my $tar = $tar_pool{$pool_idx};
+ $tar->{archive}->finish;
+ $tar->{fh}->close;
+
+ # Append temp tar content to final tar using efficient buffered copy
+ open my $temp_fh, '<:raw', $tar->{file} or die "Cannot open temp tar $tar->{file}: $!";
+ my $buffer_size = 1024 * 1024; # 1MB buffer
+ my $buffer;
+ while (my $bytes_read = read($temp_fh, $buffer, $buffer_size)) {
+ my $bytes_written = 0;
+ while ($bytes_written < $bytes_read) {
+ my $written = syswrite($final_fh, $buffer, $bytes_read - $bytes_written, $bytes_written);
+ die "Write error: $!" unless defined $written;
+ $bytes_written += $written;
+ }
+ }
+ close $temp_fh;
+
+ # Clean up temp tar
+ unlink $tar->{file};
+ }
+
+ # Close final tar
+ $final_archive->finish;
+ $final_fh->close;
+ print "Wrote to tar archive $final_tar_file\n" unless $q;
+ }
+
unless ($q) {
print timestr(timediff(Benchmark->new, $t))."\n";
print "Done.\n";