Avoid lock in tar building
Change-Id: If726018a137f9d1d6ea63c5b4b319d661fcaf271
diff --git a/Changes b/Changes
index c454fba..58ed8a7 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.62 2025-07-15
+ - Remove lock from tar builder.
+
0.61 2025-04-30
- Support certainty in OpenNLP/Morpho.
diff --git a/Readme.pod b/Readme.pod
index b849760..e272014 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -58,7 +58,7 @@
$ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
-Convert archives sequentially. The inputs are not merged but treated
+Convert archives in serial. The inputs are not merged but treated
as they are (so they may be premerged or globs).
the C<--out> directory is treated as the base directory where subdirectories
are created based on the archive name. In case the C<--to-tar> flag is given,
@@ -327,7 +327,7 @@
Only valid for the C<archive> command.
Writes the output into a tar archive.
-
+The tar needs to be opened with C<--ignore-zeros> afterwards.
=item B<--sigle|-sg>
@@ -727,7 +727,7 @@
=head1 COPYRIGHT AND LICENSE
-Copyright (C) 2015-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>
+Copyright (C) 2015-2025, L<IDS Mannheim|https://www.ids-mannheim.de/>
Author: L<Nils Diewald|https://www.nils-diewald.de/>
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 3c9a47e..f43b9df 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -16,7 +16,7 @@
our @EXPORT_OK = qw(get_file_name get_file_name_from_glob);
-our $VERSION = '0.61';
+our $VERSION = '0.62';
has 'path';
has [qw/text_sigle doc_sigle corpus_sigle/];
diff --git a/script/korapxml2krill b/script/korapxml2krill
index c522b32..487368f 100755
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -22,7 +22,7 @@
use Path::Iterator::Rule;
use Parallel::ForkManager;
use File::Glob ':bsd_glob';
-use File::Temp qw/tempdir/;
+use File::Temp qw/tempdir tempfile/;
use File::Path qw(remove_tree make_path);
use File::Basename;
use Mojo::Collection 'c';
@@ -178,7 +178,7 @@
# - Improve core count logging.
# ----------------------------------------------------------
-our $LAST_CHANGE = '2024/11/14';
+our $LAST_CHANGE = '2025/07/15';
our $LOCAL = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG = <<"VERSION";
@@ -716,7 +716,7 @@
);
print "extracted.\n";
} else {
- $archive->extract_sigle(1, [$_], $output, $jobs);
+ $archive->extract_sigle($q, [$_], $output, $jobs);
}
};
}
@@ -786,36 +786,40 @@
my $tar_archive;
my $output_dir = $output;
my $tar_fh;
+ my $final_tar_file;
+ my %tar_pool;
+ my $next_tar = 1; # Counter for tar assignment
# Initialize tar archive
if ($to_tar) {
-
# Set output name
- my $tar_file = $output;
- unless ($tar_file =~ /\.tar$/) {
- $tar_file .= '.tar';
+ $final_tar_file = $output;
+ unless ($final_tar_file =~ /\.tar$/) {
+ $final_tar_file .= '.tar';
};
- # Initiate the tar file
- print "Writing to file $tar_file\n" unless $q;
- $tar_fh = IO::File->new($tar_file, 'w');
- $tar_fh->binmode(1);
+ print "Writing to file $final_tar_file\n" unless $q;
- # Use tar builder for archiving
- if (eval("use Archive::Tar::Builder; 1;")) {
- $tar_archive = Archive::Tar::Builder->new(
- ignore_errors => 1
+ # Create tar pool with size equal to number of jobs
+ # If jobs is 0, create just one tar file
+ my $pool_size = $jobs > 0 ? $jobs : 1;
+ for my $i (1..$pool_size) {
+ my ($fh, $temp_tar) = tempfile(
+ "korapxml2krill_pool_${i}_XXXX",
+ SUFFIX => '.tar',
+ TMPDIR => 1
);
- # Set handle
- $tar_archive->set_handle($tar_fh);
- }
+ $tar_pool{$i} = {
+ fh => $fh,
+ file => $temp_tar,
+ };
- # Fallback solution
- else {
- $tar_archive = KorAP::XML::TarBuilder->new(
- $tar_fh
- );
+ if (eval("use Archive::Tar::Builder; 1;")) {
+ ($tar_pool{$i}->{archive} = Archive::Tar::Builder->new(ignore_errors => 1))->set_handle($fh);
+ } else {
+ $tar_pool{$i}->{archive} = KorAP::XML::TarBuilder->new($fh);
+ }
};
# Output to temporary directory
@@ -838,22 +842,25 @@
if (!$code && $to_tar && $data->[2]) {
my $filename = $data->[2];
+ my $clean_file = fileparse($filename);
- # Lock filehandle
- if (flock($tar_fh, LOCK_EX)) {
+ # Get next available tar file in round-robin fashion
+ my $pool_size = $jobs > 0 ? $jobs : 1;
+ my $pool_idx = $next_tar;
+ $next_tar = ($next_tar % $pool_size) + 1;
- my $clean_file = fileparse($filename);
+ my $tar = $tar_pool{$pool_idx};
- # Archive and remove file
- $tar_archive->archive_as($filename => $clean_file);
- unlink $filename;
+ # Lock the tar file before writing
+ flock($tar->{fh}, LOCK_EX);
- # Unlock filehandle
- flock($tar_fh, LOCK_UN);
- }
- else {
- $log->warn("Unable to add $filename to archive");
- };
+ # Add file to pool tar
+ $tar->{archive}->archive_as($filename => $clean_file);
+
+ # Release lock
+ flock($tar->{fh}, LOCK_UN);
+
+ unlink $filename;
};
$data->[1] = undef if $data->[1];
@@ -1007,12 +1014,55 @@
$pool->wait_all_children;
- # Close tar filehandle
- if ($to_tar && $tar_fh) {
- $tar_archive->finish;
- $tar_fh->close;
- print "Wrote to tar archive.\n" unless $q;
- };
+ # Merge all temporary tar files into final tar if needed
+ if ($to_tar && %tar_pool) {
+ $| = 1;
+ print "Merging " . scalar(keys %tar_pool) . " temporary tar files...\n" unless $q;
+
+ # Open final tar file
+ my $final_fh = IO::File->new($final_tar_file, 'w') or die "Cannot open $final_tar_file: $!";
+ $final_fh->binmode(1);
+
+ # Create final archive
+ my $final_archive;
+
+ if (eval("use Archive::Tar::Builder; 1;")) {
+ $final_archive = Archive::Tar::Builder->new(ignore_errors => 1);
+ $final_archive->set_handle($final_fh);
+ } else {
+ $final_archive = KorAP::XML::TarBuilder->new($final_fh);
+ }
+
+ # Finish and close all pool tar files
+ foreach my $pool_idx (sort keys %tar_pool) {
+ my $tar = $tar_pool{$pool_idx};
+ $tar->{archive}->finish;
+ $tar->{fh}->close;
+
+ # Append temp tar content to final tar using efficient buffered copy
+ open my $temp_fh, '<:raw', $tar->{file} or die "Cannot open temp tar $tar->{file}: $!";
+ my $buffer_size = 1024 * 1024; # 1MB buffer
+ my $buffer;
+ while (my $bytes_read = read($temp_fh, $buffer, $buffer_size)) {
+ my $bytes_written = 0;
+ while ($bytes_written < $bytes_read) {
+ my $written = syswrite($final_fh, $buffer, $bytes_read - $bytes_written, $bytes_written);
+ die "Write error: $!" unless defined $written;
+ $bytes_written += $written;
+ }
+ }
+ close $temp_fh;
+
+ # Clean up temp tar
+ unlink $tar->{file};
+ }
+
+ # Close final tar
+ $final_archive->finish;
+ $final_fh->close;
+ print "Wrote to tar archive $final_tar_file\n" unless $q;
+ }
+
unless ($q) {
print timestr(timediff(Benchmark->new, $t))."\n";
print "Done.\n";
diff --git a/t/script/archive_tar.t b/t/script/archive_tar.t
index 1dfd195..8e517c2 100644
--- a/t/script/archive_tar.t
+++ b/t/script/archive_tar.t
@@ -13,6 +13,7 @@
use Data::Dumper;
use KorAP::XML::Archive;
use utf8;
+use Archive::Tar;
if ($ENV{SKIP_SCRIPT}) {
plan skip_all => 'Skip script tests';
@@ -70,7 +71,79 @@
like($combined, qr!Writing to file .+?\.tar!, 'Write out');
like($combined, qr!Wrote to tar archive!, 'Write out');
+# Now test with multiple jobs
+$call = join(
+ ' ',
+ 'perl', $script,
+ 'archive',
+ '--input' => $input, # Use the same input as the first test
+ '--output' => $output . '_multi.tar',
+ '--cache' => $cache,
+ '-t' => 'Base#tokens_aggr',
+ '-m' => 'Sgbr', # Add meta type parameter
+ '--to-tar',
+ '--gzip', # Add gzip parameter
+ '--jobs' => 3 # Use 3 jobs to test multiple tar files
+);
+# Test with multiple jobs
+$combined = combined_from( sub { system($call) });
+
+like($combined, qr!Writing to file .+?\.tar!, 'Write out');
+like($combined, qr!Merging 3 temporary tar files!, 'Merging correct number of temp files');
+like($combined, qr!Wrote to tar archive!, 'Write out');
+
+# Read the merged tar with --ignore-zeros
+my $tar_file = $output . '_multi.tar';
+ok(-f $tar_file, 'Multi-job tar file exists');
+
+# Use Archive::Tar to read the merged tar
+my $merged_tar = Archive::Tar->new;
+ok($merged_tar->read($tar_file, 1, {ignore_zeros => 1}), 'Can read merged tar with ignore_zeros');
+
+# Verify expected files are present
+ok($merged_tar->contains_file('TEST-BSP-1.json.gz'), 'Expected file found in merged tar');
+
+# Check the content is valid
+my $content = $merged_tar->get_content('TEST-BSP-1.json.gz');
+ok(length($content) > 0, 'File content is not empty');
+is(scalar($merged_tar->list_files()), 1, 'One file in tar');
+
+# Test with multiple jobs and multiple input files
+$call = join(
+ ' ',
+ 'perl', $script,
+ 'archive',
+ '--input' => 't/corpus/artificial', # Use artificial test corpus
+ '--output' => $output . '_multi_files.tar',
+ '--cache' => $cache,
+ '-t' => 'Base#sentences', # Use sentences.xml
+ '-m' => 'Sgbr',
+ '--to-tar',
+ '--gzip',
+ '--jobs' => 3
+);
+
+# Run multi-file test
+$combined = combined_from( sub { system($call) });
+
+like($combined, qr!Writing to file .+?\.tar!, 'Write out for multi-file');
+like($combined, qr!Merging 3 temporary tar files!, 'Merging correct number of temp files');
+like($combined, qr!Wrote to tar archive!, 'Write out');
+
+# Read the merged tar with --ignore-zeros
+my $multi_tar_file = $output . '_multi_files.tar';
+ok(-f $multi_tar_file, 'Multi-file tar exists');
+
+# Use Archive::Tar to read the merged tar
+my $multi_merged_tar = Archive::Tar->new;
+ok($multi_merged_tar->read($multi_tar_file, 1, {ignore_zeros => 1}), 'Can read multi-file tar with ignore_zeros');
+
+# Check that the file is in the tar
+my @files = $multi_merged_tar->list_files();
+my $found_files = join("\n", @files);
+
+like($found_files, qr/artificial\.json\.gz/, 'Contains artificial document');
done_testing;
__END__