blob: 8e517c2db502aa05ca9efe5237922aea69fd5fa5 [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
use File::Basename 'dirname';
use File::Spec::Functions qw/catdir catfile/;
use File::Temp qw/:POSIX tempdir/;
use Mojo::File;
use Mojo::Util qw/quote/;
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip;
use Test::More;
use Test::Output qw/:stdout :stderr :functions/;
use Data::Dumper;
use KorAP::XML::Archive;
use utf8;
use Archive::Tar;
# The script tests can be switched off through the SKIP_SCRIPT environment
# variable.
plan skip_all => 'Skip script tests' if $ENV{SKIP_SCRIPT};

# The script under test is located relative to the directory of this file.
my $f = dirname(__FILE__);
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');

# Command line of the "archive" subcommand without any further options.
my $call = "perl $script archive";

# Skip everything when KorAP::XML::Archive reports that unzip is missing.
plan skip_all => 'unzip not found' unless KorAP::XML::Archive::test_unzip();

# Called without parameters, the command is expected to print text on STDOUT
# that matches the pattern below (presumably the usage message).
stdout_like(
  sub { system($call) },
  qr!archive.+?\$ korapxml2krill!s,
  $call
);
# Single zip archive; checked here and used as input by the multi-job test
# later in this file.
my $input = catfile($f, '..', 'corpus', 'archive.zip');
ok(-f $input, 'Input archive found');

# File::Temp object. It stringifies to the name of a temporary *file* (hence
# the -f check), which serves as the base name for the tar files written
# by the calls below.
my $output = File::Temp->new;
ok(-f $output, 'Output file exists');

# The wildcard pattern is wrapped in single quotes so it reaches the script
# literally instead of being expanded by the shell.
my $input_quotes = "'" . catfile($f, '..', 'corpus', 'archives', 'wpd15*.zip') . "'";

# Name of the cache file handed to the script via --cache.
# Note: tmpnam() only yields a file name, it does not create the file.
my $cache = tmpnam();

$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => $input_quotes,
  '--output' => $output . '.tar',
  '--cache' => $cache,
  '-t' => 'Base#tokens_aggr',
  '--to-tar'
);

# Test with a wildcard input pattern and tar output.
# STDOUT and STDERR are captured together, as the messages are matched
# against the combined output.
my $combined = combined_from( sub { system($call) });

# All six wpd15-single archives have to be listed as input, in this order
like($combined, qr!Input is .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is, 'Input is fine');
like($combined, qr!Writing to file .+?\.tar!, 'Write out');
like($combined, qr!Wrote to tar archive!, 'Write out');
# Second scenario: convert the single zip archive with three parallel jobs,
# gzipped documents and the 'Sgbr' meta type, writing to one tar file.
$call = join ' ', (
  'perl', $script, 'archive',
  '--input',  $input,
  '--output', $output . '_multi.tar',
  '--cache',  $cache,
  '-t',       'Base#tokens_aggr',
  '-m',       'Sgbr',
  '--to-tar',
  '--gzip',
  '--jobs',   3,
);

# Run the command and inspect the combined STDOUT/STDERR output
$combined = combined_from(sub { system($call) });
like($combined, qr!Writing to file .+?\.tar!, 'Write out');
like($combined, qr!Merging 3 temporary tar files!, 'Merging correct number of temp files');
like($combined, qr!Wrote to tar archive!, 'Write out');

# The merged tar has to exist on disk ...
my $multi_job_tar = $output . '_multi.tar';
ok(-f $multi_job_tar, 'Multi-job tar file exists');

# ... and has to be readable by Archive::Tar
my $job_archive = Archive::Tar->new;
ok(
  $job_archive->read($multi_job_tar, 1, { ignore_zeros => 1 }),
  'Can read merged tar with ignore_zeros'
);

# The expected document is contained ...
ok($job_archive->contains_file('TEST-BSP-1.json.gz'), 'Expected file found in merged tar');

# ... has content ...
my $job_doc = $job_archive->get_content('TEST-BSP-1.json.gz');
ok(length($job_doc) > 0, 'File content is not empty');

# ... and is the only member of the archive
is(scalar($job_archive->list_files()), 1, 'One file in tar');
# Test with multiple jobs and the artificial test corpus as input.
# The input path is derived from the location of this file, like every other
# path in this test, so the test does not depend on the current working
# directory.
$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => catdir($f, '..', 'corpus', 'artificial'), # Use artificial test corpus
  '--output' => $output . '_multi_files.tar',
  '--cache' => $cache,
  '-t' => 'Base#sentences', # Use sentences.xml
  '-m' => 'Sgbr',
  '--to-tar',
  '--gzip',
  '--jobs' => 3
);

# Run multi-file test and inspect the combined STDOUT/STDERR output
$combined = combined_from( sub { system($call) });
like($combined, qr!Writing to file .+?\.tar!, 'Write out for multi-file');
like($combined, qr!Merging 3 temporary tar files!, 'Merging correct number of temp files');
like($combined, qr!Wrote to tar archive!, 'Write out');

# The merged tar has to exist on disk
my $multi_tar_file = $output . '_multi_files.tar';
ok(-f $multi_tar_file, 'Multi-file tar exists');

# Use Archive::Tar to read the merged tar
my $multi_merged_tar = Archive::Tar->new;
ok($multi_merged_tar->read($multi_tar_file, 1, {ignore_zeros => 1}), 'Can read multi-file tar with ignore_zeros');

# Check that the document is in the tar: all member names are joined into
# one string, so a single pattern match is sufficient.
my @files = $multi_merged_tar->list_files();
my $found_files = join("\n", @files);
like($found_files, qr/artificial\.json\.gz/, 'Contains artificial document');

# Clean up: the File::Temp object only removes the file it created itself,
# not the tar files derived from its name or the cache file.
unlink grep { -e $_ } (
  $output . '.tar',
  $output . '_multi.tar',
  $multi_tar_file,
  $cache
);

done_testing;
__END__