blob: 8e517c2db502aa05ca9efe5237922aea69fd5fa5 [file] [log] [blame]
#!/usr/bin/env perl
2use strict;
3use warnings;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catdir catfile/;
Akron5fd2d8e2017-06-19 15:29:39 +02006use File::Temp qw/:POSIX tempdir/;
Akron486f9ab2017-04-22 23:25:19 +02007use Mojo::File;
8use Mojo::Util qw/quote/;
9use Mojo::JSON qw/decode_json/;
10use IO::Uncompress::Gunzip;
11use Test::More;
12use Test::Output qw/:stdout :stderr :functions/;
13use Data::Dumper;
14use KorAP::XML::Archive;
15use utf8;
Akroncb12af72025-07-15 14:36:10 +020016use Archive::Tar;
Akron486f9ab2017-04-22 23:25:19 +020017
# Skip the whole file when the SKIP_SCRIPT environment variable is set.
plan(skip_all => 'Skip script tests') if $ENV{SKIP_SCRIPT};
21
# Directory containing this test file; all fixture paths are built from it.
my $f = dirname(__FILE__);

# The script under test lives two directories up, in script/.
my $script = catfile($f, qw/.. .. script korapxml2krill/);

# Shell command line for the bare "archive" subcommand (no options).
my $call = join ' ', 'perl', $script, 'archive';

# Skip all tests when KorAP::XML::Archive reports that unzip is unavailable.
plan(skip_all => 'unzip not found')
  unless KorAP::XML::Archive::test_unzip();
34
# Run "archive" without any options. STDOUT has to mention "archive"
# followed, possibly lines later, by a "$ korapxml2krill" prompt
# (presumably the usage text of the subcommand).
my $usage_re = qr!archive.+?\$ korapxml2krill!s;
stdout_like(sub { system($call) }, $usage_re, $call);
43
# Single zip archive fixture.
my $input = catfile($f, '..', 'corpus', 'archive.zip');
ok(-f $input, 'Input archive found');

# File::Temp object; it stringifies to the path of a newly created
# temporary file. The path is used as the base name for the tar outputs.
my $output = File::Temp->new;

# -f checks for a plain file, so the description talks about a file
# (it used to claim a directory).
ok(-f $output, 'Output file exists');

# Glob pattern matching several wpd15 archives. It is wrapped in single
# quotes so the shell started by system() does not expand it itself.
my $input_quotes = "'" . catfile($f, '..', 'corpus', 'archives', 'wpd15*.zip') . "'";

# Path passed to --cache.
# NOTE(review): File::Temp documents tmpnam() as open to race conditions;
# it is kept here on the assumption that the script wants a path that
# does not exist yet - confirm before switching to a safer call.
my $cache = tmpnam();
54
# Convert the wpd15 archives and write the result into one tar file.
my $single_tar = $output . '.tar';

$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input'  => $input_quotes,
  '--output' => $single_tar,
  '--cache'  => $cache,
  '-t'       => 'Base#tokens_aggr',
  '--to-tar'
);

# Run the conversion and capture STDOUT and STDERR together
my $combined = combined_from( sub { system($call) });

# All six wpd15 archives have to be reported as input, in this order
like($combined, qr!Input is .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is, 'Input is fine');

like($combined, qr!Writing to file .+?\.tar!, 'Write out');
like($combined, qr!Wrote to tar archive!, 'Write out');

# The tar file is not inspected any further, so do not leave it behind
# in the temporary directory.
unlink $single_tar if -e $single_tar;
Akron486f9ab2017-04-22 23:25:19 +020073
# Now test with multiple jobs: the script is expected to write several
# temporary tar files and merge them into the requested output file.
my $tar_file = $output . '_multi.tar';

$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input'  => $input,             # The single archive.zip fixture
  '--output' => $tar_file,
  '--cache'  => $cache,
  '-t'       => 'Base#tokens_aggr',
  '-m'       => 'Sgbr',             # Meta type parameter
  '--to-tar',
  '--gzip',                         # Gzip parameter
  '--jobs'   => 3                   # Use 3 jobs to test multiple tar files
);

# Run with multiple jobs and capture STDOUT and STDERR together
$combined = combined_from( sub { system($call) });

like($combined, qr!Writing to file .+?\.tar!, 'Write out');
like($combined, qr!Merging 3 temporary tar files!, 'Merging correct number of temp files');
like($combined, qr!Wrote to tar archive!, 'Write out');

# The merged tar has to exist under the requested name
ok(-f $tar_file, 'Multi-job tar file exists');

# Use Archive::Tar to read the merged tar
my $merged_tar = Archive::Tar->new;
ok($merged_tar->read($tar_file, 1, {ignore_zeros => 1}), 'Can read merged tar with ignore_zeros');

# Verify expected files are present
ok($merged_tar->contains_file('TEST-BSP-1.json.gz'), 'Expected file found in merged tar');

# Check the content is valid. The defined() guard keeps the comparison
# free of "uninitialized" warnings if the member cannot be fetched.
my $content = $merged_tar->get_content('TEST-BSP-1.json.gz');
ok(defined($content) && length($content) > 0, 'File content is not empty');

# Count the members via an explicit list instead of relying on the
# scalar-context behaviour of list_files().
my @merged_files = $merged_tar->list_files;
is(scalar(@merged_files), 1, 'One file in tar');

# All checks on the archive are done; remove it again.
unlink $tar_file if -e $tar_file;
111
# Test with multiple jobs and multiple input files.
# The corpus directory is resolved relative to this test file, like every
# other fixture path, so the test does not depend on the current working
# directory.
my $multi_input    = catdir($f, '..', 'corpus', 'artificial');
my $multi_tar_file = $output . '_multi_files.tar';

$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input'  => $multi_input,       # Use artificial test corpus
  '--output' => $multi_tar_file,
  '--cache'  => $cache,
  '-t'       => 'Base#sentences',   # Use sentences.xml
  '-m'       => 'Sgbr',
  '--to-tar',
  '--gzip',
  '--jobs'   => 3
);

# Run multi-file test and capture STDOUT and STDERR together
$combined = combined_from( sub { system($call) });

like($combined, qr!Writing to file .+?\.tar!, 'Write out for multi-file');
like($combined, qr!Merging 3 temporary tar files!, 'Merging correct number of temp files');
like($combined, qr!Wrote to tar archive!, 'Write out');

# The merged tar has to exist under the requested name
ok(-f $multi_tar_file, 'Multi-file tar exists');

# Use Archive::Tar to read the merged tar
my $multi_merged_tar = Archive::Tar->new;
ok($multi_merged_tar->read($multi_tar_file, 1, {ignore_zeros => 1}), 'Can read multi-file tar with ignore_zeros');

# Check that the file is in the tar
my @files = $multi_merged_tar->list_files();
my $found_files = join("\n", @files);

like($found_files, qr/artificial\.json\.gz/, 'Contains artificial document');

# Remove the files this test created outside of File::Temp's control.
for my $leftover ($multi_tar_file, $cache) {
  unlink $leftover if -e $leftover;
}

done_testing;
__END__