#!/usr/bin/env perl
# Tests for the `archive` command of the korapxml2krill script when the
# result is written to a tar archive (--to-tar): once with a glob of input
# archives, once with several jobs on a single archive, and once with
# several jobs on a corpus directory.
use strict;
use warnings;
use File::Basename 'dirname';
use File::Spec::Functions qw/catdir catfile/;
use File::Temp qw/:POSIX tempdir/;
use Mojo::File;
use Mojo::Util qw/quote/;
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip;
use Test::More;
use Test::Output qw/:stdout :stderr :functions/;
use Data::Dumper;
use KorAP::XML::Archive;
use utf8;
use Archive::Tar;

if ($ENV{SKIP_SCRIPT}) {
  plan skip_all => 'Skip script tests';
};

# All fixture paths are resolved relative to this test file, so the test
# does not depend on the current working directory.
my $f = dirname(__FILE__);
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');

my $call = join(
  ' ',
  'perl', $script,
  'archive'
);

unless (KorAP::XML::Archive::test_unzip) {
  plan skip_all => 'unzip not found';
};

# Test without parameters
stdout_like(
  sub {
    system($call);
  },
  qr!archive.+?\$ korapxml2krill!s,
  $call
);

my $input = catfile($f, '..', 'corpus', 'archive.zip');
ok(-f $input, 'Input archive found');

# Temporary file whose name serves as the prefix for all generated tar files
my $output = File::Temp->new;

ok(-f $output, 'Output file exists');

# The glob is wrapped in single quotes so the shell passes it on unexpanded
my $input_quotes = "'".catfile($f, '..', 'corpus', 'archives', 'wpd15*.zip') . "'";

my $cache = tmpnam();

my $first_tar_file = $output . '.tar';

$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => $input_quotes,
  '--output' => $first_tar_file,
  '--cache' => $cache,
  '-t' => 'Base#tokens_aggr',
  '--to-tar'
);

# Test with a glob of input archives, written to a single tar file
my $combined = combined_from( sub { system($call) });

like($combined, qr!Input is .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is, 'Input is fine');

like($combined, qr!Writing to file .+?\.tar!, 'Write out');
like($combined, qr!Wrote to tar archive!, 'Write out');

# Now test with multiple jobs
my $tar_file = $output . '_multi.tar';

$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => $input,            # The single archive checked above
  '--output' => $tar_file,
  '--cache' => $cache,
  '-t' => 'Base#tokens_aggr',
  '-m' => 'Sgbr',                 # Add meta type parameter
  '--to-tar',
  '--gzip',                       # Add gzip parameter
  '--jobs' => 3                   # Use 3 jobs to test multiple tar files
);

# Test with multiple jobs
$combined = combined_from( sub { system($call) });

like($combined, qr!Writing to file .+?\.tar!, 'Write out');
like($combined, qr!Merging 3 temporary tar files!, 'Merging correct number of temp files');
like($combined, qr!Wrote to tar archive!, 'Write out');

# Read the merged tar with --ignore-zeros
ok(-f $tar_file, 'Multi-job tar file exists');

# Use Archive::Tar to read the merged tar
my $merged_tar = Archive::Tar->new;
ok($merged_tar->read($tar_file, 1, {ignore_zeros => 1}), 'Can read merged tar with ignore_zeros');

# Verify expected files are present
ok($merged_tar->contains_file('TEST-BSP-1.json.gz'), 'Expected file found in merged tar');

# Check the content is not empty (guarded, so a missing entry fails
# the test instead of raising an "uninitialized" warning)
my $content = $merged_tar->get_content('TEST-BSP-1.json.gz');
ok(defined $content && length($content) > 0, 'File content is not empty');
is(scalar($merged_tar->list_files()), 1, 'One file in tar');

# Test with multiple jobs and multiple input files
my $multi_tar_file = $output . '_multi_files.tar';

$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => catdir($f, '..', 'corpus', 'artificial'), # Use artificial test corpus
  '--output' => $multi_tar_file,
  '--cache' => $cache,
  '-t' => 'Base#sentences',       # Use sentences.xml
  '-m' => 'Sgbr',
  '--to-tar',
  '--gzip',
  '--jobs' => 3
);

# Run multi-file test
$combined = combined_from( sub { system($call) });

like($combined, qr!Writing to file .+?\.tar!, 'Write out for multi-file');
like($combined, qr!Merging 3 temporary tar files!, 'Merging correct number of temp files');
like($combined, qr!Wrote to tar archive!, 'Write out');

# Read the merged tar with --ignore-zeros
ok(-f $multi_tar_file, 'Multi-file tar exists');

# Use Archive::Tar to read the merged tar
my $multi_merged_tar = Archive::Tar->new;
ok($multi_merged_tar->read($multi_tar_file, 1, {ignore_zeros => 1}), 'Can read multi-file tar with ignore_zeros');

# Check that the file is in the tar
my @files = $multi_merged_tar->list_files();
my $found_files = join("\n", @files);

like($found_files, qr/artificial\.json\.gz/, 'Contains artificial document');

# Remove the generated archives and the cache file. File::Temp only
# unlinks $output itself, not the sibling files derived from its name.
unlink grep { -e $_ } $first_tar_file, $tar_file, $multi_tar_file, $cache;

done_testing;
__END__