Improved tar support
Change-Id: I318b6f18e571c81a34752911bc9d009d726c7d14
diff --git a/MANIFEST b/MANIFEST
index 812c953..65f5f5a 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -102,6 +102,7 @@
t/script/usage.t
t/script/extract.t
t/script/archive.t
+t/script/archive_tar.t
t/script/config.t
t/script/base.t
t/corpus/archive.zip
diff --git a/Makefile.PL b/Makefile.PL
index ab696f0..7eec5c1 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -40,7 +40,8 @@
'Sys::Info' => 0.78,
'Config::Simple' => 4.58,
'String::Random' => 0.29,
- 'File::Path' => 2.12
+ 'File::Path' => 2.12,
+ 'Archive::Tar::Builder' => 2.5002
},
MIN_PERL_VERSION => '5.014',
test => {
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 226c35a..a6aa95f 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -157,7 +157,7 @@
'primary|p!' => \(my $primary),
'pretty|y' => \(my $pretty),
'jobs|j=i' => \(my $jobs),
- 'to-tar=s' => \(my $to_tar),
+ 'to-tar' => \(my $to_tar),
'sequential-extraction|se' => \(my $sequential_extraction),
'cache-size|cs=s' => \(my $cache_size),
'cache-delete|cd!' => \(my $cache_delete),
@@ -350,7 +350,7 @@
# Start serial processing
if ($cmd eq 'serial') {
- if ($output && (!-e $output || !-d $output)) {
+ if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
print "Directory '$output' does not exist.\n\n";
exit(0);
};
@@ -383,7 +383,7 @@
# This will create a directory
my $new_out = catdir($output, get_file_name_from_glob($_));
- # Create new path
+ # Create new path, in case the output is not meant to be tarred
unless ($to_tar) {
if (make_path($new_out) == 0 && !-d $new_out) {
$log->error("Can\'t create path $new_out");
@@ -558,7 +558,7 @@
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
if ($cmd) {
- if ($output && (!-e $output || !-d $output)) {
+ if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
print "Directory '$output' does not exist.\n\n";
exit(0);
};
@@ -807,6 +807,34 @@
# exit(1);
# };
+ my $tar_archive;
+ my $output_dir = $output;
+
+ # Initialize tar archive
+ if ($to_tar) {
+ $tar_archive = Archive::Tar::Builder->new(
+ ignore_errors => 1
+ );
+
+ # Set output name
+ my $tar_file = $output;
+ unless ($tar_file =~ /\.tar$/) {
+ $tar_file .= '.tar';
+ };
+
+ # Initiate the tar file
+ print "Writing to file $tar_file\n";
+ my $fh = IO::File->new($tar_file, 'w');
+ $fh->binmode(1);
+
+ # Set handle
+ $tar_archive->set_handle($fh);
+
+ # Output to temporary directory
+ $output_dir = File::Temp->newdir;
+ };
+
+
# Input is a directory
if (-d $input[0]) {
my $it = Directory::Iterator->new($input[0]);
@@ -826,26 +854,6 @@
$t = Benchmark->new;
$count = scalar @dirs;
- my $tar_archive;
- my $output_dir = $output;
- if ($to_tar) {
- $tar_archive = Archive::Tar::Builder->new(
- ignore_errors => 1
- );
-
- # Set output name
- my $tar_file = $output;
- unless ($tar_file =~ /\.tar$/) {
- $tar_file .= '.tar';
- };
- my $fh = IO::File->new($tar_file, 'w');
- $fh->binmode(1);
-
- # Set handle
- $tar_archive->set_handle($fh);
- $output_dir = File::Temp->newdir;
- };
-
DIRECTORY_LOOP:
for (my $i = 0; $i < $count; $i++) {
@@ -858,16 +866,17 @@
$pool->start and next DIRECTORY_LOOP;
if (my $return = $batch_file->process($dirs[$i] => $filename)) {
- $pool->finish(
- 0,
- ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
- );
# Add to tar archive
if ($to_tar) {
$tar_archive->archive($filename);
unlink $filename;
};
+
+ $pool->finish(
+ 0,
+ ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
+ );
}
else {
$pool->finish(1, ["Unable to process " . $dirs[$i]]);
@@ -898,7 +907,7 @@
my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
my $filename = catfile(
- $output,
+ $output_dir,
get_file_name(
catfile($corpus, $doc, $text)
. '.json' . ($gzip ? '.gz' : '')
@@ -925,6 +934,13 @@
# Write file
if (my $return = $batch_file->process($dir => $filename)) {
+
+ # Add to tar archive
+ if ($to_tar) {
+ $tar_archive->archive($filename);
+ unlink $filename;
+ };
+
# Delete temporary file
$pool->finish(
0,
diff --git a/t/script/archive_tar.t b/t/script/archive_tar.t
new file mode 100644
index 0000000..aee6542
--- /dev/null
+++ b/t/script/archive_tar.t
@@ -0,0 +1,68 @@
+#/usr/bin/env perl
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/tempdir/;
+use Mojo::File;
+use Mojo::Util qw/quote/;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+use Test::Output qw/:stdout :stderr :functions/;
+use Data::Dumper;
+use KorAP::XML::Archive;
+use utf8;
+
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
+
+my $call = join(
+ ' ',
+ 'perl', $script,
+ 'archive'
+);
+
+unless (KorAP::XML::Archive::test_unzip) {
+ plan skip_all => 'unzip not found';
+};
+
+# Test without parameters
+stdout_like(
+ sub {
+ system($call);
+ },
+ qr!archive.+?\$ korapxml2krill!s,
+ $call
+);
+
+my $input = catfile($f, '..', 'corpus', 'archive.zip');
+ok(-f $input, 'Input archive found');
+
+my $output = File::Temp->new;
+
+ok(-f $output, 'Output directory exists');
+
+my $input_quotes = "'".catfile($f, '..', 'corpus', 'archives', 'wpd15*.zip') . "'";
+
+$call = join(
+ ' ',
+ 'perl', $script,
+ 'archive',
+ '--input' => $input_quotes,
+ '--output' => $output . '.tar',
+ '-t' => 'Base#tokens_aggr',
+ '--to-tar'
+);
+
+# Test without parameters
+my $combined = combined_from( sub { system($call) });
+
+diag $combined;
+
+#qr!Input is .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is,
+
+
+
+done_testing;
+__END__