Added archive test script
Change-Id: Iaa6e9dd9c8186fe02432c0c512c23db8a9275d8b
diff --git a/Changes b/Changes
index 6ea6c3d..8d1d990 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.18 2016-08-16
+0.18 2016-08-17
- Added REI test.
- Added multiple archive support to korapxml2krill.
- Added support for prefix negation in korapxml2krill.
@@ -12,9 +12,10 @@
- Fixed setting multiple annotations in
script.
- Fixed output of version and help messages.
- - Added extraction test.
+ - Added script test for extraction.
- Fixed extraction with multiple archives and prefix
negation support.
+ - Added script test for archives.
0.17 2016-03-22
- Rewrite siglen to use slashes as separators.
diff --git a/MANIFEST b/MANIFEST
index 856b8f2..098ee93 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -100,6 +100,7 @@
t/script/single.t
t/script/usage.t
t/script/extract.t
+t/script/archive.t
t/corpus/archive.zip
t/corpus/BZK/header.xml
t/corpus/GOE/header.xml
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index ea376f4..fe57d78 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -5,7 +5,7 @@
use Scalar::Util qw/weaken/;
use XML::Fast;
use Try::Tiny;
-use Carp qw/croak/;
+use Carp qw/croak carp/;
use KorAP::XML::Document::Primary;
use KorAP::XML::Tokenizer;
use Log::Log4perl;
@@ -73,11 +73,11 @@
# Load file
$file = b($data_xml)->slurp;
-
try {
local $SIG{__WARN__} = sub {
- $error = 1;
+ $error = 1;
};
+
$rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
} catch {
$self->log->warn($unable);
@@ -97,17 +97,22 @@
$self->corpus_sigle($1);
}
else {
- croak $unable . ': ID not parseable';
+ $self->log->warn($unable . ': ID not parseable');
+ return;
};
}
else {
- croak $unable . ': No raw_text found or no ID';
+ $self->log->warn($unable . ': No raw_text found or no ID');
+ return;
};
# Get primary data
my $pd = $rt->{text};
- croak $unable unless $pd;
+ unless ($pd) {
+ $self->log->warn($unable . ': No primary data found');
+ return;
+ };
# Associate primary data
$self->{pd} = KorAP::XML::Document::Primary->new($pd);
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 65bc89a..939dcd4 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -92,37 +92,37 @@
'skip|s=s' => \@skip,
'sigle|sg=s' => \@sigle,
'cache|c=s' => \(my $cache_file = 'korapxml2krill.cache'),
- 'cache-size|cs=s' => \(my $cache_size = '50m'),
- 'cache-delete|cd!' => \(my $cache_delete = 1),
- 'cache-init|ci!' => \(my $cache_init = 1),
'log|l=s' => \(my $log_level = 'ERROR'),
'anno|a=s' => \@anno,
'primary|p!' => \(my $primary),
'pretty|y' => \(my $pretty),
'jobs|j=i' => \(my $jobs = 0),
+ 'cache-size|cs=s' => \(my $cache_size = '50m'),
+ 'cache-delete|cd!' => \(my $cache_delete = 1),
+ 'cache-init|ci!' => \(my $cache_init = 1),
'help|h' => sub {
pod2usage(
-sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
- -verbose => 99,
- -msg => $VERSION_MSG,
- -output => '-'
+ -verbose => 99,
+ -msg => $VERSION_MSG,
+ -output => '-'
);
},
'version|v' => sub {
pod2usage(
- -verbose => 0,
- -msg => $VERSION_MSG,
- -output => '-'
+ -verbose => 0,
+ -msg => $VERSION_MSG,
+ -output => '-'
)
}
);
my %ERROR_HASH = (
-sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
- -verbose => 99,
- -msg => $VERSION_MSG,
- -output => '-',
- -exit => 1
+ -verbose => 99,
+ -msg => $VERSION_MSG,
+ -output => '-',
+ -exit => 1
);
# Input has to be defined
@@ -281,6 +281,14 @@
# Convert sigle to path construct
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
+if ($cmd) {
+ if ($output && (!-e $output || !-d $output)) {
+ print "Directory '$output' does not exist.\n\n";
+ exit(0);
+ };
+};
+
+
# Process a single file
unless ($cmd) {
my $input = $input[0];
@@ -303,6 +311,7 @@
# Create and parse new document
$input =~ s{([^/])$}{$1/};
+ # Process file
$batch_file->process($input, $output);
# Delete cache file
@@ -314,14 +323,10 @@
# Extract XML files
elsif ($cmd eq 'extract') {
- if ($output && (!-e $output || !-d $output)) {
- print "Directory '$output' does not exist.\n\n";
- exit(0);
- };
-
- # TODO: Support sigles and full archives
+ # Create new archive object
if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
+ # Check zip capabilities
unless ($archive->test_unzip) {
print "Unzip is not installed or incompatible.\n\n";
exit(1);
@@ -349,6 +354,7 @@
# Iterate over all given sigles and extract
foreach (@sigle) {
print "$_ ";
+
# TODO: Make this OS independent
print '' . (
$archive->extract(
@@ -361,6 +367,8 @@
print "\n";
exit(1);
}
+
+ # Can't create archive object
else {
$log->error('Unable to extract from primary archive ' . $input[0]);
};
@@ -369,32 +377,20 @@
# Process an archive
elsif ($cmd eq 'archive') {
-warn '!!!!!!!!!!!!!------------> ';
-
-if ($output && (!-e $output || !-d $output)) {
- print "Directory '$output' does not exist.\n\n";
- exit(0);
-};
-
-
# TODO: Support sigles
- if ($output && (!-e $output || !-d $output)) {
- print "Directory '$output' does not exist.\n\n";
- exit(0);
- };
-
-# Zero means: everything runs in the parent process
+ # Zero means: everything runs in the parent process
my $pool = Parallel::ForkManager->new($jobs);
- my $count = 0; # Texts to process
+ my $count = 0; # Texts to process
my $iter = 1; # Current text in process
# Report on fork message
$pool->run_on_finish (
sub {
- my ($pid, $code) = shift;
+ my ($pid, $code) = @_;
my $data = pop;
+
print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
($iter++) . "/$count]" .
($code ? " $code" : '') .
@@ -403,16 +399,17 @@
);
my $t;
+ my $temp;
print "Reading data ...\n";
-# unless (Cache::FastMmap->new(
-# share_file => $cache_file,
-# cache_size => $cache_size,
-# init_file => $cache_init
-# )) {
-# print "Unable to intialize cache '$cache_file'\n\n";
-# exit(1);
-# };
+ # unless (Cache::FastMmap->new(
+ # share_file => $cache_file,
+ # cache_size => $cache_size,
+ # init_file => $cache_init
+ # )) {
+ # print "Unable to intialize cache '$cache_file'\n\n";
+ # exit(1);
+ # };
# Input is a directory
if (-d $input[0]) {
@@ -420,10 +417,11 @@
my @dirs;
my $dir;
+ # Todo: Make a DO WHILE
while (1) {
if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
- push @dirs, $dir;
- $it->prune;
+ push @dirs, $dir;
+ $it->prune;
};
last unless $it->next;
};
@@ -436,15 +434,13 @@
for (my $i = 0; $i < $count; $i++) {
my $filename = catfile(
- $output,
- get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
+ $output,
+ get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
);
# Get the next fork
- my $pid = $pool->start and next DIRECTORY_LOOP;
- my $msg;
-
- $msg = $batch_file->process($dirs[$i] => $filename);
+ $pool->start and next DIRECTORY_LOOP;
+ my $msg = $batch_file->process($dirs[$i] => $filename);
$pool->finish(0, \$msg);
};
}
@@ -465,6 +461,9 @@
my @dirs = $archive->list_texts;
$count = scalar @dirs;
+ # Create temporary file
+ $temp = File::Temp->newdir;
+
ARCHIVE_LOOP:
for (my $i = 0; $i < $count; $i++) {
@@ -472,41 +471,41 @@
my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
my $filename = catfile(
- $output,
- get_file_name(
- catfile($corpus, $doc, $text)
- . '.json' . ($gzip ? '.gz' : '')
- )
+ $output,
+ get_file_name(
+ catfile($corpus, $doc, $text)
+ . '.json' . ($gzip ? '.gz' : '')
+ )
);
# Get the next fork
- my $pid = $pool->start and next ARCHIVE_LOOP;
-
- # Create temporary file
- my $temp = File::Temp->newdir;
+ $pool->start and next ARCHIVE_LOOP;
my $msg;
# Extract from archive
if ($archive->extract($dirs[$i], $temp)) {
- # Create corpus directory
- my $input = catdir("$temp", $corpus);
+ # Create corpus directory
+ my $input = catdir("$temp", $corpus);
- # Temporary directory
- my $dir = catdir($input, $doc, $text);
+ # Temporary directory
+ my $dir = catdir($input, $doc, $text);
- # Write file
- $msg = $batch_file->process($dir => $output);
-
- $temp = undef;
- $pool->finish(0, \$msg);
+ # Write file
+ if ($batch_file->process($dir => $filename)) {
+ $pool->finish(0, \("Processed " . $filename));
+ }
+ else {
+ $pool->finish(1, \("Unable to process " . $dir));
+ };
}
+
+ # Unable to extract
else {
- $temp = undef;
- $msg = "Unable to extract " . $dirs[$i] . "\n";
- $pool->finish(1, \$msg);
+ $msg = "Unable to extract " . $dirs[$i] . "\n";
+ $pool->finish(1, \$msg);
};
};
}
@@ -517,6 +516,9 @@
$pool->wait_all_children;
+ # Delete temporary file
+ $temp = undef;
+
# Delete cache file
unlink($cache_file) if $cache_delete;
diff --git a/t/script/archive.t b/t/script/archive.t
new file mode 100644
index 0000000..2032180
--- /dev/null
+++ b/t/script/archive.t
@@ -0,0 +1,70 @@
+#/usr/bin/env perl
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/tempdir/;
+use Mojo::Util qw/slurp/;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+use Test::Output qw/:stdout :stderr :functions/;
+use Data::Dumper;
+use utf8;
+
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
+
+my $call = join(
+ ' ',
+ 'perl', $script,
+ 'archive'
+);
+
+# Test without parameters
+stdout_like(
+ sub {
+ system($call);
+ },
+ qr!archive.+?Process an!s,
+ $call
+);
+
+my $input = catfile($f, '..', 'corpus', 'archive.zip');
+ok(-f $input, 'Input archive found');
+
+my $output = tempdir(CLEANUP => 1);
+ok(-d $output, 'Output directory exists');
+
+$call = join(
+ ' ',
+ 'perl', $script,
+ 'archive',
+ '--input' => $input,
+ '--output' => $output,
+ '-t' => 'Base#tokens_aggr',
+ '-m' => 'Sgbr'
+);
+
+# Test without compression
+my $json;
+{
+ local $SIG{__WARN__} = sub {};
+ my $out = stdout_from(sub { system($call); });
+
+ like($out, qr!TEST-BSP-1\.json!s, $call);
+
+ $out =~ m!Processed (.+?\.json)!;
+ $json = $1;
+};
+
+ok(-f $json, 'Json file exists');
+ok((my $file = slurp $json), 'Slurp data');
+ok(($json = decode_json $file), 'decode json');
+
+is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'Title');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences dereko dereko/structure sgbr sgbr/lemma sgbr/morpho', 'Foundries');
+is($json->{sgbrKodex}, 'M', 'Kodex meta data');
+
+done_testing;
+__END__