Added archive test script
Change-Id: Iaa6e9dd9c8186fe02432c0c512c23db8a9275d8b
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 65bc89a..939dcd4 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -92,37 +92,37 @@
'skip|s=s' => \@skip,
'sigle|sg=s' => \@sigle,
'cache|c=s' => \(my $cache_file = 'korapxml2krill.cache'),
- 'cache-size|cs=s' => \(my $cache_size = '50m'),
- 'cache-delete|cd!' => \(my $cache_delete = 1),
- 'cache-init|ci!' => \(my $cache_init = 1),
'log|l=s' => \(my $log_level = 'ERROR'),
'anno|a=s' => \@anno,
'primary|p!' => \(my $primary),
'pretty|y' => \(my $pretty),
'jobs|j=i' => \(my $jobs = 0),
+ 'cache-size|cs=s' => \(my $cache_size = '50m'),
+ 'cache-delete|cd!' => \(my $cache_delete = 1),
+ 'cache-init|ci!' => \(my $cache_init = 1),
'help|h' => sub {
pod2usage(
-sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
- -verbose => 99,
- -msg => $VERSION_MSG,
- -output => '-'
+ -verbose => 99,
+ -msg => $VERSION_MSG,
+ -output => '-'
);
},
'version|v' => sub {
pod2usage(
- -verbose => 0,
- -msg => $VERSION_MSG,
- -output => '-'
+ -verbose => 0,
+ -msg => $VERSION_MSG,
+ -output => '-'
)
}
);
my %ERROR_HASH = (
-sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
- -verbose => 99,
- -msg => $VERSION_MSG,
- -output => '-',
- -exit => 1
+ -verbose => 99,
+ -msg => $VERSION_MSG,
+ -output => '-',
+ -exit => 1
);
# Input has to be defined
@@ -281,6 +281,14 @@
# Convert sigle to path construct
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
+if ($cmd) {
+ if ($output && (!-e $output || !-d $output)) {
+ print "Directory '$output' does not exist.\n\n";
+ exit(0);
+ };
+};
+
+
# Process a single file
unless ($cmd) {
my $input = $input[0];
@@ -303,6 +311,7 @@
# Create and parse new document
$input =~ s{([^/])$}{$1/};
+ # Process file
$batch_file->process($input, $output);
# Delete cache file
@@ -314,14 +323,10 @@
# Extract XML files
elsif ($cmd eq 'extract') {
- if ($output && (!-e $output || !-d $output)) {
- print "Directory '$output' does not exist.\n\n";
- exit(0);
- };
-
- # TODO: Support sigles and full archives
+ # Create new archive object
if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
+ # Check that a compatible unzip is available
unless ($archive->test_unzip) {
print "Unzip is not installed or incompatible.\n\n";
exit(1);
@@ -349,6 +354,7 @@
# Iterate over all given sigles and extract
foreach (@sigle) {
print "$_ ";
+
# TODO: Make this OS independent
print '' . (
$archive->extract(
@@ -361,6 +367,8 @@
print "\n";
exit(1);
}
+
+ # Can't create archive object
else {
$log->error('Unable to extract from primary archive ' . $input[0]);
};
@@ -369,32 +377,20 @@
# Process an archive
elsif ($cmd eq 'archive') {
-warn '!!!!!!!!!!!!!------------> ';
-
-if ($output && (!-e $output || !-d $output)) {
- print "Directory '$output' does not exist.\n\n";
- exit(0);
-};
-
-
# TODO: Support sigles
- if ($output && (!-e $output || !-d $output)) {
- print "Directory '$output' does not exist.\n\n";
- exit(0);
- };
-
-# Zero means: everything runs in the parent process
+ # Zero means: everything runs in the parent process
my $pool = Parallel::ForkManager->new($jobs);
- my $count = 0; # Texts to process
+ my $count = 0; # Texts to process
my $iter = 1; # Current text in process
# Report on fork message
$pool->run_on_finish (
sub {
- my ($pid, $code) = shift;
+ my ($pid, $code) = @_;
my $data = pop;
+
print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
($iter++) . "/$count]" .
($code ? " $code" : '') .
@@ -403,16 +399,17 @@
);
my $t;
+ my $temp;
print "Reading data ...\n";
-# unless (Cache::FastMmap->new(
-# share_file => $cache_file,
-# cache_size => $cache_size,
-# init_file => $cache_init
-# )) {
-# print "Unable to intialize cache '$cache_file'\n\n";
-# exit(1);
-# };
+ # unless (Cache::FastMmap->new(
+ # share_file => $cache_file,
+ # cache_size => $cache_size,
+ # init_file => $cache_init
+ # )) {
+ # print "Unable to initialize cache '$cache_file'\n\n";
+ # exit(1);
+ # };
# Input is a directory
if (-d $input[0]) {
@@ -420,10 +417,11 @@
my @dirs;
my $dir;
+ # TODO: Rewrite as a do-while loop
while (1) {
if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
- push @dirs, $dir;
- $it->prune;
+ push @dirs, $dir;
+ $it->prune;
};
last unless $it->next;
};
@@ -436,15 +434,13 @@
for (my $i = 0; $i < $count; $i++) {
my $filename = catfile(
- $output,
- get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
+ $output,
+ get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
);
# Get the next fork
- my $pid = $pool->start and next DIRECTORY_LOOP;
- my $msg;
-
- $msg = $batch_file->process($dirs[$i] => $filename);
+ $pool->start and next DIRECTORY_LOOP;
+ my $msg = $batch_file->process($dirs[$i] => $filename);
$pool->finish(0, \$msg);
};
}
@@ -465,6 +461,9 @@
my @dirs = $archive->list_texts;
$count = scalar @dirs;
+ # Create temporary directory
+ $temp = File::Temp->newdir;
+
ARCHIVE_LOOP:
for (my $i = 0; $i < $count; $i++) {
@@ -472,41 +471,41 @@
my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
my $filename = catfile(
- $output,
- get_file_name(
- catfile($corpus, $doc, $text)
- . '.json' . ($gzip ? '.gz' : '')
- )
+ $output,
+ get_file_name(
+ catfile($corpus, $doc, $text)
+ . '.json' . ($gzip ? '.gz' : '')
+ )
);
# Get the next fork
- my $pid = $pool->start and next ARCHIVE_LOOP;
-
- # Create temporary file
- my $temp = File::Temp->newdir;
+ $pool->start and next ARCHIVE_LOOP;
my $msg;
# Extract from archive
if ($archive->extract($dirs[$i], $temp)) {
- # Create corpus directory
- my $input = catdir("$temp", $corpus);
+ # Create corpus directory
+ my $input = catdir("$temp", $corpus);
- # Temporary directory
- my $dir = catdir($input, $doc, $text);
+ # Temporary directory
+ my $dir = catdir($input, $doc, $text);
- # Write file
- $msg = $batch_file->process($dir => $output);
-
- $temp = undef;
- $pool->finish(0, \$msg);
+ # Write file
+ if ($batch_file->process($dir => $filename)) {
+ $pool->finish(0, \("Processed " . $filename));
+ }
+ else {
+ $pool->finish(1, \("Unable to process " . $dir));
+ };
}
+
+ # Unable to extract
else {
- $temp = undef;
- $msg = "Unable to extract " . $dirs[$i] . "\n";
- $pool->finish(1, \$msg);
+ $msg = "Unable to extract " . $dirs[$i] . "\n";
+ $pool->finish(1, \$msg);
};
};
}
@@ -517,6 +516,9 @@
$pool->wait_all_children;
+ # Delete temporary directory
+ $temp = undef;
+
# Delete cache file
unlink($cache_file) if $cache_delete;