Added test for script calls
Change-Id: I3f2f02257ca54c83b470cfe4b531069a44ed9739
diff --git a/MANIFEST b/MANIFEST
index 9911feb..d1d8c24 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -97,6 +97,7 @@
t/sgbr/meta_ids.t
t/sgbr/pos.t
t/sgbr/token.t
+t/script/single.t
t/corpus/archive.zip
t/corpus/BZK/header.xml
t/corpus/GOE/header.xml
diff --git a/lib/KorAP/XML/Batch/File.pm b/lib/KorAP/XML/Batch/File.pm
index d91074f..e84cb2e 100644
--- a/lib/KorAP/XML/Batch/File.pm
+++ b/lib/KorAP/XML/Batch/File.pm
@@ -6,21 +6,22 @@
use strict;
use warnings;
+# Constructor
sub new {
my $class = shift;
my %param = @_;
bless {
cache => $param{cache} // undef,
- meta_type => $param{meta_type} // 'I5',
+ meta_type => $param{meta_type} || 'I5',
overwrite => $param{overwrite},
- foundry => $param{foundry} // 'Base',
- layer => $param{layer} // 'Tokens',
- anno => $param{anno} // [[]],
- log => $param{log} // Mojo::Log->new(level => 'fatal'),
+ foundry => $param{foundry} || 'Base',
+ layer => $param{layer} || 'Tokens',
+ anno => $param{anno} || [[]],
+ log => $param{log} || Mojo::Log->new(level => 'fatal'),
primary => $param{primary},
pretty => $param{pretty},
- gzip => $param{gzip} // 0
+ gzip => $param{gzip} // 0
}, $class;
};
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 8a56858..4b06ca8 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -14,6 +14,7 @@
use KorAP::XML::Krill;
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
+use KorAP::XML::Batch::File;
use Parallel::ForkManager;
# TODO: use Parallel::Loops
# TODO: make output files
@@ -70,7 +71,6 @@
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION
-
# Parse comand
my $cmd;
our @ARGV;
@@ -125,6 +125,8 @@
# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;
+# Gzip has no effect, if no output is given
+pod2usage(%ERROR_HASH) if $gzip && !$output;
# Initialize log4perl object
Log::Log4perl->init({
@@ -136,6 +138,101 @@
my $log = Log::Log4perl->get_logger('main');
+my %skip;
+$skip{lc($_)} = 1 foreach @skip;
+
+my @layers;
+push(@layers, ['Base', 'Sentences']);
+push(@layers, ['Base', 'Paragraphs']);
+
+# Connexor
+push(@layers, ['Connexor', 'Morpho']);
+push(@layers, ['Connexor', 'Syntax']);
+push(@layers, ['Connexor', 'Phrase']);
+push(@layers, ['Connexor', 'Sentences']);
+
+# CoreNLP
+push(@layers, ['CoreNLP', 'NamedEntities']);
+push(@layers, ['CoreNLP', 'Sentences']);
+push(@layers, ['CoreNLP', 'Morpho']);
+push(@layers, ['CoreNLP', 'Constituency']);
+
+# DeReKo
+push(@layers, ['DeReKo', 'Structure']);
+
+# Glemm
+push(@layers, ['Glemm', 'Morpho']);
+
+# Malt
+push(@layers, ['Malt', 'Dependency']);
+
+# MDParser
+push(@layers, ['MDParser', 'Dependency']);
+
+# Mate
+push(@layers, ['Mate', 'Morpho']);
+push(@layers, ['Mate', 'Dependency']);
+
+# OpenNLP
+push(@layers, ['OpenNLP', 'Morpho']);
+push(@layers, ['OpenNLP', 'Sentences']);
+
+# Schreibgebrauch
+push(@layers, ['Sgbr', 'Lemma']);
+push(@layers, ['Sgbr', 'Morpho']);
+
+# TreeTagger
+push(@layers, ['TreeTagger', 'Morpho']);
+push(@layers, ['TreeTagger', 'Sentences']);
+
+# XIP
+push(@layers, ['XIP', 'Morpho']);
+push(@layers, ['XIP', 'Constituency']);
+push(@layers, ['XIP', 'Sentences']);
+push(@layers, ['XIP', 'Dependency']);
+
+# Check filters
+my @filtered_anno;
+if ($skip{'#all'}) {
+ foreach (@anno) {
+ push @filtered_anno, [ split('#', $_) ];
+ };
+}
+
+# Add all annotations that are not skipped
+else {
+ # Add to index file - respect skipping
+ foreach my $info (@layers) {
+ # Skip if Foundry or Foundry#Layer should be skipped
+ unless ($skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])}) {
+ push @filtered_anno, $info;
+ };
+ };
+};
+
+# Get tokenization basis
+my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if ($token_base);
+
+# TODO: This should not be initialized for batch
+my $cache = Cache::FastMmap->new(
+ share_file => $cache_file,
+ cache_size => $cache_size,
+ init_file => $cache_init
+);
+
+my $batch_file = KorAP::XML::Batch::File->new(
+ cache => $cache,
+ meta_type => $meta,
+ overwrite => $overwrite,
+ foundry => $token_base_foundry,
+ layer => $token_base_layer,
+ gzip => $gzip,
+ log => $log,
+ primary => $primary,
+ pretty => $pretty,
+ anno => \@filtered_anno
+);
+
# Get file name based on path information
sub get_file_name ($) {
@@ -150,31 +247,31 @@
# Write file
-sub write_file {
- my $anno = shift;
- my $file = get_file_name $anno;
-
- # TODO: This should be done directly with a data structure! KorAP::XML::Wrap
-
- my $call = 'perl ' . $LOCAL . '/korapxml2krill';
- $call .= ' -i ' . $anno;
- $call .= ' -o ' . $output . '/' . $file . '.json';
- $call .= '.gz -z' if $gzip;
- $call .= ' -m ' . $meta if $meta;
- $call .= ' -w' if $overwrite;
- $call .= ' -t ' . $token_base if $token_base;
- $call .= ' -l ' . $log_level if $log_level;
- $call .= ' -c ' . $cache_file;
- $call .= ' -cs ' . $cache_size;
- $call .= ' --no-cache-delete'; # Don't delete the cache
- $call .= ' --no-cache-init'; # Don't initialize the cache
- $call .= ' --no-primary ' if $primary;
- $call .= ' -y ' . $pretty if $pretty;
- $call .= ' -a ' . $_ foreach @anno;
- $call .= ' -s ' . $_ foreach @skip;
- system($call);
- return "$file";
-};
+#sub write_file {
+# my $anno = shift;
+# my $file = get_file_name $anno;
+#
+# # TODO: This should be done directly with a data structure! KorAP::XML::Wrap
+#
+# my $call = 'perl ' . $LOCAL . '/korapxml2krill';
+# $call .= ' -i ' . $anno;
+# $call .= ' -o ' . $output . '/' . $file . '.json';
+# $call .= '.gz -z' if $gzip;
+# $call .= ' -m ' . $meta if $meta;
+# $call .= ' -w' if $overwrite;
+# $call .= ' -t ' . $token_base if $token_base;
+# $call .= ' -l ' . $log_level if $log_level;
+# $call .= ' -c ' . $cache_file;
+# $call .= ' -cs ' . $cache_size;
+# $call .= ' --no-cache-delete'; # Don't delete the cache
+# $call .= ' --no-cache-init'; # Don't initialize the cache
+# $call .= ' --no-primary ' if $primary;
+# $call .= ' -y ' . $pretty if $pretty;
+# $call .= ' -a ' . $_ foreach @anno;
+# $call .= ' -s ' . $_ foreach @skip;
+# system($call);
+# return "$file";
+#};
# Convert sigle to path construct
@@ -184,18 +281,6 @@
unless ($cmd) {
my $input = $input[0];
- # Can't print gzip to STDOUT
- pod2usage(%ERROR_HASH) if $gzip && !$output;
-
- my %skip;
- $skip{lc($_)} = 1 foreach @skip;
-
- # Ignore processing
- if (!$overwrite && $output && -e $output) {
- $log->trace($output . ' already exists');
- exit(0);
- };
-
BEGIN {
$main::TIME = Benchmark->new;
$main::LAST_STOP = Benchmark->new;
@@ -213,144 +298,25 @@
# Create and parse new document
$input =~ s{([^/])$}{$1/};
- my $doc = KorAP::XML::Krill->new(
- path => $input,
- meta_type => ($meta // 'I5'),
- cache => Cache::FastMmap->new(
- share_file => $cache_file,
- cache_size => $cache_size,
- init_file => $cache_init
- )
- );
- unless ($doc->parse) {
- $log->warn($output . " can't be processed - no document data");
- exit(0);
- };
-
- my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
- if ($token_base) {
- ($token_base_foundry, $token_base_layer) = split /#/, $token_base;
- };
-
- # Get tokenization
- my $tokens = KorAP::XML::Tokenizer->new(
- path => $doc->path,
- doc => $doc,
- foundry => $token_base_foundry,
- layer => $token_base_layer,
- name => 'tokens'
- );
-
- # Unable to process base tokenization
- unless ($tokens->parse) {
- $log->error($output . " can't be processed - no base tokenization");
- exit(0);
- };
-
- my @layers;
- push(@layers, ['Base', 'Sentences']);
- push(@layers, ['Base', 'Paragraphs']);
-
- # Connexor
- push(@layers, ['Connexor', 'Morpho']);
- push(@layers, ['Connexor', 'Syntax']);
- push(@layers, ['Connexor', 'Phrase']);
- push(@layers, ['Connexor', 'Sentences']);
-
- # CoreNLP
- push(@layers, ['CoreNLP', 'NamedEntities']);
- push(@layers, ['CoreNLP', 'Sentences']);
- push(@layers, ['CoreNLP', 'Morpho']);
- push(@layers, ['CoreNLP', 'Constituency']);
-
- # DeReKo
- push(@layers, ['DeReKo', 'Structure']);
-
- # Glemm
- push(@layers, ['Glemm', 'Morpho']);
-
- # Malt
- push(@layers, ['Malt', 'Dependency']);
-
- # MDParser
- push(@layers, ['MDParser', 'Dependency']);
-
- # Mate
- push(@layers, ['Mate', 'Morpho']);
- push(@layers, ['Mate', 'Dependency']);
-
- # OpenNLP
- push(@layers, ['OpenNLP', 'Morpho']);
- push(@layers, ['OpenNLP', 'Sentences']);
-
- # Schreibgebrauch
- push(@layers, ['Sgbr', 'Lemma']);
- push(@layers, ['Sgbr', 'Morpho']);
-
- # TreeTagger
- push(@layers, ['TreeTagger', 'Morpho']);
- push(@layers, ['TreeTagger', 'Sentences']);
-
- # XIP
- push(@layers, ['XIP', 'Morpho']);
- push(@layers, ['XIP', 'Constituency']);
- push(@layers, ['XIP', 'Sentences']);
- push(@layers, ['XIP', 'Dependency']);
-
-
- if ($skip{'#all'}) {
- foreach (@anno) {
- $tokens->add(split('#', $_));
- stop_time;
- };
- }
- else {
- # Add to index file - respect skipping
- foreach my $info (@layers) {
- # Skip if Foundry or Foundry#Layer should be skipped
- unless ($skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])}) {
- $tokens->add(@$info);
- stop_time;
- };
- };
- };
-
- my $file;
- my $print_text = ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));
-
- if ($output) {
-
- if ($gzip) {
- $file = IO::Compress::Gzip->new($output, Minimal => 1);
- }
- else {
- $file = IO::File->new($output, "w");
- };
-
- $file->print($print_text);
- $file->close;
- }
-
- else {
- print $print_text . "\n";
- };
+ $batch_file->process($input, $output);
# Delete cache file
unlink($cache_file) if $cache_delete;
- stop_time;
+# stop_time;
}
# Extract XML files
elsif ($cmd eq 'extract') {
- pod2usage(%ERROR_HASH) unless $output;
+warn '!!!!!!!!!!!!!------------> ';
- if ($output && (!-e $output || !-d $output)) {
- print "Directory '$output' does not exist.\n\n";
- exit(0);
- };
+if ($output && (!-e $output || !-d $output)) {
+ print "Directory '$output' does not exist.\n\n";
+ exit(0);
+};
+
# TODO: Support sigles and full archives
@@ -382,9 +348,15 @@
# Process an archive
elsif ($cmd eq 'archive') {
- # TODO: Support sigles
+warn '!!!!!!!!!!!!!------------> ';
- pod2usage(%ERROR_HASH) unless $output;
+if ($output && (!-e $output || !-d $output)) {
+ print "Directory '$output' does not exist.\n\n";
+ exit(0);
+};
+
+
+ # TODO: Support sigles
if ($output && (!-e $output || !-d $output)) {
print "Directory '$output' does not exist.\n\n";
@@ -412,14 +384,14 @@
my $t;
print "Reading data ...\n";
- unless (Cache::FastMmap->new(
- share_file => $cache_file,
- cache_size => $cache_size,
- init_file => $cache_init
- )) {
- print "Unable to intialize cache '$cache_file'\n\n";
- exit(1);
- };
+# unless (Cache::FastMmap->new(
+# share_file => $cache_file,
+# cache_size => $cache_size,
+# init_file => $cache_init
+# )) {
+#   print "Unable to initialize cache '$cache_file'\n\n";
+# exit(1);
+# };
# Input is a directory
if (-d $input[0]) {
@@ -442,30 +414,23 @@
DIRECTORY_LOOP:
for (my $i = 0; $i < $count; $i++) {
- unless ($overwrite) {
- my $filename = catfile(
- $output,
- get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
- );
-
- if (-e $filename) {
- $iter++;
- print "Skip $filename\n";
- next;
- };
- };
+ my $filename = catfile(
+ $output,
+ get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
+ );
# Get the next fork
my $pid = $pool->start and next DIRECTORY_LOOP;
my $msg;
- $msg = write_file($dirs[$i]);
+ $msg = $batch_file->process($dirs[$i] => $filename);
$pool->finish(0, \$msg);
};
}
# Input is a file
elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
+
unless ($archive->test_unzip) {
print "Unzip is not installed or incompatible.\n\n";
exit(1);
@@ -485,23 +450,13 @@
# Split path information
my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
- unless ($overwrite) {
-
- # This is not correct!!
- my $filename = catfile(
- $output,
- get_file_name(
- catfile($corpus, $doc, $text)
- . '.json' . ($gzip ? '.gz' : '')
- )
- );
-
- if (-e $filename) {
- $iter++;
- print "Skip $filename\n";
- next;
- };
- };
+ my $filename = catfile(
+ $output,
+ get_file_name(
+ catfile($corpus, $doc, $text)
+ . '.json' . ($gzip ? '.gz' : '')
+ )
+ );
# Get the next fork
my $pid = $pool->start and next ARCHIVE_LOOP;
@@ -521,7 +476,7 @@
my $dir = catdir($input, $doc, $text);
# Write file
- $msg = write_file($dir);
+ $msg = $batch_file->process($dir => $output);
$temp = undef;
$pool->finish(0, \$msg);
diff --git a/t/script/single.t b/t/script/single.t
new file mode 100644
index 0000000..053f80b
--- /dev/null
+++ b/t/script/single.t
@@ -0,0 +1,52 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/ :POSIX /;
+use Mojo::Util qw/slurp/;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
+my $input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
+my $output = tmpnam();
+
+ok(-f $script, 'Script found');
+ok(-d $input, 'Input directory found');
+
+my $call = 'perl ';
+$call .= $script . ' ';
+$call .= "--input $input ";
+$call .= "--output $output ";
+$call .= '-t OpenNLP#Tokens ';
+
+system($call);
+
+ok(my $file = slurp $output, 'Slurp data');
+ok(my $json = decode_json $file, 'decode json');
+is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
+is($json->{title}, 'Beispiel Text', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Primary text');
+is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
+
+system($call . ' -z');
+
+my $gz = IO::Uncompress::Gunzip->new($output);
+ok($gz->read($file), 'Uncompress');
+
+ok($json = decode_json $file, 'decode json');
+is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
+is($json->{title}, 'Beispiel Text', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Primary text');
+is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
+
+
+done_testing;
+__END__