Added test for script calls
Change-Id: I3f2f02257ca54c83b470cfe4b531069a44ed9739
diff --git a/MANIFEST b/MANIFEST
index 9911feb..d1d8c24 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -97,6 +97,7 @@
t/sgbr/meta_ids.t
t/sgbr/pos.t
t/sgbr/token.t
+t/script/single.t
t/corpus/archive.zip
t/corpus/BZK/header.xml
t/corpus/GOE/header.xml
diff --git a/lib/KorAP/XML/Batch/File.pm b/lib/KorAP/XML/Batch/File.pm
index d91074f..e84cb2e 100644
--- a/lib/KorAP/XML/Batch/File.pm
+++ b/lib/KorAP/XML/Batch/File.pm
@@ -6,21 +6,22 @@
use strict;
use warnings;
+# Constructor
sub new {
my $class = shift;
my %param = @_;
bless {
cache => $param{cache} // undef,
- meta_type => $param{meta_type} // 'I5',
+ meta_type => $param{meta_type} || 'I5',
overwrite => $param{overwrite},
- foundry => $param{foundry} // 'Base',
- layer => $param{layer} // 'Tokens',
- anno => $param{anno} // [[]],
- log => $param{log} // Mojo::Log->new(level => 'fatal'),
+ foundry => $param{foundry} || 'Base',
+ layer => $param{layer} || 'Tokens',
+ anno => $param{anno} || [[]],
+ log => $param{log} || Mojo::Log->new(level => 'fatal'),
primary => $param{primary},
pretty => $param{pretty},
- gzip => $param{gzip} // 0
+ gzip => $param{gzip} // 0
}, $class;
};
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 8a56858..4b06ca8 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -14,6 +14,7 @@
use KorAP::XML::Krill;
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
+use KorAP::XML::Batch::File;
use Parallel::ForkManager;
# TODO: use Parallel::Loops
# TODO: make output files
@@ -70,7 +71,6 @@
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION
-
# Parse comand
my $cmd;
our @ARGV;
@@ -125,6 +125,8 @@
# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;
+# Gzip has no effect, if no output is given
+pod2usage(%ERROR_HASH) if $gzip && !$output;
# Initialize log4perl object
Log::Log4perl->init({
@@ -136,6 +138,101 @@
my $log = Log::Log4perl->get_logger('main');
+my %skip;
+$skip{lc($_)} = 1 foreach @skip;
+
+my @layers;
+push(@layers, ['Base', 'Sentences']);
+push(@layers, ['Base', 'Paragraphs']);
+
+# Connexor
+push(@layers, ['Connexor', 'Morpho']);
+push(@layers, ['Connexor', 'Syntax']);
+push(@layers, ['Connexor', 'Phrase']);
+push(@layers, ['Connexor', 'Sentences']);
+
+# CoreNLP
+push(@layers, ['CoreNLP', 'NamedEntities']);
+push(@layers, ['CoreNLP', 'Sentences']);
+push(@layers, ['CoreNLP', 'Morpho']);
+push(@layers, ['CoreNLP', 'Constituency']);
+
+# DeReKo
+push(@layers, ['DeReKo', 'Structure']);
+
+# Glemm
+push(@layers, ['Glemm', 'Morpho']);
+
+# Malt
+push(@layers, ['Malt', 'Dependency']);
+
+# MDParser
+push(@layers, ['MDParser', 'Dependency']);
+
+# Mate
+push(@layers, ['Mate', 'Morpho']);
+push(@layers, ['Mate', 'Dependency']);
+
+# OpenNLP
+push(@layers, ['OpenNLP', 'Morpho']);
+push(@layers, ['OpenNLP', 'Sentences']);
+
+# Schreibgebrauch
+push(@layers, ['Sgbr', 'Lemma']);
+push(@layers, ['Sgbr', 'Morpho']);
+
+# TreeTagger
+push(@layers, ['TreeTagger', 'Morpho']);
+push(@layers, ['TreeTagger', 'Sentences']);
+
+# XIP
+push(@layers, ['XIP', 'Morpho']);
+push(@layers, ['XIP', 'Constituency']);
+push(@layers, ['XIP', 'Sentences']);
+push(@layers, ['XIP', 'Dependency']);
+
+# Check filters
+my @filtered_anno;
+if ($skip{'#all'}) {
+ foreach (@anno) {
+ push @filtered_anno, [ split('#', $_) ];
+ };
+}
+
+# Add all annotations that are not skipped
+else {
+ # Add to index file - respect skipping
+ foreach my $info (@layers) {
+ # Skip if Foundry or Foundry#Layer should be skipped
+ unless ($skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])}) {
+ push @filtered_anno, $info;
+ };
+ };
+};
+
+# Get tokenization basis
+my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if ($token_base);
+
+# TODO: This should not be initialized for batch
+my $cache = Cache::FastMmap->new(
+ share_file => $cache_file,
+ cache_size => $cache_size,
+ init_file => $cache_init
+);
+
+my $batch_file = KorAP::XML::Batch::File->new(
+ cache => $cache,
+ meta_type => $meta,
+ overwrite => $overwrite,
+ foundry => $token_base_foundry,
+ layer => $token_base_layer,
+ gzip => $gzip,
+ log => $log,
+ primary => $primary,
+ pretty => $pretty,
+ anno => \@filtered_anno
+);
+
# Get file name based on path information
sub get_file_name ($) {
@@ -150,31 +247,31 @@
# Write file
-sub write_file {
- my $anno = shift;
- my $file = get_file_name $anno;
-
- # TODO: This should be done directly with a data structure! KorAP::XML::Wrap
-
- my $call = 'perl ' . $LOCAL . '/korapxml2krill';
- $call .= ' -i ' . $anno;
- $call .= ' -o ' . $output . '/' . $file . '.json';
- $call .= '.gz -z' if $gzip;
- $call .= ' -m ' . $meta if $meta;
- $call .= ' -w' if $overwrite;
- $call .= ' -t ' . $token_base if $token_base;
- $call .= ' -l ' . $log_level if $log_level;
- $call .= ' -c ' . $cache_file;
- $call .= ' -cs ' . $cache_size;
- $call .= ' --no-cache-delete'; # Don't delete the cache
- $call .= ' --no-cache-init'; # Don't initialize the cache
- $call .= ' --no-primary ' if $primary;
- $call .= ' -y ' . $pretty if $pretty;
- $call .= ' -a ' . $_ foreach @anno;
- $call .= ' -s ' . $_ foreach @skip;
- system($call);
- return "$file";
-};
+#sub write_file {
+# my $anno = shift;
+# my $file = get_file_name $anno;
+#
+# # TODO: This should be done directly with a data structure! KorAP::XML::Wrap
+#
+# my $call = 'perl ' . $LOCAL . '/korapxml2krill';
+# $call .= ' -i ' . $anno;
+# $call .= ' -o ' . $output . '/' . $file . '.json';
+# $call .= '.gz -z' if $gzip;
+# $call .= ' -m ' . $meta if $meta;
+# $call .= ' -w' if $overwrite;
+# $call .= ' -t ' . $token_base if $token_base;
+# $call .= ' -l ' . $log_level if $log_level;
+# $call .= ' -c ' . $cache_file;
+# $call .= ' -cs ' . $cache_size;
+# $call .= ' --no-cache-delete'; # Don't delete the cache
+# $call .= ' --no-cache-init'; # Don't initialize the cache
+# $call .= ' --no-primary ' if $primary;
+# $call .= ' -y ' . $pretty if $pretty;
+# $call .= ' -a ' . $_ foreach @anno;
+# $call .= ' -s ' . $_ foreach @skip;
+# system($call);
+# return "$file";
+#};
# Convert sigle to path construct
@@ -184,18 +281,6 @@
unless ($cmd) {
my $input = $input[0];
- # Can't print gzip to STDOUT
- pod2usage(%ERROR_HASH) if $gzip && !$output;
-
- my %skip;
- $skip{lc($_)} = 1 foreach @skip;
-
- # Ignore processing
- if (!$overwrite && $output && -e $output) {
- $log->trace($output . ' already exists');
- exit(0);
- };
-
BEGIN {
$main::TIME = Benchmark->new;
$main::LAST_STOP = Benchmark->new;
@@ -213,144 +298,25 @@
# Create and parse new document
$input =~ s{([^/])$}{$1/};
- my $doc = KorAP::XML::Krill->new(
- path => $input,
- meta_type => ($meta // 'I5'),
- cache => Cache::FastMmap->new(
- share_file => $cache_file,
- cache_size => $cache_size,
- init_file => $cache_init
- )
- );
- unless ($doc->parse) {
- $log->warn($output . " can't be processed - no document data");
- exit(0);
- };
-
- my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
- if ($token_base) {
- ($token_base_foundry, $token_base_layer) = split /#/, $token_base;
- };
-
- # Get tokenization
- my $tokens = KorAP::XML::Tokenizer->new(
- path => $doc->path,
- doc => $doc,
- foundry => $token_base_foundry,
- layer => $token_base_layer,
- name => 'tokens'
- );
-
- # Unable to process base tokenization
- unless ($tokens->parse) {
- $log->error($output . " can't be processed - no base tokenization");
- exit(0);
- };
-
- my @layers;
- push(@layers, ['Base', 'Sentences']);
- push(@layers, ['Base', 'Paragraphs']);
-
- # Connexor
- push(@layers, ['Connexor', 'Morpho']);
- push(@layers, ['Connexor', 'Syntax']);
- push(@layers, ['Connexor', 'Phrase']);
- push(@layers, ['Connexor', 'Sentences']);
-
- # CoreNLP
- push(@layers, ['CoreNLP', 'NamedEntities']);
- push(@layers, ['CoreNLP', 'Sentences']);
- push(@layers, ['CoreNLP', 'Morpho']);
- push(@layers, ['CoreNLP', 'Constituency']);
-
- # DeReKo
- push(@layers, ['DeReKo', 'Structure']);
-
- # Glemm
- push(@layers, ['Glemm', 'Morpho']);
-
- # Malt
- push(@layers, ['Malt', 'Dependency']);
-
- # MDParser
- push(@layers, ['MDParser', 'Dependency']);
-
- # Mate
- push(@layers, ['Mate', 'Morpho']);
- push(@layers, ['Mate', 'Dependency']);
-
- # OpenNLP
- push(@layers, ['OpenNLP', 'Morpho']);
- push(@layers, ['OpenNLP', 'Sentences']);
-
- # Schreibgebrauch
- push(@layers, ['Sgbr', 'Lemma']);
- push(@layers, ['Sgbr', 'Morpho']);
-
- # TreeTagger
- push(@layers, ['TreeTagger', 'Morpho']);
- push(@layers, ['TreeTagger', 'Sentences']);
-
- # XIP
- push(@layers, ['XIP', 'Morpho']);
- push(@layers, ['XIP', 'Constituency']);
- push(@layers, ['XIP', 'Sentences']);
- push(@layers, ['XIP', 'Dependency']);
-
-
- if ($skip{'#all'}) {
- foreach (@anno) {
- $tokens->add(split('#', $_));
- stop_time;
- };
- }
- else {
- # Add to index file - respect skipping
- foreach my $info (@layers) {
- # Skip if Foundry or Foundry#Layer should be skipped
- unless ($skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])}) {
- $tokens->add(@$info);
- stop_time;
- };
- };
- };
-
- my $file;
- my $print_text = ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));
-
- if ($output) {
-
- if ($gzip) {
- $file = IO::Compress::Gzip->new($output, Minimal => 1);
- }
- else {
- $file = IO::File->new($output, "w");
- };
-
- $file->print($print_text);
- $file->close;
- }
-
- else {
- print $print_text . "\n";
- };
+ $batch_file->process($input, $output);
# Delete cache file
unlink($cache_file) if $cache_delete;
- stop_time;
+# stop_time;
}
# Extract XML files
elsif ($cmd eq 'extract') {
- pod2usage(%ERROR_HASH) unless $output;
+warn '!!!!!!!!!!!!!------------> ';
- if ($output && (!-e $output || !-d $output)) {
- print "Directory '$output' does not exist.\n\n";
- exit(0);
- };
+if ($output && (!-e $output || !-d $output)) {
+ print "Directory '$output' does not exist.\n\n";
+ exit(0);
+};
+
# TODO: Support sigles and full archives
@@ -382,9 +348,15 @@
# Process an archive
elsif ($cmd eq 'archive') {
- # TODO: Support sigles
+warn '!!!!!!!!!!!!!------------> ';
- pod2usage(%ERROR_HASH) unless $output;
+if ($output && (!-e $output || !-d $output)) {
+ print "Directory '$output' does not exist.\n\n";
+ exit(0);
+};
+
+
+ # TODO: Support sigles
if ($output && (!-e $output || !-d $output)) {
print "Directory '$output' does not exist.\n\n";
@@ -412,14 +384,14 @@
my $t;
print "Reading data ...\n";
- unless (Cache::FastMmap->new(
- share_file => $cache_file,
- cache_size => $cache_size,
- init_file => $cache_init
- )) {
- print "Unable to intialize cache '$cache_file'\n\n";
- exit(1);
- };
+# unless (Cache::FastMmap->new(
+# share_file => $cache_file,
+# cache_size => $cache_size,
+# init_file => $cache_init
+# )) {
+#   print "Unable to initialize cache '$cache_file'\n\n";
+# exit(1);
+# };
# Input is a directory
if (-d $input[0]) {
@@ -442,30 +414,23 @@
DIRECTORY_LOOP:
for (my $i = 0; $i < $count; $i++) {
- unless ($overwrite) {
- my $filename = catfile(
- $output,
- get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
- );
-
- if (-e $filename) {
- $iter++;
- print "Skip $filename\n";
- next;
- };
- };
+ my $filename = catfile(
+ $output,
+ get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
+ );
# Get the next fork
my $pid = $pool->start and next DIRECTORY_LOOP;
my $msg;
- $msg = write_file($dirs[$i]);
+ $msg = $batch_file->process($dirs[$i] => $filename);
$pool->finish(0, \$msg);
};
}
# Input is a file
elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
+
unless ($archive->test_unzip) {
print "Unzip is not installed or incompatible.\n\n";
exit(1);
@@ -485,23 +450,13 @@
# Split path information
my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
- unless ($overwrite) {
-
- # This is not correct!!
- my $filename = catfile(
- $output,
- get_file_name(
- catfile($corpus, $doc, $text)
- . '.json' . ($gzip ? '.gz' : '')
- )
- );
-
- if (-e $filename) {
- $iter++;
- print "Skip $filename\n";
- next;
- };
- };
+ my $filename = catfile(
+ $output,
+ get_file_name(
+ catfile($corpus, $doc, $text)
+ . '.json' . ($gzip ? '.gz' : '')
+ )
+ );
# Get the next fork
my $pid = $pool->start and next ARCHIVE_LOOP;
@@ -521,7 +476,7 @@
my $dir = catdir($input, $doc, $text);
# Write file
- $msg = write_file($dir);
+ $msg = $batch_file->process($dir => $output);
$temp = undef;
$pool->finish(0, \$msg);
diff --git a/t/script/single.t b/t/script/single.t
new file mode 100644
index 0000000..053f80b
--- /dev/null
+++ b/t/script/single.t
@@ -0,0 +1,52 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/ :POSIX /;
+use Mojo::Util qw/slurp/;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
+my $input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
+my $output = tmpnam();
+
+ok(-f $script, 'Script found');
+ok(-d $input, 'Input directory found');
+
+my $call = 'perl ';
+$call .= $script . ' ';
+$call .= "--input $input ";
+$call .= "--output $output ";
+$call .= '-t OpenNLP#Tokens ';
+
+system($call);
+
+ok(my $file = slurp $output, 'Slurp data');
+ok(my $json = decode_json $file, 'decode json');
+is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
+is($json->{title}, 'Beispiel Text', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Primary text');
+is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
+
+system($call . ' -z');
+
+my $gz = IO::Uncompress::Gunzip->new($output);
+ok($gz->read($file), 'Uncompress');
+
+ok($json = decode_json $file, 'decode json');
+is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
+is($json->{title}, 'Beispiel Text', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Primary text');
+is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
+
+
+done_testing;
+__END__