Fixed annotation bug in script

Change-Id: I2aef3e9d2bb73adf4969f0378e2bbe4914cee48a

commit: 5f51d4251005f9db8346fef36f8713e453c341ea [log] [tgz]
author: Akron <nils@diewald-online.de> Tue Aug 16 16:26:43 2016 +0200
committer: Akron <nils@diewald-online.de> Tue Aug 16 16:26:43 2016 +0200
tree: 0b37c40c915699da44a34620357cc6e81d8e351f
parent: 92ad95b5d478a3a4665cb953dbcd1441d2d4e89b [diff]
diff --git a/Changes b/Changes
index e4a5f80..065409b 100644
--- a/Changes
+++ b/Changes

@@ -9,39 +9,41 @@
         - Added test for direct I5 support.
         - Fixed support for Mojolicious 7.
         - Added script test.
+        - Fixed setting multiple annotations in
+          script.
 
 0.17 2016-03-22
         - Rewrite siglen to use slashes as separators.
-	- Zip listing optimized. Does no longer work with primary data
-	  in text.xml files.
+	      - Zip listing optimized. Does no longer work with primary data
+	        in text.xml files.
 
 0.16 2016-03-18
         - Added caching mechanism for
-	  metadata.
+	        metadata.
 
 0.15 2016-03-17
         - Modularized metadata handling.
         - Simplified metadata handling.
-	- Added --meta option to script.
-	- Removed deprecated --human option from script.
+	      - Added --meta option to script.
+        - Removed deprecated --human option from script.
 
 0.14 2016-03-15
         - Renamed ::Index to ::Annotate and ::Field to ::Index.
-	- Renamed 'allow' to 'anno' as parameters of the script. 
-	- Added readme.
+        - Renamed 'allow' to 'anno' as parameters of the script. 
+        - Added readme.
 
 0.13 2016-03-10
         - Removed korapxml2krill_dir.
-	- Renamed dependency nodes.
-	- Made dependency relations more effective (trimmed down TUIs)
-	  ! This is currently very slow !
+	      - Renamed dependency nodes.
+        - Made dependency relations more effective (trimmed down TUIs)
+	        ! This is currently very slow !
 
 0.12 2016-02-28
         - Added extract method to korapxml2krill.
-	- Fixed Mate/Dependency.
-	- Fixed skip flag in korapxml2krill.
-	- Ignore spans outside the token range
-	  (i.e. character offsets end before tokens have started).
+        - Fixed Mate/Dependency.
+        - Fixed skip flag in korapxml2krill.
+        - Ignore spans outside the token range
+          (i.e. character offsets end before tokens have started).
 
 0.11 2016-02-23
         - Merged korapxml2krill and korapxml2krill_dir.
@@ -51,43 +53,43 @@
 
 0.09 2016-02-15
         - Fixed temporary directory handling in scripts.
-	- Improved skipping for archive handling in scripts.
+        - Improved skipping for archive handling in scripts.
 
 0.08 2016-02-14
         - Added support for archive streaming.
-	- Improved scripts.
+        - Improved scripts.
 
 0.07 2016-02-13
         - Improved support for Schreibgebrauch meta data
-	  (IDS flavour).
+          (IDS flavour).
 
 0.06 2016-02-11
         - Improved support for Schreibgebrauch meta data
-	  (Duden flavour).
+          (Duden flavour).
 
 0.05 2016-02-04
         - Changed KorAP::Document to KorAP::XML::Krill.
-	- Renamed "Schreibgebrauch" to "Sgbr".
-	- Preparation for GitHub release.
+        - Renamed "Schreibgebrauch" to "Sgbr".
+        - Preparation for GitHub release.
 
 0.04 2016-01-28
         - Added PTI to all payloads.
-	- Added support for empty elements.
-	- Added support for element attributes in struct.
-	- Added meta data support for Schreibgebrauch.
-	- Fixed test suite for meta data.
+        - Added support for empty elements.
+        - Added support for element attributes in struct.
+        - Added meta data support for Schreibgebrauch.
+        - Fixed test suite for meta data.
 
 0.03 2014-11-03
         - Added new metadata scheme.
-	- Fixed a minor bug in the constituency tree building.
-	- Sorted terms in tokens a priori.
+        - Fixed a minor bug in the constituency tree building.
+        - Sorted terms in tokens a priori.
 
 0.02 2014-07-21
         - Sentence annotations for all providing foundries
-	- Starting subtokenization 
+        - Starting subtokenization 
 
 0.01 2014-04-15
-	- [bugfix] for first token annotations
+        - [bugfix] for first token annotations
         - Sentences are now available from all foundries that have it
         - <>:p is now <>:base/para
-	- Added <>:base/text
+        - Added <>:base/text

diff --git a/Makefile.PL b/Makefile.PL
index 8e3aae2..efe949b 100644
--- a/Makefile.PL
+++ b/Makefile.PL

@@ -40,9 +40,10 @@
   test => {
     TESTS =>
       't/*.t '.
-	't/annotation/*.t ' .
-	  't/sgbr/*.t ' .
-	    't/real/*.t'
+      't/annotation/*.t ' .
+      't/sgbr/*.t ' .
+      't/real/*.t ' .
+      't/script/*.t '
 	  },
   EXE_FILES => ['script/korapxml2krill']
 );

diff --git a/lib/KorAP/XML/Batch/File.pm b/lib/KorAP/XML/Batch/File.pm
index e84cb2e..76a4a46 100644
--- a/lib/KorAP/XML/Batch/File.pm
+++ b/lib/KorAP/XML/Batch/File.pm

@@ -27,9 +27,7 @@
 
 # Process a file
 sub process {
-  my $self = shift;
-  my $input = shift;
-  my $output = shift;
+  my ($self, $input, $output) = @_;
 
   if (!$self->{overwrite} && $output && -e $output) {
     $self->{log}->debug($output . ' already exists');
@@ -71,18 +69,29 @@
 
   my $file;
   my $print_text = ($self->{pretty} ? $tokens->to_pretty_json($self->{primary}) : $tokens->to_json($self->{primary}));
+
+  # There is an output file given
   if ($output) {
+
     if ($self->{gzip}) {
-      $file = IO::Compress::Gzip->new($output, Minimal => 1);
+      $file = IO::Compress::Gzip->new($output, TextFlag => 1, Minimal => 1);
     }
     else {
       $file = IO::File->new($output, "w");
     };
 
-    $file->print($print_text);
+    # Write to output
+    unless ($file->print($print_text)) {
+      $self->{log}->error('Unable to write to ' . $file);
+    };
+
+    # Flush pending data
+    # $file->flush if $self->{gzip};
+
     $file->close;
   }
 
+  # Direct output to STDOUT
   else {
     print $print_text . "\n";
   };

diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 5740492..5b9ee6f 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm

@@ -408,7 +408,7 @@
       return $retval;
     }
     else {
-      $self->log->warn('Unable to parse '.$mod);
+      $self->log->debug('Unable to parse '.$mod);
     };
   }
   else {

diff --git a/script/korapxml2krill b/script/korapxml2krill
index 4b06ca8..e668f07 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill

@@ -65,7 +65,7 @@
 # - Added MDParser#Dependency
 # ----------------------------------------------------------
 
-our $LAST_CHANGE = '2016/07/06';
+our $LAST_CHANGE = '2016/08/16';
 our $LOCAL = $FindBin::Bin;
 our $VERSION_MSG = <<"VERSION";
 Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -78,7 +78,7 @@
   $cmd = shift @ARGV;
 };
 
-my (@skip, @sigle, @input);
+my (@skip, @sigle, @anno, @input);
 my $text;
 
 # Parse options from the command line
@@ -96,7 +96,7 @@
   'cache-delete|cd!' => \(my $cache_delete = 1),
   'cache-init|ci!'   => \(my $cache_init = 1),
   'log|l=s'     => \(my $log_level = 'ERROR'),
-  'anno|a=s'    => \(my @anno),
+  'anno|a=s'    => \@anno,
   'primary|p!'  => \(my $primary),
   'pretty|y'    => \(my $pretty),
   'jobs|j=i'    => \(my $jobs = 0),
@@ -288,11 +288,11 @@
 
   sub stop_time {
     my $new = Benchmark->new;
-    $log->trace(
+    $log->info(
       'The code took: '.
-	timestr(timediff($new, $main::LAST_STOP)) .
-	  ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
-	);
+        timestr(timediff($new, $main::LAST_STOP)) .
+        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
+      );
     $main::LAST_STOP = $new;
   };
 
@@ -304,7 +304,7 @@
   # Delete cache file
   unlink($cache_file) if $cache_delete;
 
-#  stop_time;
+  stop_time;
 }
 
 # Extract XML files

diff --git a/t/script.t b/t/script.t
deleted file mode 100644
index 5122b63..0000000
--- a/t/script.t
+++ /dev/null

@@ -1,18 +0,0 @@
-use strict;
-use warnings;
-use Test::More;
-use File::Basename 'dirname';
-use File::Spec::Functions qw/catfile/;
-use Test::Output;
-use FindBin;
-
-my $script = catfile(dirname(__FILE__), '..', 'script', 'korapxml2krill');
-
-stdout_like(
-  sub { system('perl', $script) },
-  qr!Usage.+?korapxml2krill!s,
-  'Usage output'
-);
-
-done_testing;
-__END__

diff --git a/t/script/single.t b/t/script/single.t
index 053f80b..cda1b57 100644
--- a/t/script/single.t
+++ b/t/script/single.t

@@ -8,45 +8,132 @@
 use Mojo::JSON qw/decode_json/;
 use IO::Uncompress::Gunzip;
 use Test::More;
+use Test::Output;
+use Data::Dumper;
 
 my $f = dirname(__FILE__);
 my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
-my $input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
-my $output = tmpnam();
-
 ok(-f $script, 'Script found');
+
+stdout_like(
+  sub { system('perl', $script) },
+  qr!Usage.+?korapxml2krill!s,
+  'Usage output'
+);
+
+my $input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
 ok(-d $input, 'Input directory found');
 
-my $call = 'perl ';
-$call .= $script . ' ';
-$call .= "--input $input ";
-$call .= "--output $output ";
-$call .= '-t OpenNLP#Tokens ';
+my $output = tmpnam();
 
-system($call);
+ok(!-f $output, 'Output does not exist');
 
-ok(my $file = slurp $output, 'Slurp data');
-ok(my $json = decode_json $file, 'decode json');
+my $call = join(
+  ' ',
+  'perl', $script,
+  '--input' => $input,
+  '--output' => $output,
+  '-t' => 'OpenNLP#Tokens',
+  '-l' => 'INFO'
+);
+
+# Test without compression
+stderr_like(
+  sub {
+    system($call);
+  },
+  qr!The code took!,
+  $call
+);
+
+ok(-f $output, 'Output does exist');
+ok((my $file = slurp $output), 'Slurp data');
+ok((my $json = decode_json $file), 'decode json');
 is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
 is($json->{title}, 'Beispiel Text', 'Title');
 is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Title');
 is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
 like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
 is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
 
-system($call . ' -z');
+# Delete output
+unlink $output;
+ok(!-f $output, 'Output does not exist');
 
-my $gz = IO::Uncompress::Gunzip->new($output);
-ok($gz->read($file), 'Uncompress');
+$call .= ' -z';
 
-ok($json = decode_json $file, 'decode json');
+# Test with compression
+stderr_like(
+  sub { system($call); },
+  qr!The code took!,
+  $call
+);
+
+ok(-f $output, 'Output does exist');
+
+# Uncompress the data using a buffer
+my $gz = IO::Uncompress::Gunzip->new($output, Transparent => 0);
+($file, my $buffer) = '';
+while ($gz->read($buffer)) {
+  $file .= $buffer;
+};
+
+ok($json = decode_json($file), 'decode json');
+
 is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
 is($json->{title}, 'Beispiel Text', 'Title');
-is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
 is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
 like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
 is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
 
+# Delete output
+unlink $output;
+ok(!-f $output, 'Output does not exist');
+
+# Use a different token source and skip all annotations,
+# except for DeReKo#Structure and Mate#Dependency
+$call = join(
+  ' ',
+  'perl', $script,
+  '--input' => $input,
+  '--output' => $output,
+  '-t' => 'CoreNLP#Tokens',
+  '-s' => '#all',
+  '-a' => 'DeReKo#Structure',
+  '-a' => 'Mate#Dependency',
+  '-l' => 'INFO'
+);
+
+stderr_like(
+  sub {
+    system($call);
+  },
+  qr!The code took!,
+  $call
+);
+
+ok(-f $output, 'Output does exist');
+ok(($file = slurp $output), 'Slurp data');
+ok(($json = decode_json $file), 'decode json');
+
+is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
+
+is($json->{title}, 'Beispiel Text', 'Title');
+is($json->{data}->{tokenSource}, 'corenlp#tokens', 'TokenSource');
+is($json->{data}->{foundries}, 'dereko dereko/structure mate mate/dependency', 'Foundries');
+
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
+is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>20', 'Tokens');
+
+# Test overwrite!!!
+# Test meta
+# Test sigle!
+# Test help
+# Test version
+
 
 done_testing;
 __END__
+
commit	5f51d4251005f9db8346fef36f8713e453c341ea	[log] [tgz]
author	Akron <nils@diewald-online.de>	Tue Aug 16 16:26:43 2016 +0200
committer	Akron <nils@diewald-online.de>	Tue Aug 16 16:26:43 2016 +0200
tree	0b37c40c915699da44a34620357cc6e81d8e351f
parent	92ad95b5d478a3a4665cb953dbcd1441d2d4e89b [diff]