Fixed annotation bug in script
Change-Id: I2aef3e9d2bb73adf4969f0378e2bbe4914cee48a
diff --git a/Changes b/Changes
index e4a5f80..065409b 100644
--- a/Changes
+++ b/Changes
@@ -9,39 +9,41 @@
- Added test for direct I5 support.
- Fixed support for Mojolicious 7.
- Added script test.
+ - Fixed setting multiple annotations in
+ script.
0.17 2016-03-22
- Rewrite siglen to use slashes as separators.
- - Zip listing optimized. Does no longer work with primary data
- in text.xml files.
+ - Zip listing optimized. No longer works with primary data
+ in text.xml files.
0.16 2016-03-18
- Added caching mechanism for
- metadata.
+ metadata.
0.15 2016-03-17
- Modularized metadata handling.
- Simplified metadata handling.
- - Added --meta option to script.
- - Removed deprecated --human option from script.
+ - Added --meta option to script.
+ - Removed deprecated --human option from script.
0.14 2016-03-15
- Renamed ::Index to ::Annotate and ::Field to ::Index.
- - Renamed 'allow' to 'anno' as parameters of the script.
- - Added readme.
+ - Renamed 'allow' to 'anno' as parameters of the script.
+ - Added readme.
0.13 2016-03-10
- Removed korapxml2krill_dir.
- - Renamed dependency nodes.
- - Made dependency relations more effective (trimmed down TUIs)
- ! This is currently very slow !
+ - Renamed dependency nodes.
+ - Made dependency relations more effective (trimmed down TUIs)
+ ! This is currently very slow !
0.12 2016-02-28
- Added extract method to korapxml2krill.
- - Fixed Mate/Dependency.
- - Fixed skip flag in korapxml2krill.
- - Ignore spans outside the token range
- (i.e. character offsets end before tokens have started).
+ - Fixed Mate/Dependency.
+ - Fixed skip flag in korapxml2krill.
+ - Ignore spans outside the token range
+ (i.e. character offsets end before tokens have started).
0.11 2016-02-23
- Merged korapxml2krill and korapxml2krill_dir.
@@ -51,43 +53,43 @@
0.09 2016-02-15
- Fixed temporary directory handling in scripts.
- - Improved skipping for archive handling in scripts.
+ - Improved skipping for archive handling in scripts.
0.08 2016-02-14
- Added support for archive streaming.
- - Improved scripts.
+ - Improved scripts.
0.07 2016-02-13
- Improved support for Schreibgebrauch meta data
- (IDS flavour).
+ (IDS flavour).
0.06 2016-02-11
- Improved support for Schreibgebrauch meta data
- (Duden flavour).
+ (Duden flavour).
0.05 2016-02-04
- Changed KorAP::Document to KorAP::XML::Krill.
- - Renamed "Schreibgebrauch" to "Sgbr".
- - Preparation for GitHub release.
+ - Renamed "Schreibgebrauch" to "Sgbr".
+ - Preparation for GitHub release.
0.04 2016-01-28
- Added PTI to all payloads.
- - Added support for empty elements.
- - Added support for element attributes in struct.
- - Added meta data support for Schreibgebrauch.
- - Fixed test suite for meta data.
+ - Added support for empty elements.
+ - Added support for element attributes in struct.
+ - Added meta data support for Schreibgebrauch.
+ - Fixed test suite for meta data.
0.03 2014-11-03
- Added new metadata scheme.
- - Fixed a minor bug in the constituency tree building.
- - Sorted terms in tokens a priori.
+ - Fixed a minor bug in the constituency tree building.
+ - Sorted terms in tokens a priori.
0.02 2014-07-21
- Sentence annotations for all providing foundries
- - Starting subtokenization
+ - Starting subtokenization
0.01 2014-04-15
- - [bugfix] for first token annotations
+ - [bugfix] for first token annotations
- Sentences are now available from all foundries that have it
- <>:p is now <>:base/para
- - Added <>:base/text
+ - Added <>:base/text
diff --git a/Makefile.PL b/Makefile.PL
index 8e3aae2..efe949b 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -40,9 +40,10 @@
test => {
TESTS =>
't/*.t '.
- 't/annotation/*.t ' .
- 't/sgbr/*.t ' .
- 't/real/*.t'
+ 't/annotation/*.t ' .
+ 't/sgbr/*.t ' .
+ 't/real/*.t ' .
+ 't/script/*.t '
},
EXE_FILES => ['script/korapxml2krill']
);
diff --git a/lib/KorAP/XML/Batch/File.pm b/lib/KorAP/XML/Batch/File.pm
index e84cb2e..76a4a46 100644
--- a/lib/KorAP/XML/Batch/File.pm
+++ b/lib/KorAP/XML/Batch/File.pm
@@ -27,9 +27,7 @@
# Process a file
sub process {
- my $self = shift;
- my $input = shift;
- my $output = shift;
+ my ($self, $input, $output) = @_;
if (!$self->{overwrite} && $output && -e $output) {
$self->{log}->debug($output . ' already exists');
@@ -71,18 +69,29 @@
my $file;
my $print_text = ($self->{pretty} ? $tokens->to_pretty_json($self->{primary}) : $tokens->to_json($self->{primary}));
+
+ # There is an output file given
if ($output) {
+
if ($self->{gzip}) {
- $file = IO::Compress::Gzip->new($output, Minimal => 1);
+ $file = IO::Compress::Gzip->new($output, TextFlag => 1, Minimal => 1);
}
else {
$file = IO::File->new($output, "w");
};
- $file->print($print_text);
+ # Write to output; on failure report the target path (not the handle object)
+ unless ($file->print($print_text)) {
+ $self->{log}->error('Unable to write to ' . $output);
+ };
+
+ # Flush pending data
+ # $file->flush if $self->{gzip};
+
$file->close;
}
+ # Direct output to STDOUT
else {
print $print_text . "\n";
};
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 5740492..5b9ee6f 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -408,7 +408,7 @@
return $retval;
}
else {
- $self->log->warn('Unable to parse '.$mod);
+ $self->log->debug('Unable to parse '.$mod);
};
}
else {
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 4b06ca8..e668f07 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -65,7 +65,7 @@
# - Added MDParser#Dependency
# ----------------------------------------------------------
-our $LAST_CHANGE = '2016/07/06';
+our $LAST_CHANGE = '2016/08/16';
our $LOCAL = $FindBin::Bin;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -78,7 +78,7 @@
$cmd = shift @ARGV;
};
-my (@skip, @sigle, @input);
+my (@skip, @sigle, @anno, @input);
my $text;
# Parse options from the command line
@@ -96,7 +96,7 @@
'cache-delete|cd!' => \(my $cache_delete = 1),
'cache-init|ci!' => \(my $cache_init = 1),
'log|l=s' => \(my $log_level = 'ERROR'),
- 'anno|a=s' => \(my @anno),
+ 'anno|a=s' => \@anno,
'primary|p!' => \(my $primary),
'pretty|y' => \(my $pretty),
'jobs|j=i' => \(my $jobs = 0),
@@ -288,11 +288,11 @@
sub stop_time {
my $new = Benchmark->new;
- $log->trace(
+ $log->info(
'The code took: '.
- timestr(timediff($new, $main::LAST_STOP)) .
- ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
- );
+ timestr(timediff($new, $main::LAST_STOP)) .
+ ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
+ );
$main::LAST_STOP = $new;
};
@@ -304,7 +304,7 @@
# Delete cache file
unlink($cache_file) if $cache_delete;
-# stop_time;
+ stop_time;
}
# Extract XML files
diff --git a/t/script.t b/t/script.t
deleted file mode 100644
index 5122b63..0000000
--- a/t/script.t
+++ /dev/null
@@ -1,18 +0,0 @@
-use strict;
-use warnings;
-use Test::More;
-use File::Basename 'dirname';
-use File::Spec::Functions qw/catfile/;
-use Test::Output;
-use FindBin;
-
-my $script = catfile(dirname(__FILE__), '..', 'script', 'korapxml2krill');
-
-stdout_like(
- sub { system('perl', $script) },
- qr!Usage.+?korapxml2krill!s,
- 'Usage output'
-);
-
-done_testing;
-__END__
diff --git a/t/script/single.t b/t/script/single.t
index 053f80b..cda1b57 100644
--- a/t/script/single.t
+++ b/t/script/single.t
@@ -8,45 +8,132 @@
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip;
use Test::More;
+use Test::Output;
+use Data::Dumper;
my $f = dirname(__FILE__);
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
-my $input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
-my $output = tmpnam();
-
ok(-f $script, 'Script found');
+
+stdout_like(
+ sub { system('perl', $script) },
+ qr!Usage.+?korapxml2krill!s,
+ 'Usage output'
+);
+
+my $input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
ok(-d $input, 'Input directory found');
-my $call = 'perl ';
-$call .= $script . ' ';
-$call .= "--input $input ";
-$call .= "--output $output ";
-$call .= '-t OpenNLP#Tokens ';
+my $output = tmpnam();
-system($call);
+ok(!-f $output, 'Output does not exist');
-ok(my $file = slurp $output, 'Slurp data');
-ok(my $json = decode_json $file, 'decode json');
+my $call = join(
+ ' ',
+ 'perl', $script,
+ '--input' => $input,
+ '--output' => $output,
+ '-t' => 'OpenNLP#Tokens',
+ '-l' => 'INFO'
+);
+
+# Test without compression
+stderr_like(
+ sub {
+ system($call);
+ },
+ qr!The code took!,
+ $call
+);
+
+ok(-f $output, 'Output does exist');
+ok((my $file = slurp $output), 'Slurp data');
+ok((my $json = decode_json $file), 'decode json');
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Title');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
-system($call . ' -z');
+# Delete output
+unlink $output;
+ok(!-f $output, 'Output does not exist');
-my $gz = IO::Uncompress::Gunzip->new($output);
-ok($gz->read($file), 'Uncompress');
+$call .= ' -z';
-ok($json = decode_json $file, 'decode json');
+# Test with compression
+stderr_like(
+ sub { system($call); },
+ qr!The code took!,
+ $call
+);
+
+ok(-f $output, 'Output does exist');
+
+# Uncompress the data using a buffer (read() yields 0 at EOF, <0 on error)
+my $gz = IO::Uncompress::Gunzip->new($output, Transparent => 0);
+($file, my $buffer) = ('', '');
+while ($gz->read($buffer) > 0) {
+ $file .= $buffer;
+};
+
+ok($json = decode_json($file), 'decode json');
+
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
-is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
+# Delete output
+unlink $output;
+ok(!-f $output, 'Output does not exist');
+
+# Use a different token source and skip all annotations,
+# except for DeReKo#Structure and Mate#Dependency
+$call = join(
+ ' ',
+ 'perl', $script,
+ '--input' => $input,
+ '--output' => $output,
+ '-t' => 'CoreNLP#Tokens',
+ '-s' => '#all',
+ '-a' => 'DeReKo#Structure',
+ '-a' => 'Mate#Dependency',
+ '-l' => 'INFO'
+);
+
+stderr_like(
+ sub {
+ system($call);
+ },
+ qr!The code took!,
+ $call
+);
+
+ok(-f $output, 'Output does exist');
+ok(($file = slurp $output), 'Slurp data');
+ok(($json = decode_json $file), 'decode json');
+
+is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
+
+is($json->{title}, 'Beispiel Text', 'Title');
+is($json->{data}->{tokenSource}, 'corenlp#tokens', 'TokenSource');
+is($json->{data}->{foundries}, 'dereko dereko/structure mate mate/dependency', 'Foundries');
+
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
+is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>20', 'Tokens');
+
+# Test overwrite!!!
+# Test meta
+# Test sigle!
+# Test help
+# Test version
+
done_testing;
__END__
+