Update scripts and sgbr test suite
Change-Id: I3d584fd559d84eca80da6c5fbe257edab79b1017
diff --git a/script/korapxml2krill b/script/korapxml2krill
new file mode 100644
index 0000000..96c3b23
--- /dev/null
+++ b/script/korapxml2krill
@@ -0,0 +1,234 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use lib 'lib', '../lib';
+use Getopt::Long;
+use Benchmark qw/:hireswallclock/;
+use IO::Compress::Gzip qw/$GzipError/;
+use Log::Log4perl;
+
+use KorAP::XML::Krill;
+use KorAP::XML::Tokenizer;
+
+our $VERSION = 0.04;
+
+# Merges foundry data to create indexer friendly documents
+# ndiewald, 2014/10/29
+
+# 2016/02/04
+# - renamed to korapxml2krill
+# - added Schreibgebrauch support
+
+sub printhelp { # Print the usage text to STDOUT, then exit with the status passed (default: 0)
+  print <<'EOHELP';
+
+Merge foundry data based on a tokenization and create indexer friendly documents.
+
+Call:
+korapxml2krill -z --input <directory> --output <filename>
+
+--input|-i <directory> Directory of the document to index
+--output|-o <filename> Document name for output (optional),
+ Writes to <STDOUT> by default
+--overwrite|-w Overwrite files that already exist
+--token|-t <foundry>[#<layer>] Define the default tokenization by specifying
+ the name of the foundry and optionally the name
+ of the layer. Defaults to OpenNLP#Tokens.
+--skip|-s <foundry>[#<layer>] Skip specific foundries by specifying the name
+ or specific layers by appending the layer name
+ with a # to the name of the foundry,
+ e.g. Mate#Morpho. Alternatively you can skip #ALL.
+ Can be set multiple times.
+--allow|-a <foundry>#<layer> Allow specific foundries and layers by defining them
+ as <foundry>#<layer>. Only respected with --skip #ALL.
+--primary|-p Output primary data or not. Defaults to true.
+ Can be flagged using --no-primary as well.
+--human|-m Represent the data human friendly,
+ while the output defaults to JSON
+--pretty|-y Pretty print json output
+--gzip|-z Compress the output
+ (expects a defined output file)
+--log|-l The Log4perl log level, defaults to ERROR.
+--help|-h Print this document (optional)
+
+diewald@ids-mannheim.de, 2016/02/04
+
+EOHELP
+  exit($_[0] // 0); # Callers pass 1 to signal wrong usage
+};
+
+# Options from the command line
+my ($input, $output, $text, $gzip, $log_level, @skip, $token_base,
+    $primary, @allow, $pretty, $overwrite);
+GetOptions(
+  'input|i=s' => \$input,
+  'output|o=s' => \$output,
+  'overwrite|w' => \$overwrite,
+  'human|m' => \$text,
+  'token|t=s' => \$token_base,
+  'gzip|z' => \$gzip,
+  'skip|s=s' => \@skip,
+  'log|l=s' => \$log_level,
+  'allow|a=s' => \@allow,
+  'primary|p!' => \$primary,
+  'pretty|y' => \$pretty,
+  'help|h' => sub { printhelp }
+) or printhelp(1); # Unknown or malformed options are a usage error
+
+printhelp(1) if !$input || ($gzip && !$output);
+
+# Defaults as documented in the usage text
+$log_level //= 'ERROR';
+$primary //= 1;
+
+my %skip = map { lc($_) => 1 } @skip; # Lookup table of lowercased --skip values
+
+Log::Log4perl->init({
+  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
+  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
+  'log4perl.appender.STDERR.layout' => 'PatternLayout',
+  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
+});
+my $log = Log::Log4perl->get_logger('main');
+
+# Ignore processing - existing output is kept unless --overwrite is set
+if (!$overwrite && $output && -e $output) {
+  $log->trace($output . ' already exists');
+  exit(0);
+};
+
+# Initialise the overall and the intermediate timer at compile time
+BEGIN {
+  ($main::TIME, $main::LAST_STOP) = (Benchmark->new, Benchmark->new);
+};
+
+# Log (at trace level) the time elapsed since the previous stop and
+# since the start of the script, then reset the intermediate timer
+sub stop_time {
+  my $now = Benchmark->new;
+  my $lap = timestr(timediff($now, $main::LAST_STOP));
+  my $overall = timestr(timediff($now, $main::TIME));
+  $log->trace('The code took: ' . $lap . ' (overall: ' . $overall . ')');
+  $main::LAST_STOP = $now;
+};
+
+# Call perl script/korapxml2krill WPD/AAA/00001
+# Create and parse new document (the path gets a trailing slash)
+$input =~ s{([^/])$}{$1/};
+my $doc = KorAP::XML::Krill->new( path => $input );
+
+# Failures name the input, as the output file is optional and may be undefined
+unless ($doc->parse) {
+  $log->warn($input . " can't be processed - no document data");
+  exit(0);
+};
+
+# Base tokenization - missing parts of --token fall back to OpenNLP#Tokens
+my ($token_base_foundry, $token_base_layer) = split /#/, ($token_base // '');
+$token_base_foundry ||= 'OpenNLP';
+$token_base_layer ||= 'Tokens';
+
+# Get tokenization
+my $tokens = KorAP::XML::Tokenizer->new(
+  path => $doc->path,
+  doc => $doc,
+  foundry => $token_base_foundry,
+  layer => $token_base_layer,
+  name => 'tokens'
+);
+
+# Unable to process base tokenization
+unless ($tokens->parse) {
+  $log->error($input . " can't be processed - no base tokenization");
+  exit(0);
+};
+
+# Annotation layers offered to the tokenizer, as [foundry, layer] pairs
+my @layers = (
+  [qw/Base Sentences/],
+  [qw/Base Paragraphs/],
+
+  # Connexor
+  [qw/Connexor Morpho/],
+  [qw/Connexor Syntax/],
+  [qw/Connexor Phrase/],
+  [qw/Connexor Sentences/],
+
+  # CoreNLP
+  [qw/CoreNLP NamedEntities/],
+  [qw/CoreNLP Sentences/],
+  [qw/CoreNLP Morpho/],
+  [qw/CoreNLP Constituency/],
+
+  # DeReKo
+  [qw/DeReKo Structure/],
+  # Glemm
+  [qw/Glemm Morpho/],
+
+  # Malt is disabled: [qw/Malt Dependency/],
+
+  # Mate
+  [qw/Mate Morpho/],
+  [qw/Mate Dependency/],
+
+  # OpenNLP
+  [qw/OpenNLP Morpho/],
+  [qw/OpenNLP Sentences/],
+
+  # Schreibgebrauch
+  [qw/Sgbr Lemma/],
+  [qw/Sgbr Morpho/],
+
+  # TreeTagger
+  [qw/TreeTagger Morpho/],
+  [qw/TreeTagger Sentences/],
+
+  # XIP
+  [qw/XIP Morpho/],
+  [qw/XIP Constituency/],
+  [qw/XIP Sentences/],
+  [qw/XIP Dependency/]
+);
+
+
+if ($skip{'#all'}) {
+  # Everything is skipped - add the explicitly allowed layers only
+  foreach my $allowed (@allow) {
+    $tokens->add(split('#', $allowed));
+    stop_time;
+  };
+}
+else {
+  # Add to index file - skip whole foundries as well as single layers
+  foreach my $info (@layers) {
+    next if $skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])};
+    $tokens->add(@$info);
+    stop_time;
+  };
+};
+
+# Serialize via to_string (--human), to_pretty_json (--pretty) or to_json
+my $print_text = $text ? $tokens->to_string($primary) :
+  ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));
+
+if ($output) {
+  require IO::File; # Not loaded explicitly at the top of the script
+  my $file = $gzip ?
+    IO::Compress::Gzip->new($output, Minimal => 1) :
+    IO::File->new($output, "w");
+
+  # Report a failed open instead of calling print on an undefined handle
+  unless ($file) {
+    $log->fatal("Unable to open $output: " . ($gzip ? $GzipError : $!));
+    exit(1);
+  };
+  $file->print($print_text);
+  $file->close or $log->error("Unable to finish writing $output");
+}
+else {
+  print $print_text . "\n";
+};
+
+stop_time;
+
+__END__