Update scripts and sgbr test suite Change-Id: I3d584fd559d84eca80da6c5fbe257edab79b1017

commit: 93d620e46dba1a790845408bd92571eb22c854e3 [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Feb 05 19:40:05 2016 +0100
committer: Akron <nils@diewald-online.de> Fri Feb 05 19:40:05 2016 +0100
tree: 2f4846549357ce96ed94e14083ac5248177cda38
parent: 6764d55ed817f9f70ce83be86c56f16525cbb3a1 [diff] [blame]
diff --git a/script/korapxml2krill b/script/korapxml2krill
new file mode 100644
index 0000000..96c3b23
--- /dev/null
+++ b/script/korapxml2krill

@@ -0,0 +1,234 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use lib 'lib', '../lib';
+use Getopt::Long;
+use Benchmark qw/:hireswallclock/;
+use IO::Compress::Gzip qw/$GzipError/;
+use Log::Log4perl;
+
+use KorAP::XML::Krill;
+use KorAP::XML::Tokenizer;
+
+our $VERSION = 0.04;
+
+# Merges foundry data to create indexer friendly documents
+# ndiewald, 2014/10/29
+
+# 2016/02/04
+# - renamed to korapxml2krill
+# - added Schreibgebrauch support
+
+# Print the usage message to STDOUT and terminate the script.
+# The optional first argument is used as the exit status (defaults to 0).
+# The option descriptions are kept in line with what the script does:
+# --skip is only matched against foundry#layer keys (or #ALL), and the
+# default tokenization is OpenNLP#Tokens.
+sub printhelp {
+  print <<'EOHELP';
+
+Merge foundry data based on a tokenization and create indexer friendly documents.
+
+Call:
+korapxml2krill -z --input <directory> --output <filename>
+
+--input|-i <directory>          Directory of the document to index
+--output|-o <filename>          Document name for output (optional),
+                                Writes to <STDOUT> by default
+--overwrite|-w                  Overwrite files that already exist
+--token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
+                                the name of the foundry and optionally the name
+                                of the layer. Defaults to OpenNLP#Tokens.
+--skip|-s <foundry>#<layer>     Skip specific layers by combining the foundry name
+                                with a # and the layer name, e.g. Mate#Morpho.
+                                Alternatively you can skip #ALL.
+                                Can be set multiple times.
+--allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
+                                combining the foundry name with a # and the layer name.
+--primary|-p                    Output primary data or not. Defaults to true.
+                                Can be flagged using --no-primary as well.
+--human|-m                      Represent the data human friendly,
+                                while the output defaults to JSON
+--pretty|-y                     Pretty print json output
+--gzip|-z                       Compress the output
+                                (expects a defined output file)
+--log|-l <level>                The Log4perl log level, defaults to ERROR.
+--help|-h                       Print this document (optional)
+
+diewald@ids-mannheim.de, 2016/02/04
+
+EOHELP
+  exit(defined $_[0] ? $_[0] : 0);
+};
+
+# Options from the command line
+my ($input, $output, $text, $gzip, $log_level, @skip, $token_base,
+    $primary, @allow, $pretty, $overwrite);
+
+# GetOptions returns false when the command line cannot be parsed
+# (unknown option, missing value); it has already reported the problem
+# on STDERR, so show the usage and stop with a non-zero status
+# instead of silently continuing.
+GetOptions(
+  'input|i=s'   => \$input,
+  'output|o=s'  => \$output,
+  'overwrite|w' => \$overwrite,
+  'human|m'     => \$text,
+  'token|t=s'   => \$token_base,
+  'gzip|z'      => \$gzip,
+  'skip|s=s'    => \@skip,
+  'log|l=s'     => \$log_level,
+  'allow|a=s'   => \@allow,
+  'primary|p!'  => \$primary,
+  'pretty|y'    => \$pretty,
+  'help|h'      => sub { printhelp() }
+) or printhelp(1);
+
+# An input directory is mandatory, and compressed output needs a target file
+printhelp(1) if !$input || ($gzip && !$output);
+
+# Fall back to the default log level
+$log_level //= 'ERROR';
+
+# Lookup table of the lowercased --skip values
+my %skip;
+$skip{lc($_)} = 1 foreach @skip;
+
+# Configure Log4perl: a single colored screen appender named STDERR,
+# attached to the root logger at the requested (uppercased) level.
+Log::Log4perl->init({
+  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n',
+  'log4perl.appender.STDERR.layout' => 'PatternLayout',
+  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
+  'log4perl.rootLogger' => "\U$log_level\E, STDERR"
+});
+
+my $log = Log::Log4perl->get_logger('main');
+
+# Nothing to do if the target file is already there
+# and overwriting was not requested
+unless ($overwrite) {
+  if ($output && -e $output) {
+    $log->trace("$output already exists");
+    exit(0);
+  };
+};
+
+# Take the reference timestamps at compile time:
+# $main::TIME marks the start of the run,
+# $main::LAST_STOP marks the most recent call to stop_time.
+BEGIN {
+  ($main::TIME, $main::LAST_STOP) = (Benchmark->new, Benchmark->new);
+};
+
+# Log (at trace level) the time spent since the previous stop
+# and since the start of the run, then move the last stop marker forward.
+sub stop_time {
+  my $now = Benchmark->new;
+  my $since_last = timestr(timediff($now, $main::LAST_STOP));
+  my $overall = timestr(timediff($now, $main::TIME));
+  $log->trace("The code took: $since_last (overall: $overall)");
+  $main::LAST_STOP = $now;
+};
+
+# Call perl script/korapxml2krill WPD/AAA/00001
+
+# Create and parse new document
+# (the input path gets a trailing slash appended if it has none)
+$input =~ s{([^/])$}{$1/};
+my $doc = KorAP::XML::Krill->new( path => $input );
+
+# --output is optional, so log messages fall back to the input path
+# instead of concatenating an undefined value.
+# NOTE(review): the script still exits with status 0 here, as before.
+unless ($doc->parse) {
+  $log->warn(($output // $input) . " can't be processed - no document data");
+  exit(0);
+};
+
+# Default tokenization; --token may override the foundry and, optionally,
+# the layer. Parts missing from the option keep their default
+# (previously "--token Foundry" reset the layer to undef).
+my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
+if ($token_base) {
+  my ($foundry, $layer) = split /#/, $token_base;
+  $token_base_foundry = $foundry if defined $foundry && length $foundry;
+  $token_base_layer   = $layer   if defined $layer   && length $layer;
+};
+
+# Get tokenization
+my $tokens = KorAP::XML::Tokenizer->new(
+  path => $doc->path,
+  doc => $doc,
+  foundry => $token_base_foundry,
+  layer => $token_base_layer,
+  name => 'tokens'
+);
+
+# Unable to process base tokenization
+unless ($tokens->parse) {
+  $log->error(($output // $input) . " can't be processed - no base tokenization");
+  exit(0);
+};
+
+# Annotation layers the script knows about, as [foundry, layer] pairs.
+# The list order is the order in which the layers are processed later on.
+my @layers = (
+  # Base
+  ['Base', 'Sentences'],
+  ['Base', 'Paragraphs'],
+
+  # Connexor
+  ['Connexor', 'Morpho'],
+  ['Connexor', 'Syntax'],
+  ['Connexor', 'Phrase'],
+  ['Connexor', 'Sentences'],
+
+  # CoreNLP
+  ['CoreNLP', 'NamedEntities'],
+  ['CoreNLP', 'Sentences'],
+  ['CoreNLP', 'Morpho'],
+  ['CoreNLP', 'Constituency'],
+
+  # DeReKo
+  ['DeReKo', 'Structure'],
+
+  # Glemm
+  ['Glemm', 'Morpho'],
+
+  # Malt (currently disabled)
+  # ['Malt', 'Dependency'],
+
+  # Mate
+  ['Mate', 'Morpho'],
+  ['Mate', 'Dependency'],
+
+  # OpenNLP
+  ['OpenNLP', 'Morpho'],
+  ['OpenNLP', 'Sentences'],
+
+  # Schreibgebrauch
+  ['Sgbr', 'Lemma'],
+  ['Sgbr', 'Morpho'],
+
+  # TreeTagger
+  ['TreeTagger', 'Morpho'],
+  ['TreeTagger', 'Sentences'],
+
+  # XIP
+  ['XIP', 'Morpho'],
+  ['XIP', 'Constituency'],
+  ['XIP', 'Sentences'],
+  ['XIP', 'Dependency'] );
+
+
+# Feed the annotation layers into the token stream.
+if ($skip{'#all'}) {
+  # Everything is skipped: add only what was explicitly allowed,
+  # each --allow value being split at '#' into the arguments for add
+  for my $allowed (@allow) {
+    $tokens->add(split('#', $allowed));
+    stop_time();
+  };
+}
+else {
+  # Add every known layer unless its lowercased foundry#layer key was skipped
+  for my $layer (@layers) {
+    my $key = lc($layer->[0]) . '#' . lc($layer->[1]);
+    next if $skip{$key};
+    $tokens->add(@$layer);
+    stop_time();
+  };
+};
+
+my $file;
+
+# Serialize the collected data: plain text with --human,
+# otherwise JSON (pretty printed with --pretty)
+my $print_text = $text ? $tokens->to_string($primary) :
+  ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));
+
+if ($output) {
+
+  if ($gzip) {
+    $file = IO::Compress::Gzip->new($output, Minimal => 1);
+  }
+  else {
+    # IO::File is not loaded explicitly at the top of the script
+    require IO::File;
+    $file = IO::File->new($output, "w");
+  };
+
+  # Both constructors return undef on failure; report the reason instead of
+  # dying later with "Can't call method print on an undefined value"
+  unless ($file) {
+    $log->error(
+      "Unable to open $output for writing: " . ($gzip ? $GzipError : $!)
+    );
+    exit(1);
+  };
+
+  # Buffered write errors may only surface on close, so check both calls
+  unless ($file->print($print_text) && $file->close) {
+    $log->error("Unable to write $output: " . ($gzip ? $GzipError : $!));
+    exit(1);
+  };
+}
+
+else {
+  print $print_text . "\n";
+};
+
+stop_time;
+
+__END__
commit	93d620e46dba1a790845408bd92571eb22c854e3	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Feb 05 19:40:05 2016 +0100
committer	Akron <nils@diewald-online.de>	Fri Feb 05 19:40:05 2016 +0100
tree	2f4846549357ce96ed94e14083ac5248177cda38
parent	6764d55ed817f9f70ce83be86c56f16525cbb3a1 [diff] [blame]