#!/usr/bin/env perl
use strict;
use warnings;
use lib 'lib', '../lib';
use Getopt::Long;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use IO::File;
use Log::Log4perl;
use KorAP::XML::Krill;
use KorAP::XML::Tokenizer;
# Merges foundry data to create indexer-friendly documents
# ndiewald, 2014/10/29
# 2016/02/04
# - renamed to korapxml2krill
# - added Schreibgebrauch support
#
# 2016/02/12
# - fixed foundry skipping
#
# 2016/02/14
# - Added version information
sub printversion {
  print "Version " . $KorAP::XML::Krill::VERSION . "\n\n";
  exit(1);
};
sub printhelp {
  print <<'EOHELP';

Merge foundry data based on a tokenization and create indexer-friendly documents.

Call:
  korapxml2krill -z --input <directory> --output <filename>

  --input|-i <directory>         Directory of the document to index
  --output|-o <filename>         Document name for output (optional);
                                 writes to <STDOUT> by default
  --overwrite|-w                 Overwrite files that already exist
  --token|-t <foundry>[#<file>]  Define the default tokenization by specifying
                                 the name of the foundry and optionally the name
                                 of the layer file. Defaults to OpenNLP#tokens.
  --skip|-s <foundry>[#<layer>]  Skip specific foundries by specifying their names,
                                 or specific layers by prefixing the layer name
                                 with the foundry name and a '#',
                                 e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                 Can be set multiple times.
  --allow|-a <foundry>#<layer>   Allow specific foundries and layers by combining
                                 the foundry name with a '#' and the layer name.
  --primary|-p                   Output primary data or not. Defaults to true.
                                 Can be negated using --no-primary.
  --human|-m                     Represent the data in a human-readable way;
                                 the output defaults to JSON
  --pretty|-y                    Pretty print the JSON output
  --gzip|-z                      Compress the output
                                 (expects a defined output file)
  --log|-l                       The Log4perl log level, defaults to ERROR
  --help|-h                      Print this document (optional)
  --version|-v                   Print version information

diewald@ids-mannheim.de, 2016/02/15

EOHELP
  exit(defined $_[0] ? $_[0] : 0);
};
# Options from the command line
my ($input, $output, $text, $gzip, $log_level, @skip, $token_base,
    $primary, @allow, $pretty, $overwrite);

GetOptions(
  'input|i=s'   => \$input,
  'output|o=s'  => \$output,
  'overwrite|w' => \$overwrite,
  'human|m'     => \$text,
  'token|t=s'   => \$token_base,
  'gzip|z'      => \$gzip,
  'skip|s=s'    => \@skip,
  'log|l=s'     => \$log_level,
  'allow|a=s'   => \@allow,
  'primary|p!'  => \$primary,
  'pretty|y'    => \$pretty,
  'help|h'      => sub { printhelp },
  'version|v'   => sub { printversion }
);
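
# The input directory is required; gzipped output requires an explicit output file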
printhelp(1) if !$input || ($gzip && !$output);
$log_level //= 'ERROR';
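
# Build a case-insensitive lookup of foundries/layers to skip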
my %skip;
$skip{lc($_)} = 1 foreach @skip;
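
# Initialize logging to STDERR with colored log levels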
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});
my $log = Log::Log4perl->get_logger('main');
# Skip processing if the output file already exists and overwriting is not requested
if (!$overwrite && $output && -e $output) {
  $log->trace($output . ' already exists');
  exit(0);
};
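
# Remember the start time for benchmarking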
BEGIN {
  $main::TIME = Benchmark->new;
  $main::LAST_STOP = Benchmark->new;
};
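
# Log the time elapsed since the last checkpoint and since the start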
sub stop_time {
  my $new = Benchmark->new;
  $log->trace(
    'The code took: ' .
      timestr(timediff($new, $main::LAST_STOP)) .
      ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
  );
  $main::LAST_STOP = $new;
};
# Create and parse new document
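# Ensure the input path ends with a slash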
$input =~ s{([^/])$}{$1/};
my $doc = KorAP::XML::Krill->new( path => $input );
unless ($doc->parse) {
  $log->warn($input . " can't be processed - no document data");
  exit(0);
};
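
# Determine the base tokenization (foundry and layer), optionally given via --token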
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split /#/, $token_base;
};
# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  name => 'tokens'
);
# Unable to process base tokenization
unless ($tokens->parse) {
  $log->error($input . " can't be processed - no base tokenization");
  exit(0);
};
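
# Foundry/layer combinations that may be merged into the token stream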
my @layers;
push(@layers, ['Base', 'Sentences']);
push(@layers, ['Base', 'Paragraphs']);
# Connexor
push(@layers, ['Connexor', 'Morpho']);
push(@layers, ['Connexor', 'Syntax']);
push(@layers, ['Connexor', 'Phrase']);
push(@layers, ['Connexor', 'Sentences']);
# CoreNLP
push(@layers, ['CoreNLP', 'NamedEntities']);
push(@layers, ['CoreNLP', 'Sentences']);
push(@layers, ['CoreNLP', 'Morpho']);
push(@layers, ['CoreNLP', 'Constituency']);
# DeReKo
push(@layers, ['DeReKo', 'Structure']);
# Glemm
push(@layers, ['Glemm', 'Morpho']);
# Malt
# push(@layers, ['Malt', 'Dependency']);
# Mate
push(@layers, ['Mate', 'Morpho']);
push(@layers, ['Mate', 'Dependency']);
# OpenNLP
push(@layers, ['OpenNLP', 'Morpho']);
push(@layers, ['OpenNLP', 'Sentences']);
# Schreibgebrauch
push(@layers, ['Sgbr', 'Lemma']);
push(@layers, ['Sgbr', 'Morpho']);
# TreeTagger
push(@layers, ['TreeTagger', 'Morpho']);
push(@layers, ['TreeTagger', 'Sentences']);
# XIP
push(@layers, ['XIP', 'Morpho']);
push(@layers, ['XIP', 'Constituency']);
push(@layers, ['XIP', 'Sentences']);
push(@layers, ['XIP', 'Dependency']);
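
# If all foundries are skipped (#ALL), add only the explicitly allowed foundry#layer pairs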
if ($skip{'#all'}) {
  foreach (@allow) {
    $tokens->add(split('#', $_));
    stop_time;
  };
}
else {
  # Add to index file - respect skipping
  foreach my $info (@layers) {

    # Skip if Foundry or Foundry#Layer should be skipped
    unless ($skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])}) {
      $tokens->add(@$info);
      stop_time;
    };
  };
};
my $file;
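
# Serialize the document as human-readable text or as (pretty-printed) JSON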
my $print_text = $text ? $tokens->to_string($primary) :
  ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));
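
# Write to the given output file (gzipped if requested) or to STDOUT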
if ($output) {

  if ($gzip) {
    $file = IO::Compress::Gzip->new($output, Minimal => 1);
  }
  else {
    $file = IO::File->new($output, "w");
  };

  $file->print($print_text);
  $file->close;
}
else {
  print $print_text . "\n";
};
stop_time;
__END__