| #!/usr/bin/env perl | 
 | use strict; | 
 | use warnings; | 
 | use lib 'lib', '../lib'; | 
 use Getopt::Long;
 use Benchmark qw/:hireswallclock/;
 use IO::Compress::Gzip qw/$GzipError/;
 use IO::File;
 use Log::Log4perl;
 |  | 
 | use KorAP::XML::Krill; | 
 | use KorAP::XML::Tokenizer; | 
 |  | 
 | # Merges foundry data to create indexer friendly documents | 
 | # ndiewald, 2014/10/29 | 
 |  | 
 | # 2016/02/04 | 
 | # - renamed to korapxml2krill | 
 | # - added Schreibgebrauch support | 
 | # | 
 | # 2016/02/12 | 
 | # - fixed foundry skipping | 
 | # | 
 | # 2016/02/14 | 
 | # - Added version information | 
 |  | 
 # Print the version of KorAP::XML::Krill on STDOUT and terminate.
 # Takes no arguments and never returns.
 # Asking for the version is a successful run, so the exit status is 0.
 sub printversion {
   print "Version " . $KorAP::XML::Krill::VERSION . "\n\n";
   exit(0);
 };
 |  | 
 # Write the usage text to STDOUT and leave the program.
 # The first argument, when defined, is used as the exit status;
 # without it the status is 0. Never returns.
 sub printhelp {
   my $status = defined $_[0] ? $_[0] : 0;

   my $usage = <<'EOHELP';

Merge foundry data based on a tokenization and create indexer friendly documents.

Call:
korapxml2krill -z --input <directory> --output <filename>

  --input|-i <directory>          Directory of the document to index
  --output|-o <filename>          Document name for output (optional),
                                  Writes to <STDOUT> by default
  --overwrite|-w                  Overwrite files that already exist
  --token|-t <foundry>[#<file>]   Define the default tokenization by specifying
                                  the name of the foundry and optionally the name
                                  of the layer-file. Defaults to OpenNLP#tokens.
  --skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
                                  or specific layers by defining the name
                                  with a # in front of the foundry,
                                  e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                  Can be set multiple times.
  --allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
                                  combining the foundry name with a # and the layer name.
  --primary|-p                    Output primary data or not. Defaults to true.
                                  Can be flagged using --no-primary as well.
  --human|-m                      Represent the data human friendly,
                                  while the output defaults to JSON
  --pretty|-y                     Pretty print json output
  --gzip|-z                       Compress the output
                                  (expects a defined output file)
  --log|-l                        The Log4perl log level, defaults to ERROR.
  --help|-h                       Print this document (optional)
  --version|-v                    Print version information

diewald@ids-mannheim.de, 2016/02/15

EOHELP

   print $usage;
   exit($status);
 };
 |  | 
 # Options from the command line
 my ($input, $output, $text, $gzip, $log_level, @skip, $token_base,
     $primary, @allow, $pretty, $overwrite);

 # The usage text promises that primary data is written by default.
 # Make that explicit instead of handing an undefined flag to the
 # serializer; --no-primary still switches it off.
 $primary = 1;

 # GetOptions returns false when it met an unknown or malformed option
 # (it has already warned on STDERR). Do not carry on with a
 # half-understood command line - show the usage and fail.
 GetOptions(
   'input|i=s'   => \$input,
   'output|o=s'  => \$output,
   'overwrite|w' => \$overwrite,
   'human|m'     => \$text,
   'token|t=s'   => \$token_base,
   'gzip|z'      => \$gzip,
   'skip|s=s'    => \@skip,
   'log|l=s'     => \$log_level,
   'allow|a=s'   => \@allow,
   'primary|p!'  => \$primary,
   'pretty|y'    => \$pretty,
   'help|h'      => sub { printhelp },
   'version|v'   => sub { printversion }
 ) or printhelp(1);

 # An input directory is mandatory, and compressed output needs a
 # file to be written to
 printhelp(1) if !$input || ($gzip && !$output);

 $log_level //= 'ERROR';
 |  | 
 # Lookup table of everything given via --skip, keyed by the
 # lower-cased option value
 my %skip = map { ( lc($_) => 1 ) } @skip;
 |  | 
 # Log4perl configuration: a single appender named STDERR, with the
 # root logger threshold taken from the upper-cased --log value
 my %log_conf = (
   'log4perl.rootLogger'
     => uc($log_level) . ', STDERR',
   'log4perl.appender.STDERR'
     => 'Log::Log4perl::Appender::ScreenColoredLevels',
   'log4perl.appender.STDERR.layout'
     => 'PatternLayout',
   'log4perl.appender.STDERR.layout.ConversionPattern'
     => '[%r] %F %L %c - %m%n'
 );
 Log::Log4perl->init(\%log_conf);

 my $log = Log::Log4perl->get_logger('main');
 |  | 
 # Nothing to do when the output file is already there and
 # overwriting was not requested
 unless ($overwrite) {
   if ($output && -e $output) {
     $log->trace("$output already exists");
     exit(0);
   };
 };
 |  | 
 # BEGIN runs while the file is still being compiled, i.e. before any
 # of the top-level statements execute. Two separate timestamps are
 # taken: the overall start and the start of the first interval.
 BEGIN {
   ($main::TIME, $main::LAST_STOP) = (Benchmark->new, Benchmark->new);
 };
 |  | 
 # Log (at TRACE level) the time elapsed since the previous call and
 # since $main::TIME, then make the current timestamp the reference
 # point for the next call. Takes no arguments.
 sub stop_time {
   my $now     = Benchmark->new;
   my $lap     = timestr(timediff($now, $main::LAST_STOP));
   my $overall = timestr(timediff($now, $main::TIME));

   $log->trace("The code took: $lap (overall: $overall)");

   $main::LAST_STOP = $now;
 };
 |  | 
 # Create and parse new document.
 # A slash is appended to the input path unless it already ends in one.
 $input =~ s{([^/])$}{$1/};
 my $doc = KorAP::XML::Krill->new( path => $input );

 unless ($doc->parse) {
   # --output is optional (STDOUT is the default target), so name the
   # document by its input path when no output file was given instead
   # of concatenating an undefined value
   $log->warn(($output // $input) . " can't be processed - no document data");
   exit(0);
 };
 |  | 
 # Base tokenization, given on the command line as <foundry>[#<layer>].
 # The layer part is optional, so only the parts that are actually
 # present replace the defaults; a bare "--token Foundry" keeps the
 # default layer instead of setting it to undef.
 my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
 if ($token_base) {
   my ($foundry, $layer) = split /#/, $token_base;
   $token_base_foundry = $foundry if defined $foundry && length $foundry;
   $token_base_layer   = $layer   if defined $layer   && length $layer;
 };

 # Get tokenization
 my $tokens = KorAP::XML::Tokenizer->new(
   path => $doc->path,
   doc => $doc,
   foundry => $token_base_foundry,
   layer => $token_base_layer,
   name => 'tokens'
 );

 # Unable to process base tokenization
 unless ($tokens->parse) {
   # --output is optional, fall back to the input path in the message
   $log->error(($output // $input) . " can't be processed - no base tokenization");
   exit(0);
 };
 |  | 
 # Every [foundry, layer] pair that is added to the tokenization
 # unless it is skipped, in the order in which it is added
 my @layers = (
   [ 'Base', 'Sentences' ],
   [ 'Base', 'Paragraphs' ],

   # Connexor
   [ 'Connexor', 'Morpho' ],
   [ 'Connexor', 'Syntax' ],
   [ 'Connexor', 'Phrase' ],
   [ 'Connexor', 'Sentences' ],

   # CoreNLP
   [ 'CoreNLP', 'NamedEntities' ],
   [ 'CoreNLP', 'Sentences' ],
   [ 'CoreNLP', 'Morpho' ],
   [ 'CoreNLP', 'Constituency' ],

   # DeReKo
   [ 'DeReKo', 'Structure' ],

   # Glemm
   [ 'Glemm', 'Morpho' ],

   # Malt (disabled)
   # [ 'Malt', 'Dependency' ],

   # Mate
   [ 'Mate', 'Morpho' ],
   [ 'Mate', 'Dependency' ],

   # OpenNLP
   [ 'OpenNLP', 'Morpho' ],
   [ 'OpenNLP', 'Sentences' ],

   # Schreibgebrauch
   [ 'Sgbr', 'Lemma' ],
   [ 'Sgbr', 'Morpho' ],

   # TreeTagger
   [ 'TreeTagger', 'Morpho' ],
   [ 'TreeTagger', 'Sentences' ],

   # XIP
   [ 'XIP', 'Morpho' ],
   [ 'XIP', 'Constituency' ],
   [ 'XIP', 'Sentences' ],
   [ 'XIP', 'Dependency' ],
 );
 |  | 
 |  | 
 if ($skip{'#all'}) {
   # Everything is skipped: add only what was named via --allow,
   # each value being split at '#' into the arguments for add()
   for (@allow) {
     my @foundry_layer = split('#', $_);
     $tokens->add(@foundry_layer);
     stop_time();
   };
 }
 else {
   # Add to index file - respect skipping
   for my $layer (@layers) {
     my $foundry_key = lc($layer->[0]);
     my $layer_key   = $foundry_key . '#' . lc($layer->[1]);

     # Skip if Foundry or Foundry#Layer should be skipped
     next if $skip{$foundry_key};
     next if $skip{$layer_key};

     $tokens->add(@$layer);
     stop_time();
   };
 };
 |  | 
 my $file;

 # Serialize once: human readable text if requested, otherwise JSON
 # (pretty printed or compact)
 my $print_text = $text ? $tokens->to_string($primary) :
   ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

 if ($output) {

   # Both constructors return undef on failure. Report the reason
   # instead of dying later on a method call on an undefined value.
   if ($gzip) {
     $file = IO::Compress::Gzip->new($output, Minimal => 1)
       or die "Unable to open $output for gzip output: $GzipError\n";
   }
   else {
     $file = IO::File->new($output, "w")
       or die "Unable to open $output for writing: $!\n";
   };

   # Buffered write errors may only surface on close, so check both
   $file->print($print_text)
     or die "Unable to write to $output: $!\n";
   $file->close
     or die "Unable to close $output: $!\n";
 }

 else {
   print $print_text . "\n";
 };

 stop_time;
 |  | 
 | __END__ |