#!/usr/bin/env perl
use strict;
use warnings;
use lib 'lib', '../lib';
use Getopt::Long;
use Benchmark qw/:hireswallclock/;
use IO::File;
use IO::Compress::Gzip qw/$GzipError/;
use Log::Log4perl;

use KorAP::XML::Krill;
use KorAP::XML::Tokenizer;

our $VERSION = 0.04;

# Merges foundry data to create indexer-friendly documents
# ndiewald, 2014/10/29

# 2016/02/04
# - renamed to korapxml2krill
# - added Schreibgebrauch support
sub printhelp {
  print <<'EOHELP';

Merge foundry data based on a tokenization and create indexer-friendly documents.

Call:
korapxml2krill -z --input <directory> --output <filename>

--input|-i <directory>          Directory of the document to index
--output|-o <filename>          Document name for output (optional);
                                writes to <STDOUT> by default
--overwrite|-w                  Overwrite files that already exist
--token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
                                the name of the foundry and optionally the name
                                of the layer. Defaults to OpenNLP#tokens.
--skip|-s <foundry>[#<layer>]   Skip a specific foundry by specifying its name,
                                or a specific layer by appending the layer name
                                to the foundry name with a '#',
                                e.g. Mate#Morpho. Alternatively, all foundries
                                can be skipped with '#ALL'.
                                Can be set multiple times.
--allow|-a <foundry>#<layer>    Allow a specific foundry and layer by joining
                                their names with a '#', e.g. Mate#Dependency.
                                Only effective in combination with '--skip #ALL'.
                                Can be set multiple times.
--primary|-p                    Output primary data. Defaults to true;
                                can be negated with --no-primary.
--human|-m                      Print a human-readable representation of the
                                data instead of the default JSON output
--pretty|-y                     Pretty print the JSON output
--gzip|-z                       Compress the output
                                (requires a defined output file)
--log|-l                        The Log4perl log level, defaults to ERROR.
--help|-h                       Print this help text

diewald@ids-mannheim.de, 2016/02/04

EOHELP
  exit(defined $_[0] ? $_[0] : 0);
};
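# Example invocation (paths and file names are illustrative):
#   korapxml2krill -z -i WPD/AAA/00001 -o 00001.json.gz -s Mate#Morpho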

# Options from the command line
my ($input, $output, $text, $gzip, $log_level, @skip, $token_base,
    $primary, @allow, $pretty, $overwrite);
GetOptions(
  'input|i=s'   => \$input,
  'output|o=s'  => \$output,
  'overwrite|w' => \$overwrite,
  'human|m'     => \$text,
  'token|t=s'   => \$token_base,
  'gzip|z'      => \$gzip,
  'skip|s=s'    => \@skip,
  'log|l=s'     => \$log_level,
  'allow|a=s'   => \@allow,
  'primary|p!'  => \$primary,
  'pretty|y'    => \$pretty,
  'help|h'      => sub { printhelp }
);

printhelp(1) if !$input || ($gzip && !$output);

$log_level //= 'ERROR';

my %skip;
$skip{lc($_)} = 1 foreach @skip;

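# Log to STDERR with colored levels; in the ConversionPattern, %r is the
# number of milliseconds since program start, %F and %L are file and line,
# %c is the logger category, and %m%n is the message followed by a newline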
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');

# Skip processing if the output file already exists
# and overwriting was not requested
if (!$overwrite && $output && -e $output) {
  $log->trace($output . ' already exists');
  exit(0);
};

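# Remember the overall start time and the last checkpoint
# for the benchmarking done in stop_time() below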
BEGIN {
  $main::TIME = Benchmark->new;
  $main::LAST_STOP = Benchmark->new;
};

sub stop_time {
  my $new = Benchmark->new;
  $log->trace(
    'The code took: ' .
      timestr(timediff($new, $main::LAST_STOP)) .
      ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
  );
  $main::LAST_STOP = $new;
};

# Call perl script/korapxml2krill WPD/AAA/00001

# Create and parse new document
$input =~ s{([^/])$}{$1/};
my $doc = KorAP::XML::Krill->new( path => $input );

unless ($doc->parse) {
  $log->warn($input . " can't be processed - no document data");
  exit(0);
};

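# Determine the base tokenization: the --token value is split at '#'
# into a foundry and a layer name, e.g. 'OpenNLP#Tokens' (the default)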
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split /#/, $token_base;
};

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  name => 'tokens'
);

# Unable to process base tokenization
unless ($tokens->parse) {
  $log->error($input . " can't be processed - no base tokenization");
  exit(0);
};

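# Annotation layers (foundry/layer pairs) that will be merged
# into the token stream, unless skipped below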
my @layers;
push(@layers, ['Base', 'Sentences']);
push(@layers, ['Base', 'Paragraphs']);

# Connexor
push(@layers, ['Connexor', 'Morpho']);
push(@layers, ['Connexor', 'Syntax']);
push(@layers, ['Connexor', 'Phrase']);
push(@layers, ['Connexor', 'Sentences']);

# CoreNLP
push(@layers, ['CoreNLP', 'NamedEntities']);
push(@layers, ['CoreNLP', 'Sentences']);
push(@layers, ['CoreNLP', 'Morpho']);
push(@layers, ['CoreNLP', 'Constituency']);

# DeReKo
push(@layers, ['DeReKo', 'Structure']);

# Glemm
push(@layers, ['Glemm', 'Morpho']);

# Malt
# push(@layers, ['Malt', 'Dependency']);

# Mate
push(@layers, ['Mate', 'Morpho']);
push(@layers, ['Mate', 'Dependency']);

# OpenNLP
push(@layers, ['OpenNLP', 'Morpho']);
push(@layers, ['OpenNLP', 'Sentences']);

# Schreibgebrauch
push(@layers, ['Sgbr', 'Lemma']);
push(@layers, ['Sgbr', 'Morpho']);

# TreeTagger
push(@layers, ['TreeTagger', 'Morpho']);
push(@layers, ['TreeTagger', 'Sentences']);

# XIP
push(@layers, ['XIP', 'Morpho']);
push(@layers, ['XIP', 'Constituency']);
push(@layers, ['XIP', 'Sentences']);
push(@layers, ['XIP', 'Dependency']);

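# With '--skip #ALL' only explicitly allowed foundry#layer pairs are added,
# e.g. '-s #ALL -a Mate#Dependency -a OpenNLP#Morpho' adds just those two
# layers; otherwise all layers above are added unless explicitly skipped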
if ($skip{'#all'}) {
  foreach (@allow) {
    $tokens->add(split('#', $_));
    stop_time;
  };
}
else {
  # Add all known layers to the token stream,
  # respecting skipped foundries and layers
  foreach my $info (@layers) {
    unless ($skip{lc($info->[0]) . '#' . lc($info->[1])}) {
      $tokens->add(@$info);
      stop_time;
    };
  };
};

my $file;

my $print_text = $text ? $tokens->to_string($primary) :
  ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

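# Write the serialization to the output file, gzip-compressed if requested
# (Minimal => 1 asks IO::Compress::Gzip for the smallest possible gzip
# header), falling back to STDOUT if no output file was given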
if ($output) {
  if ($gzip) {
    $file = IO::Compress::Gzip->new($output, Minimal => 1);
  }
  else {
    $file = IO::File->new($output, "w");
  };

  $file->print($print_text);
  $file->close;
}
else {
  print $print_text . "\n";
};

stop_time;

__END__