| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 1 | #!/usr/bin/env perl | 
use strict;
use warnings;
use lib 'lib', '../lib';
use Getopt::Long;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use IO::File;
use Log::Log4perl;

use KorAP::XML::Krill;
use KorAP::XML::Tokenizer;
 | 12 |  | 
 | 13 | our $VERSION = 0.04; | 
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 14 |  | 
 | 15 | # Merges foundry data to create indexer friendly documents | 
| Nils Diewald | 79a355c | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 16 | # ndiewald, 2014/10/29 | 
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 17 |  | 
| Akron | 9a04c71 | 2016-02-05 19:40:05 +0100 | [diff] [blame] | 18 | # 2016/02/04 | 
 | 19 | # - renamed to korapxml2krill | 
 | 20 | # - added Schreibgebrauch support | 
| Akron | 9078bb9 | 2016-02-12 19:09:06 +0100 | [diff] [blame^] | 21 | # | 
 | 22 | # 2016/02/12 | 
 | 23 | # - fixed foundry skipping | 
 | 24 |  | 
| Akron | 9a04c71 | 2016-02-05 19:40:05 +0100 | [diff] [blame] | 25 |  | 
# Print the usage message and terminate the script.
#
# Parameters:
#   $_[0] - exit code (optional, defaults to 0)
#
# A plain help request (exit code 0) is written to STDOUT. With a
# non-zero exit code the message is written to STDERR instead, because
# the script emits its result on STDOUT by default and a usage error
# must not end up in redirected or piped output.
# This function never returns.
sub printhelp {
  my $exit_code = defined $_[0] ? $_[0] : 0;
  my $out = $exit_code ? \*STDERR : \*STDOUT;

  print {$out} <<'EOHELP';

Merge foundry data based on a tokenization and create indexer friendly documents.

Call:
korapxml2krill -z --input <directory> --output <filename>

  --input|-i <directory>          Directory of the document to index
  --output|-o <filename>          Document name for output (optional),
                                  Writes to <STDOUT> by default
  --overwrite|-w                  Overwrite files that already exist
  --token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
                                  the name of the foundry and optionally the name
                                  of the layer. Defaults to OpenNLP#Tokens.
  --skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
                                  or specific layers by appending the layer name
                                  to the foundry name, separated by a #,
                                  e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                  Can be set multiple times.
  --allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
                                  combining the foundry name with a # and the layer name.
  --primary|-p                    Output primary data or not. Defaults to true.
                                  Can be flagged using --no-primary as well.
  --human|-m                      Represent the data human friendly,
                                  while the output defaults to JSON
  --pretty|-y                     Pretty print json output
  --gzip|-z                       Compress the output
                                  (expects a defined output file)
  --log|-l                        The Log4perl log level, defaults to ERROR.
  --help|-h                       Print this document (optional)

diewald@ids-mannheim.de, 2016/02/12

EOHELP
  exit($exit_code);
};
 | 63 |  | 
 | 64 | # Options from the command line | 
| Nils Diewald | 5b4865f | 2014-11-05 18:20:50 +0000 | [diff] [blame] | 65 | my ($input, $output, $text, $gzip, $log_level, @skip, $token_base, | 
 | 66 |     $primary, @allow, $pretty, $overwrite); | 
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 67 | GetOptions( | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 68 |   'input|i=s'   => \$input, | 
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 69 |   'output|o=s'  => \$output, | 
| Nils Diewald | 5b4865f | 2014-11-05 18:20:50 +0000 | [diff] [blame] | 70 |   'overwrite|w' => \$overwrite, | 
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 71 |   'human|m'     => \$text, | 
 | 72 |   'token|t=s'   => \$token_base, | 
 | 73 |   'gzip|z'      => \$gzip, | 
 | 74 |   'skip|s=s'    => \@skip, | 
 | 75 |   'log|l=s'     => \$log_level, | 
 | 76 |   'allow|a=s'   => \@allow, | 
 | 77 |   'primary|p!'  => \$primary, | 
 | 78 |   'pretty|y'    => \$pretty, | 
 | 79 |   'help|h'      => sub { printhelp } | 
 | 80 | ); | 
 | 81 |  | 
 | 82 | printhelp(1) if !$input || ($gzip && !$output); | 
 | 83 |  | 
 | 84 | $log_level //= 'ERROR'; | 
 | 85 |  | 
 | 86 | my %skip; | 
 | 87 | $skip{lc($_)} = 1 foreach @skip; | 
 | 88 |  | 
 | 89 | Log::Log4perl->init({ | 
 | 90 |   'log4perl.rootLogger' => uc($log_level) . ', STDERR', | 
 | 91 |   'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels', | 
 | 92 |   'log4perl.appender.STDERR.layout' => 'PatternLayout', | 
 | 93 |   'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n' | 
 | 94 | }); | 
 | 95 |  | 
 | 96 | my $log = Log::Log4perl->get_logger('main'); | 
 | 97 |  | 
| Nils Diewald | 5b4865f | 2014-11-05 18:20:50 +0000 | [diff] [blame] | 98 | # Ignore processing | 
 | 99 | if (!$overwrite && $output && -e $output) { | 
 | 100 |   $log->trace($output . ' already exists'); | 
 | 101 |   exit(0); | 
 | 102 | }; | 
 | 103 |  | 
# Take the reference timestamps at compile time:
# $main::TIME marks the start of the script,
# $main::LAST_STOP marks the most recent call of stop_time.
BEGIN {
  $main::TIME      = Benchmark->new;
  $main::LAST_STOP = Benchmark->new;
};

# Log (on trace level) the time elapsed since the previous stop
# as well as since the start of the script, then remember the
# current point in time as the new "last stop".
sub stop_time {
  my $now = Benchmark->new;

  my $since_last_stop = timestr(timediff($now, $main::LAST_STOP));
  my $since_start     = timestr(timediff($now, $main::TIME));

  $log->trace(
    sprintf('The code took: %s (overall: %s)', $since_last_stop, $since_start)
  );

  $main::LAST_STOP = $now;
};
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 118 |  | 
# Create and parse new document

# Append a trailing slash to the input path if it is missing
$input =~ s{([^/])$}{$1/};
my $doc = KorAP::XML::Krill->new( path => $input );

# Name of the document in log messages. The output file is optional,
# so fall back to the input path instead of concatenating undef.
my $doc_name = defined $output ? $output : $input;

unless ($doc->parse) {
  $log->warn($doc_name . " can't be processed - no document data");
  exit(0);
};

# The base tokenization defaults to OpenNLP#Tokens. A value passed
# via --token may name only the foundry, in which case the default
# layer has to be kept rather than being overwritten with undef.
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
if ($token_base) {
  my ($foundry, $layer) = split /#/, $token_base;
  $token_base_foundry = $foundry if defined $foundry && length $foundry;
  $token_base_layer   = $layer   if defined $layer   && length $layer;
};

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  name => 'tokens'
);

# Unable to process base tokenization
unless ($tokens->parse) {
  $log->error($doc_name . " can't be processed - no base tokenization");
  exit(0);
};
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 147 |  | 
# All known annotation layers as [foundry, layer] pairs,
# in the order they are added to the token stream
my @layers = (
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt (currently disabled)
  # ['Malt', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],
);

if ($skip{'#all'}) {
  # Everything is skipped - only add what is explicitly allowed
  for my $allowed (@allow) {
    $tokens->add(split(/#/, $allowed));
    stop_time();
  };
}
else {
  # Add all known layers, leaving out what was requested to be skipped
  for my $layer (@layers) {
    my ($foundry_key, $layer_key) = map { lc } @$layer;

    # Skip if Foundry or Foundry#Layer should be skipped
    next if $skip{$foundry_key} || $skip{$foundry_key . '#' . $layer_key};

    $tokens->add(@$layer);
    stop_time();
  };
};
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 212 |  | 
my $file;

# Serialize the token stream: human friendly text if requested,
# otherwise JSON (optionally pretty printed)
my $print_text = $text ? $tokens->to_string($primary) :
  ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

if ($output) {

  # Open the output file, optionally gzip compressed.
  # Both constructors return undef on failure, so remember the reason.
  my $open_error;
  if ($gzip) {
    $file = IO::Compress::Gzip->new($output, Minimal => 1)
      or $open_error = "$GzipError";
  }
  else {
    $file = IO::File->new($output, "w")
      or $open_error = "$!";
  };

  # Report an unwritable output file instead of dying
  # on a method call on an undefined value
  unless (defined $file) {
    $log->error("Unable to open $output for writing: $open_error");
    exit(1);
  };

  $file->print($print_text);

  # Buffered write errors only surface when the handle is closed
  unless ($file->close) {
    $log->error(
      "Unable to finish writing $output: " . ($gzip ? $GzipError : $!)
    );
    exit(1);
  };
}

else {
  print $print_text . "\n";
};

stop_time();
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 236 |  | 
 | 237 | __END__ |