| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 1 | #!/usr/bin/env perl | 
 | 2 | use strict; | 
 | 3 | use warnings; | 
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 4 | use lib 'lib', '../lib'; | 
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 5 | use Getopt::Long; | 
 | 6 | use Benchmark qw/:hireswallclock/; | 
 | 7 | use IO::Compress::Gzip qw/$GzipError/; | 
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 8 | use Log::Log4perl; | 
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 9 |  | 
| Akron | 93d620e | 2016-02-05 19:40:05 +0100 | [diff] [blame] | 10 | use KorAP::XML::Krill; | 
 | 11 | use KorAP::XML::Tokenizer; | 
 | 12 |  | 
 | 13 | our $VERSION = 0.04; | 
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 14 |  | 
 | 15 | # Merges foundry data to create indexer friendly documents | 
| Nils Diewald | 32e30f0 | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 16 | # ndiewald, 2014/10/29 | 
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 17 |  | 
| Akron | 93d620e | 2016-02-05 19:40:05 +0100 | [diff] [blame] | 18 | # 2016/02/04 | 
 | 19 | # - renamed to korapxml2krill | 
 | 20 | # - added Schreibgebrauch support | 
 | 21 |  | 
# Print the usage text to STDOUT and terminate the program.
# The optional first argument is used as exit status (0 if not given).
sub printhelp {
  my $status = $_[0] // 0;

  my $usage = <<'EOHELP';

Merge foundry data based on a tokenization and create indexer friendly documents.

Call:
korapxml2krill -z --input <directory> --output <filename>

--input|-i <directory>          Directory of the document to index
--output|-o <filename>          Document name for output (optional),
                                Writes to <STDOUT> by default
--overwrite|-w                  Overwrite files that already exist
--token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
                                the name of the foundry and optionally the name
                                of the layer. Defaults to OpenNLP#tokens.
--skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
                                or specific layers by defining the name
                                with a # in front of the foundry,
                                e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                Can be set multiple times.
--allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
                                combining the foundry name with a # and the layer name.
--primary|-p                    Output primary data or not. Defaults to true.
                                Can be flagged using --no-primary as well.
--human|-m                      Represent the data human friendly,
                                while the output defaults to JSON
--pretty|-y                     Pretty print json output
--gzip|-z                       Compress the output
                                (expects a defined output file)
--log|-l                        The Log4perl log level, defaults to ERROR.
--help|-h                       Print this document (optional)

diewald@ids-mannheim.de, 2016/02/04

EOHELP

  print $usage;
  exit($status);
};
 | 59 |  | 
 | 60 | # Options from the command line | 
| Nils Diewald | 59094f2 | 2014-11-05 18:20:50 +0000 | [diff] [blame] | 61 | my ($input, $output, $text, $gzip, $log_level, @skip, $token_base, | 
 | 62 |     $primary, @allow, $pretty, $overwrite); | 
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 63 | GetOptions( | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 64 |   'input|i=s'   => \$input, | 
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 65 |   'output|o=s'  => \$output, | 
| Nils Diewald | 59094f2 | 2014-11-05 18:20:50 +0000 | [diff] [blame] | 66 |   'overwrite|w' => \$overwrite, | 
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 67 |   'human|m'     => \$text, | 
 | 68 |   'token|t=s'   => \$token_base, | 
 | 69 |   'gzip|z'      => \$gzip, | 
 | 70 |   'skip|s=s'    => \@skip, | 
 | 71 |   'log|l=s'     => \$log_level, | 
 | 72 |   'allow|a=s'   => \@allow, | 
 | 73 |   'primary|p!'  => \$primary, | 
 | 74 |   'pretty|y'    => \$pretty, | 
 | 75 |   'help|h'      => sub { printhelp } | 
 | 76 | ); | 
 | 77 |  | 
 | 78 | printhelp(1) if !$input || ($gzip && !$output); | 
 | 79 |  | 
 | 80 | $log_level //= 'ERROR'; | 
 | 81 |  | 
 | 82 | my %skip; | 
 | 83 | $skip{lc($_)} = 1 foreach @skip; | 
 | 84 |  | 
 | 85 | Log::Log4perl->init({ | 
 | 86 |   'log4perl.rootLogger' => uc($log_level) . ', STDERR', | 
 | 87 |   'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels', | 
 | 88 |   'log4perl.appender.STDERR.layout' => 'PatternLayout', | 
 | 89 |   'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n' | 
 | 90 | }); | 
 | 91 |  | 
 | 92 | my $log = Log::Log4perl->get_logger('main'); | 
 | 93 |  | 
| Nils Diewald | 59094f2 | 2014-11-05 18:20:50 +0000 | [diff] [blame] | 94 | # Ignore processing | 
 | 95 | if (!$overwrite && $output && -e $output) { | 
 | 96 |   $log->trace($output . ' already exists'); | 
 | 97 |   exit(0); | 
 | 98 | }; | 
 | 99 |  | 
# Take two timestamps at compile time: $main::TIME marks the start of the
# program, $main::LAST_STOP is the reference point for the next stop_time call.
BEGIN {
  ($main::TIME, $main::LAST_STOP) = (Benchmark->new, Benchmark->new);
};
 | 104 |  | 
 | 105 | sub stop_time { | 
 | 106 |   my $new = Benchmark->new; | 
 | 107 |   $log->trace( | 
 | 108 |     'The code took: '. | 
 | 109 |       timestr(timediff($new, $main::LAST_STOP)) . | 
 | 110 | 	' (overall: ' . timestr(timediff($new, $main::TIME)) . ')' | 
 | 111 |       ); | 
 | 112 |   $main::LAST_STOP = $new; | 
 | 113 | }; | 
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 114 |  | 
| Akron | 93d620e | 2016-02-05 19:40:05 +0100 | [diff] [blame] | 115 | # Call perl script/korapxml2krill WPD/AAA/00001 | 
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 116 |  | 
# Create and parse new document.
# The document path is normalized to end with a slash.
$input =~ s{([^/])$}{$1/};
my $doc = KorAP::XML::Krill->new( path => $input );

unless ($doc->parse) {
  # $output is undefined when the result is written to STDOUT,
  # so fall back to the input path to identify the document
  $log->warn(($output // $input) . " can't be processed - no document data");

  # The exit status stays 0, as before
  exit(0);
};
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 125 |  | 
# Base tokenization, passed as <foundry>[#<layer>].
# Missing or empty parts keep their default, so "--token Foundry"
# no longer replaces the default layer with an undefined value.
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
if ($token_base) {
  my ($foundry, $layer) = split /#/, $token_base;
  $token_base_foundry = $foundry if defined $foundry && length $foundry;
  $token_base_layer   = $layer   if defined $layer   && length $layer;
};

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  name => 'tokens'
);

# Unable to process base tokenization
unless ($tokens->parse) {
  # $output is undefined when the result is written to STDOUT,
  # so fall back to the input path to identify the document
  $log->error(($output // $input) . " can't be processed - no base tokenization");

  # The exit status stays 0, as before
  exit(0);
};
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 145 |  | 
# All foundry/layer pairs that may be added to the tokenization,
# in the order they are processed
my @layers = (
  # Base
  [qw/Base Sentences/],
  [qw/Base Paragraphs/],

  # Connexor
  [qw/Connexor Morpho/],
  [qw/Connexor Syntax/],
  [qw/Connexor Phrase/],
  [qw/Connexor Sentences/],

  # CoreNLP
  [qw/CoreNLP NamedEntities/],
  [qw/CoreNLP Sentences/],
  [qw/CoreNLP Morpho/],
  [qw/CoreNLP Constituency/],

  # DeReKo
  [qw/DeReKo Structure/],

  # Glemm
  [qw/Glemm Morpho/],

  # Malt
  # [qw/Malt Dependency/],

  # Mate
  [qw/Mate Morpho/],
  [qw/Mate Dependency/],

  # OpenNLP
  [qw/OpenNLP Morpho/],
  [qw/OpenNLP Sentences/],

  # Schreibgebrauch
  [qw/Sgbr Lemma/],
  [qw/Sgbr Morpho/],

  # TreeTagger
  [qw/TreeTagger Morpho/],
  [qw/TreeTagger Sentences/],

  # XIP
  [qw/XIP Morpho/],
  [qw/XIP Constituency/],
  [qw/XIP Sentences/],
  [qw/XIP Dependency/],
);
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 192 |  | 
 | 193 |  | 
if ($skip{'#all'}) {
  # Everything is skipped - only add the explicitly allowed
  # <foundry>#<layer> pairs
  foreach my $allowed (@allow) {
    $tokens->add(split('#', $allowed));
    stop_time();
  };
}
else {
  # Add to index file - respect skipping.
  # A layer is left out if either its whole foundry (--skip Mate)
  # or the specific foundry#layer pair (--skip Mate#Morpho) was skipped;
  # the keys of %skip are lower case.
  foreach my $info (@layers) {
    my ($foundry, $layer) = map { lc } @$info;
    next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

    $tokens->add(@$info);
    stop_time();
  };
};
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 209 |  | 
my $file;

# Serialize the tokenization: human friendly text if requested,
# otherwise JSON (optionally pretty printed)
my $print_text = $text ? $tokens->to_string($primary) :
  ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

if ($output) {

  # Both constructors return undef on failure - stop with a meaningful
  # message instead of calling print on an undefined value
  if ($gzip) {
    $file = IO::Compress::Gzip->new($output, Minimal => 1)
      or die "Unable to open $output for compressed output: $GzipError\n";
  }
  else {
    # Load IO::File explicitly instead of relying on another module loading it
    require IO::File;
    $file = IO::File->new($output, "w")
      or die "Unable to open $output for writing: $!\n";
  };

  $file->print($print_text);

  # Errors of buffered writes may only show up when closing
  $file->close
    or die "Unable to finish writing $output: $!\n";
}

else {
  # No output file given - write to STDOUT
  print $print_text . "\n";
};

stop_time();
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 233 |  | 
 | 234 | __END__ |