#!/usr/bin/env perl
use strict;
use warnings;
use lib 'lib', '../lib';
use Getopt::Long;
use Benchmark qw/:hireswallclock/;
use IO::File;
use IO::Compress::Gzip qw/$GzipError/;
use Log::Log4perl;

use KorAP::XML::Krill;
use KorAP::XML::Tokenizer;
| 12 | |
# Merges foundry data to create indexer-friendly documents
# ndiewald, 2014/10/29

# 2016/02/04
# - renamed to korapxml2krill
# - added Schreibgebrauch support
#
# 2016/02/12
# - fixed foundry skipping
#
# 2016/02/14
# - Added version information
Akron | 069bd71 | 2016-02-12 19:09:06 +0100 | [diff] [blame] | 25 | |
# Print the version of the KorAP::XML::Krill distribution and terminate.
#
# Exits with status 0, like printhelp() does when help is requested
# explicitly: asking for the version is a successful invocation, not an error.
sub printversion {
  # Fall back to a placeholder instead of warning about an undefined value
  my $version = $KorAP::XML::Krill::VERSION // 'unknown';
  print "Version $version\n\n";
  exit(0);
};
Akron | 93d620e | 2016-02-05 19:40:05 +0100 | [diff] [blame] | 30 | |
# Print the usage text and terminate the program.
#
# Parameters:
#   $_[0] - exit status (optional, defaults to 0)
#
# The text is written to STDOUT when the status is 0 (help was requested)
# and to STDERR otherwise. STDOUT is the default destination of the
# generated document, so usage errors must not end up in that stream.
sub printhelp {
  my $exit_code = $_[0] // 0;
  my $out = $exit_code ? \*STDERR : \*STDOUT;

  print {$out} <<'EOHELP';

Merge foundry data based on a tokenization and create indexer friendly documents.

Call:
korapxml2krill -z --input <directory> --output <filename>

  --input|-i <directory>          Directory of the document to index
  --output|-o <filename>          Document name for output (optional),
                                  Writes to <STDOUT> by default
  --overwrite|-w                  Overwrite files that already exist
  --token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
                                  the name of the foundry and optionally the name
                                  of the layer. Defaults to OpenNLP#tokens.
  --skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
                                  or specific layers by appending the layer name
                                  to the foundry name with a #,
                                  e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                  Can be set multiple times.
  --allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
                                  combining the foundry name with a # and the layer name.
  --primary|-p                    Output primary data or not. Defaults to true.
                                  Can be flagged using --no-primary as well.
  --human|-m                      Represent the data human friendly,
                                  while the output defaults to JSON
  --pretty|-y                     Pretty print json output
  --gzip|-z                       Compress the output
                                  (expects a defined output file)
  --log|-l                        The Log4perl log level, defaults to ERROR.
  --help|-h                       Print this document (optional)
  --version|-v                    Print version information

diewald@ids-mannheim.de, 2016/02/14

EOHELP

  exit($exit_code);
};
| 69 | |
# Options from the command line
my $input;        # --input:     directory of the document to index
my $output;       # --output:    target file name (STDOUT if not given)
my $text;         # --human:     human friendly representation instead of JSON
my $gzip;         # --gzip:      compress the output (requires --output)
my $log_level;    # --log:       Log4perl log level
my @skip;         # --skip:      foundries or foundry#layer pairs to skip
my $token_base;   # --token:     foundry[#layer] of the base tokenization
my $primary;      # --primary / --no-primary
my @allow;        # --allow:     foundry#layer pairs, used when #ALL is skipped
my $pretty;       # --pretty:    pretty print the JSON output
my $overwrite;    # --overwrite: replace an already existing output file

# GetOptions returns false for unknown or malformed options;
# treat that as a usage error instead of continuing silently.
GetOptions(
  'input|i=s'   => \$input,
  'output|o=s'  => \$output,
  'overwrite|w' => \$overwrite,
  'human|m'     => \$text,
  'token|t=s'   => \$token_base,
  'gzip|z'      => \$gzip,
  'skip|s=s'    => \@skip,
  'log|l=s'     => \$log_level,
  'allow|a=s'   => \@allow,
  'primary|p!'  => \$primary,
  'pretty|y'    => \$pretty,
  'help|h'      => sub { printhelp() },
  'version|v'   => sub { printversion() }
) or printhelp(1);

# An input directory is mandatory and compression needs a target file
printhelp(1) if !$input || ($gzip && !$output);

$log_level //= 'ERROR';

# The help text documents primary data output as the default;
# make that explicit unless --primary or --no-primary was passed.
$primary //= 1;

# Lookup table of the lowercased --skip values
my %skip = map { (lc($_) => 1) } @skip;

Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');

# Ignore processing if the output exists and must not be overwritten
if (!$overwrite && $output && -e $output) {
  $log->trace($output . ' already exists');
  exit(0);
};
| 110 | |
# Take two time stamps at compile time: the overall start of the program
# and the reference point for the next stop_time() measurement.
BEGIN {
  ($main::TIME, $main::LAST_STOP) = (Benchmark->new, Benchmark->new);
};
| 115 | |
# Log (at trace level) the time elapsed since the previous measurement and
# since the start of the program, then make the current time stamp the
# reference point for the next call.
sub stop_time {
  my $now = Benchmark->new;

  my $since_last = timestr(timediff($now, $main::LAST_STOP));
  my $overall    = timestr(timediff($now, $main::TIME));

  $log->trace("The code took: $since_last (overall: $overall)");

  $main::LAST_STOP = $now;
};
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 125 | |
# Create and parse new document
$input =~ s{([^/])$}{$1/};    # make sure the path ends with a slash
my $doc = KorAP::XML::Krill->new( path => $input );

unless ($doc->parse) {
  # Report the input path: --output is optional and may be undefined
  $log->warn($input . " can't be processed - no document data");
  exit(0);
};

# Base tokenization: a --token value overrides the defaults, but a missing
# or empty part (e.g. a foundry without a layer) keeps the default.
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
if ($token_base) {
  my ($foundry, $layer) = split /#/, $token_base;
  $token_base_foundry = $foundry if defined $foundry && length $foundry;
  $token_base_layer   = $layer   if defined $layer   && length $layer;
};

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => $token_base_foundry,
  layer   => $token_base_layer,
  name    => 'tokens'
);

# Unable to process base tokenization
unless ($tokens->parse) {
  $log->error($input . " can't be processed - no base tokenization");
  exit(0);
};

# All [foundry, layer] pairs added by default, in processing order
my @layers = (
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  # ['Malt', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);

if ($skip{'#all'}) {
  # Everything is skipped - add only the explicitly allowed foundry#layer pairs
  foreach my $allowed (@allow) {
    $tokens->add(split('#', $allowed));
    stop_time();
  };
}
else {
  # Add to index file - respect skipping
  foreach my $info (@layers) {
    my ($foundry, $layer) = map { lc($_) } @$info;

    # Skip if Foundry or Foundry#Layer should be skipped
    next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

    $tokens->add(@$info);
    stop_time();
  };
};

# Serialize as human friendly text, pretty printed JSON or compact JSON
my $print_text = $text ? $tokens->to_string($primary) :
  ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

if ($output) {
  my $file = $gzip
    ? IO::Compress::Gzip->new($output, Minimal => 1)
    : IO::File->new($output, "w");

  # Both constructors return undef on failure; fail with the actual reason
  # instead of dying on a method call on an undefined value.
  unless ($file) {
    my $reason = $gzip ? $GzipError : $!;
    die "Unable to open '$output' for writing: $reason\n";
  };

  $file->print($print_text);

  # Buffered write errors surface on close
  $file->close
    or die "Unable to write '$output': " . ($gzip ? $GzipError : $!) . "\n";
}
else {
  print $print_text . "\n";
};

stop_time();
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 243 | |
| 244 | __END__ |