Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 1 | #!/usr/bin/env perl |
| 2 | use strict; |
| 3 | use warnings; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 4 | use lib 'lib', '../lib'; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 5 | use Getopt::Long; |
| 6 | use Benchmark qw/:hireswallclock/; |
| 7 | use IO::Compress::Gzip qw/$GzipError/; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 8 | use Log::Log4perl; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 9 | use KorAP::Document; |
| 10 | use KorAP::Tokenizer; |
| 11 | |
Nils Diewald | 59094f2 | 2014-11-05 18:20:50 +0000 | [diff] [blame] | 12 | our $VERSION = 0.03; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 13 | |
| 14 | # Merges foundry data to create indexer friendly documents |
Nils Diewald | 32e30f0 | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 15 | # ndiewald, 2014/10/29 |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 16 | |
# Print the usage message and terminate the script.
#
# Parameters:
#   $_[0] - exit status (optional, defaults to 0)
#
# A plain --help request (status 0) prints to STDOUT. When called with a
# non-zero status the text goes to STDERR instead, so that usage errors
# never end up in the document stream (the document itself is written
# to STDOUT unless --output is given).
#
# Never returns - always exits with the given status.
sub printhelp {
  my $status = $_[0] // 0;
  my $out = $status ? \*STDERR : \*STDOUT;

  print {$out} <<'EOHELP';

Merge foundry data based on a tokenization and create indexer friendly documents.

Call:
prepare_index.pl -z --input <directory> --output <filename>

  --input|-i <directory>          Directory of the document to index
  --output|-o <filename>          Document name for output (optional),
                                  Writes to <STDOUT> by default
  --overwrite|-w                  Overwrite files that already exist
  --token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
                                  the name of the foundry and optionally the name
                                  of the layer. Defaults to OpenNLP#tokens.
  --skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
                                  or specific layers by defining the name
                                  with a # in front of the foundry,
                                  e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                  Can be set multiple times.
  --allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
                                  combining the foundry name with a # and the layer name.
  --primary|-p                    Output primary data or not. Defaults to true.
                                  Can be flagged using --no-primary as well.
  --human|-m                      Represent the data human friendly,
                                  while the output defaults to JSON
  --pretty|-y                     Pretty print json output
  --gzip|-z                       Compress the output
                                  (expects a defined output file)
  --log|-l                        The Log4perl log level, defaults to ERROR.
  --help|-h                       Print this document (optional)

diewald@ids-mannheim.de, 2014/11/05

EOHELP
  exit($status);
};
| 54 | |
# Options from the command line
my ($input, $output, $text, $gzip, $log_level, @skip, $token_base,
    $primary, @allow, $pretty, $overwrite);

# GetOptions returns false when it met an unknown or malformed option;
# treat that as a usage error instead of silently carrying on.
GetOptions(
  'input|i=s'   => \$input,
  'output|o=s'  => \$output,
  'overwrite|w' => \$overwrite,
  'human|m'     => \$text,
  'token|t=s'   => \$token_base,
  'gzip|z'      => \$gzip,
  'skip|s=s'    => \@skip,
  'log|l=s'     => \$log_level,
  'allow|a=s'   => \@allow,
  'primary|p!'  => \$primary,
  'pretty|y'    => \$pretty,
  'help|h'      => sub { printhelp() }
) or printhelp(1);

# An input directory is mandatory, and gzip needs a file to write to
printhelp(1) if !$input || ($gzip && !$output);

$log_level //= 'ERROR';

# The help text documents --primary as defaulting to true, so make that
# explicit instead of handing undef to the serializers
# (--no-primary still yields 0).
$primary //= 1;

# Skip entries are matched case insensitively
my %skip;
$skip{lc($_)} = 1 foreach @skip;

# Log to STDERR with the requested level
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');

# Ignore processing - an existing output file is only replaced
# when --overwrite was given
if (!$overwrite && $output && -e $output) {
  $log->trace($output . ' already exists');
  exit(0);
};
| 94 | |
# Take two independent time stamps as early as possible (at compile time):
# $main::TIME marks the overall start, $main::LAST_STOP the start of the
# current measuring interval.
BEGIN {
  ($main::TIME, $main::LAST_STOP) = map { Benchmark->new } 1 .. 2;
};
| 99 | |
# Trace the time elapsed since the previous call (or since start-up for the
# first call) together with the overall runtime, then start a new interval.
sub stop_time {
  my $now = Benchmark->new;

  my $interval = timestr(timediff($now, $main::LAST_STOP));
  my $overall  = timestr(timediff($now, $main::TIME));

  $log->trace("The code took: $interval (overall: $overall)");

  # The next interval is measured from here
  $main::LAST_STOP = $now;
};
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 109 | |
# Example call: perl script/prepare_index.pl --input WPD/AAA/00001

# Create and parse new document
# (the document path is expected to end with a slash)
$input =~ s{([^/])$}{$1/};
my $doc = KorAP::Document->new( path => $input );

# --output is optional, so fall back to the input path when naming the
# document in log messages (avoids concatenating undef).
unless ($doc->parse) {
  $log->warn(($output // $input) . " can't be processed - no document data");
  exit(0);
};

# Base tokenization, given as <foundry>[#<layer>].
# Parts that are missing or empty keep their defaults, so that
# "--token Foundry" does not wipe out the default layer.
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
if ($token_base) {
  my ($foundry, $layer) = split /#/, $token_base;
  $token_base_foundry = $foundry if defined $foundry && length $foundry;
  $token_base_layer   = $layer   if defined $layer   && length $layer;
};

# Get tokenization
my $tokens = KorAP::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => $token_base_foundry,
  layer   => $token_base_layer,
  name    => 'tokens'
);

# Unable to process base tokenization
unless ($tokens->parse) {
  $log->error(($output // $input) . " can't be processed - no base tokenization");
  exit(0);
};
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 140 | |
# All known annotation layers as [foundry, layer] pairs.
# The table lists the layers per foundry; it is expanded in table order,
# so the resulting sequence is the order in which layers are added.
my @layers = map {
  my ($foundry, @names) = @$_;
  map { [$foundry, $_] } @names;
} (
  [Base            => qw/Sentences Paragraphs/],
  [Connexor        => qw/Morpho Syntax Phrase Sentences/],
  [CoreNLP         => qw/NamedEntities Sentences Morpho Constituency/],
  [DeReKo          => qw/Structure/],
  [Glemm           => qw/Morpho/],
  [Malt            => qw/Dependency/],
  [Mate            => qw/Morpho Dependency/],
  [OpenNLP         => qw/Morpho Sentences/],
  [Schreibgebrauch => qw/Lemma Morpho/],
  [TreeTagger      => qw/Morpho Sentences/],
  [XIP             => qw/Morpho Constituency Sentences Dependency/],
);
| 189 | |
# Merge the annotation layers into the token stream.
if ($skip{'#all'}) {
  # Everything is skipped - only add what was explicitly allowed
  # (each entry is given as <foundry>#<layer>)
  foreach (@allow) {
    $tokens->add(split('#', $_));
    stop_time();
  };
}
else {
  # Add to index file - respect skipping.
  # A layer is left out if either its whole foundry ("--skip Mate") or the
  # specific layer ("--skip Mate#Morpho") was named; the keys of %skip
  # are lowercased, so the comparison is case insensitive.
  foreach my $info (@layers) {
    my ($foundry, $layer) = map { lc $_ } @$info;

    next if $skip{$foundry};
    next if $skip{$foundry . '#' . $layer};

    $tokens->add(@$info);
    stop_time();
  };
};
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 205 | |
my $file;

# Serialize the merged document: human readable text with --human,
# otherwise JSON (optionally pretty printed)
my $print_text = $text ? $tokens->to_string($primary) :
  ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

if ($output) {

  # Both constructors return undef on failure - report the reason instead
  # of failing later with a method call on an undefined value
  if ($gzip) {
    $file = IO::Compress::Gzip->new($output, Minimal => 1)
      or die "Unable to open $output for compressed writing: $GzipError";
  }
  else {
    # Load IO::File explicitly rather than relying on another
    # module having pulled it in
    require IO::File;
    $file = IO::File->new($output, "w")
      or die "Unable to open $output for writing: $!";
  };

  # Buffered write errors may only surface on close, so check both calls
  $file->print($print_text)
    or die "Unable to write to $output: " . ($gzip ? $GzipError : $!);
  $file->close
    or die "Unable to close $output: " . ($gzip ? $GzipError : $!);
}

# Without an output file the document goes to STDOUT
else {
  print $print_text . "\n";
};

stop_time();
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 229 | |
| 230 | __END__ |