Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 1 | #!/usr/bin/env perl |
use strict;
use warnings;
use lib 'lib', '../lib';
use Getopt::Long;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use IO::File;
use Log::Log4perl;

use KorAP::XML::Krill;
use KorAP::XML::Tokenizer;
| 12 | |
| 13 | our $VERSION = 0.04; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 14 | |
| 15 | # Merges foundry data to create indexer friendly documents |
Nils Diewald | 32e30f0 | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 16 | # ndiewald, 2014/10/29 |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 17 | |
Akron | 93d620e | 2016-02-05 19:40:05 +0100 | [diff] [blame] | 18 | # 2016/02/04 |
| 19 | # - renamed to korapxml2krill |
| 20 | # - added Schreibgebrauch support |
Akron | 069bd71 | 2016-02-12 19:09:06 +0100 | [diff] [blame^] | 21 | # |
| 22 | # 2016/02/12 |
| 23 | # - fixed foundry skipping |
| 24 | |
Akron | 93d620e | 2016-02-05 19:40:05 +0100 | [diff] [blame] | 25 | |
# Print the usage message to STDOUT and terminate the program.
#
# Parameters:
#   $_[0] - exit status to terminate with (optional, defaults to 0)
#
# This sub never returns, as it always calls exit().
sub printhelp {
  my $exit_code = shift // 0;

  # Non-interpolating heredoc: the text is printed verbatim
  print <<'EOHELP';

Merge foundry data based on a tokenization and create indexer friendly documents.

Call:
korapxml2krill -z --input <directory> --output <filename>

  --input|-i <directory>          Directory of the document to index
  --output|-o <filename>          Document name for output (optional),
                                  Writes to <STDOUT> by default
  --overwrite|-w                  Overwrite files that already exist
  --token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
                                  the name of the foundry and optionally the name
                                  of the layer. Defaults to OpenNLP#Tokens.
  --skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
                                  or specific layers by appending a # and the
                                  layer name to the foundry name,
                                  e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                  Can be set multiple times.
  --allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
                                  combining the foundry name with a # and the layer name.
                                  Only respected in combination with --skip #ALL.
                                  Can be set multiple times.
  --primary|-p                    Output primary data or not. Defaults to true.
                                  Can be flagged using --no-primary as well.
  --human|-m                      Represent the data human friendly,
                                  while the output defaults to JSON
  --pretty|-y                     Pretty print json output
  --gzip|-z                       Compress the output
                                  (expects a defined output file)
  --log|-l <level>                The Log4perl log level, defaults to ERROR.
  --help|-h                       Print this document

diewald@ids-mannheim.de, 2016/02/12

EOHELP
  exit($exit_code);
};
| 63 | |
# Options from the command line
# NOTE(review): $primary stays undefined unless --primary/--no-primary is
# passed, while the help text promises a default of true - confirm that the
# serialization methods of the tokenizer treat an undefined value as true.
my ($input, $output, $text, $gzip, $log_level, @skip, $token_base,
    $primary, @allow, $pretty, $overwrite);

# GetOptions returns false if the command line can't be parsed
# (unknown option, missing value). Abort with the usage message then,
# instead of continuing with a partially applied configuration.
GetOptions(
  'input|i=s'   => \$input,
  'output|o=s'  => \$output,
  'overwrite|w' => \$overwrite,
  'human|m'     => \$text,
  'token|t=s'   => \$token_base,
  'gzip|z'      => \$gzip,
  'skip|s=s'    => \@skip,
  'log|l=s'     => \$log_level,
  'allow|a=s'   => \@allow,
  'primary|p!'  => \$primary,
  'pretty|y'    => \$pretty,
  'help|h'      => sub { printhelp(0) }
) or printhelp(1);

# An input directory is mandatory and gzip requires an output file
printhelp(1) if !$input || ($gzip && !$output);

$log_level //= 'ERROR';

# Lookup table of the lower-cased values passed to --skip
my %skip;
$skip{lc($_)} = 1 foreach @skip;

# Log to STDERR using the requested level
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');

# Ignore processing if the output file exists and must not be overwritten
if (!$overwrite && $output && -e $output) {
  $log->trace($output . ' already exists');
  exit(0);
};
| 103 | |
# Initialize both benchmark timers at compile time:
# $main::TIME marks the overall start, $main::LAST_STOP the latest measurement.
BEGIN {
  ($main::TIME, $main::LAST_STOP) = (Benchmark->new, Benchmark->new);
};
| 108 | |
# Log (on trace level) the time passed since the previous measurement
# as well as the time passed relative to $main::TIME, and remember
# the current measurement in $main::LAST_STOP for the next call.
sub stop_time {
  my $now = Benchmark->new;

  my $since_last = timestr(timediff($now, $main::LAST_STOP));
  my $overall    = timestr(timediff($now, $main::TIME));

  $log->trace(
    'The code took: ' . $since_last . ' (overall: ' . $overall . ')'
  );

  $main::LAST_STOP = $now;
};
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 118 | |
# Create and parse new document
# (a slash is appended to the input path if it doesn't end with one)
$input =~ s{([^/])$}{$1/};
my $doc = KorAP::XML::Krill->new( path => $input );

# The output file is optional (STDOUT is used otherwise), so fall back
# to the input path for naming the document in log messages
my $doc_name = $output // $input;

unless ($doc->parse) {
  $log->warn($doc_name . " can't be processed - no document data");
  exit(0);
};

# Base tokenization - only override the parts of the default
# that were given, as the layer is optional in --token
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
if ($token_base) {
  my ($foundry, $layer) = split /#/, $token_base;
  $token_base_foundry = $foundry if defined $foundry && length $foundry;
  $token_base_layer   = $layer   if defined $layer   && length $layer;
};

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => $token_base_foundry,
  layer   => $token_base_layer,
  name    => 'tokens'
);

# Unable to process base tokenization
unless ($tokens->parse) {
  $log->error($doc_name . " can't be processed - no base tokenization");
  exit(0);
};
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 147 | |
# All known annotation layers as [Foundry, Layer] pairs
my @layers = (
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt (currently disabled)
  # ['Malt', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],
);
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 194 | |
| 195 | |
# Merge annotation layers into the token stream
if ($skip{'#all'}) {
  # All layers are skipped - add the explicitly allowed Foundry#Layer pairs only
  for my $allowed (@allow) {
    $tokens->add(split('#', $allowed));
    stop_time();
  };
}
else {
  # Add to index file - respect skipping
  for my $info (@layers) {
    # The skip table is keyed by lower-cased names
    my ($foundry, $layer) = map { lc } @$info;

    # Skip if Foundry or Foundry#Layer should be skipped
    next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

    $tokens->add(@$info);
    stop_time();
  };
};
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 212 | |
my $file;

# Serialize the token stream: human friendly text if requested,
# otherwise JSON (pretty printed on demand)
my $print_text = $text ? $tokens->to_string($primary) :
  ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

if ($output) {

  # Both constructors return undef on failure - report the reason
  # instead of failing on a method call on an undefined value
  if ($gzip) {
    $file = IO::Compress::Gzip->new($output, Minimal => 1)
      or die "Unable to open $output for compressed writing: $GzipError\n";
  }
  else {
    $file = IO::File->new($output, "w")
      or die "Unable to open $output for writing: $!\n";
  };

  $file->print($print_text);

  # Buffered write errors may only surface when the handle is closed
  $file->close
    or die "Unable to write $output: " . ($gzip ? $GzipError : $!) . "\n";
}

else {
  # No output file given - write to STDOUT
  print $print_text . "\n";
};

stop_time;
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 236 | |
| 237 | __END__ |